git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8640 e1793c9e...
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import cPickle
10
11 import numpy
12 from numpy.matlib import mat,zeros,ones,inf
13
14 import qpalma
15 import qpalma.tools
16 from qpalma.tools.parseGff import parse_gff
17
18 from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
19
20 from Genefinding import *
21
22 from genome_utils import load_genomic
23
24 import qpalma.Configuration as Conf
25
# Banner printed when bracket-read filtering is disabled (see usage below).
warning = """
#########################################################
WARNING ! Disabled the use of non-bracket reads WARNING !
#########################################################

"""


# Short command-line usage string (shown via the __main__ assert message).
help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

# Long description printed when the script is invoked without arguments.
info = """
The purpose of this script is to compile a full data set for QPalma out of the
following files:

- The information of the gff (gene annotation).

- The flat dna files with the sequence of the respective gene positions

- The solexa reads in the filtered version

- The solexa reads in the remapped version

This represents a dataset for use with QPalma
The entries are def'd as follows:

- 'id' a unique id for the read
- 'chr' the chromosome identifier
- 'strand' the identifier for the strand either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remappped_pos' - the position of the remapped read
"""

# some useful constants

# Valid sequence symbols; the extended alphabet additionally allows the
# '[' / ']' bracket markers used by bracket reads.
extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

# some dummy elements to be returned if current example violates assertions
nil_dna_frag = ('','','','','')
remapped_nil = ('')  # NOTE: ('') is just the empty string, not a 1-tuple
process_nil = ('','','','','','','','')

nil_splice_scores = ('','')
81 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
82
83 assert os.path.exists(filtered_reads)
84 assert os.path.exists(remapped_reads)
85 assert not os.path.exists(dataset_file), 'The data_file already exists!'
86
87 joinp = os.path.join
88
89 # first read the gff file(s) and create pickled gene information
90 allGenes = [None]*6
91
92 for i in range(1,6):
93 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
94
95 print 'parsing filtered reads..'
96 frp = FilteredReadParser(filtered_reads)
97 all_filtered_reads = frp.parse()
98
99 print 'found %d filtered reads' % len(all_filtered_reads)
100
101 print 'parsing map reads'
102 rrp = MapParser(remapped_reads)
103 all_remapped_reads = rrp.parse()
104
105 if all_remapped_reads != None:
106 size_rem = len(all_remapped_reads)
107 else:
108 size_rem = 0
109
110 print 'found %d remapped reads' % size_rem
111
112 # this stores the new dataset
113 SeqInfo = []
114 Exons = []
115 OriginalEsts = []
116 Qualities = []
117 AlternativeSequences = []
118 SplitPositions = []
119
120 # Iterate over all remapped reads in order to generate for each read an
121 # training / prediction example
122 instance_counter = 0
123 noVmatchHitCtr = 0
124 posCtr = 0
125
126 for fId,currentFRead in all_filtered_reads.items():
127
128 if currentFRead['strand'] != '+':
129 #print 'wrong strand'
130 continue
131
132 reId = str(1000000000000+int(fId))
133 try:
134 reReads = all_remapped_reads[reId]
135 except:
136 noVmatchHitCtr += 1
137 continue
138
139 gene_id = currentFRead['gene_id']
140 chro = currentFRead['chr']
141
142 #try:
143 # gene_id = currentFRead['gene_id']
144 # chro = currentFRead['chr']
145 #except:
146 # continue
147
148 seq_info, exons, cut_pos =\
149 process_filtered_read(currentFRead,dna_flat_files,test)
150
151 original_est = currentFRead['seq']
152
153 onePositiveLabel,alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
154
155 if onePositiveLabel:
156 posCtr += 1
157
158 SeqInfo.append(seq_info)
159 Exons.append(exons)
160 OriginalEsts.append(original_est)
161
162 quality = currentFRead['prb']
163 cal_prb = currentFRead['cal_prb']
164 chastity = currentFRead['chastity']
165
166 Qualities.append( (quality,cal_prb,chastity) )
167 AlternativeSequences.append(alternative_seq)
168 SplitPositions.append(cut_pos)
169
170 instance_counter += 1
171
172 if instance_counter % 1000 == 0:
173 print 'processed %d examples' % instance_counter
174
175 if instance_counter == 200000:
176 break
177
178 print 'Full dataset has size %d' % len(SeqInfo)
179 print 'Number of reads which had no Vmatch hit: %d' % noVmatchHitCtr
180 print 'Number of reads which had correct Vmatch hit: %d' % posCtr
181
182 dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
183 AlternativeSequences, SplitPositions]
184
185 # saving new dataset
186 #io_pickle.save(dataset_file,dataset)
187
188 cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
189
190
def process_filtered_read(fRead,dna_flat_files,test):
    """
    Turn one filtered read into a training example.

    Returns a triple (seq_info, exons, true_cut):
      seq_info -- (id, chr, strand, up_cut, down_cut, true_cut)
      exons    -- 2x2 matrix holding the borders of the two exons
      true_cut -- the annotated split position of the read
    """
    read_id = int(fRead['id'])
    chromo  = fRead['chr']
    strand  = fRead['strand']

    genomic_start = fRead['p_start']
    genomic_stop  = fRead['p_stop']

    # extend the region upstream by a random offset, but never past
    # position zero
    CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
    if genomic_start > CUT_OFFSET:
        up_cut = genomic_start - CUT_OFFSET
    else:
        up_cut = genomic_start

    # the downstream extension is applied unconditionally
    down_cut = genomic_stop + CUT_OFFSET

    seq_info = (read_id,chromo,strand,up_cut,down_cut,fRead['true_cut'])

    # now that we have p_start and p_stop properly we can define the
    # correct exon borders
    currentExons = zeros((2,2),dtype=numpy.int)
    currentExons[0,0] = fRead['p_start']
    currentExons[0,1] = fRead['exon_stop']
    currentExons[1,0] = fRead['exon_start']
    currentExons[1,1] = fRead['p_stop']

    return seq_info, currentExons, fRead['true_cut']
230
231
def _getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    """
    Now we want to use interval_query to get the predicted splice scores trained
    on the TAIR sequence and annotation.
    """
    # Legacy variant that talks to interval_query directly; getSpliceScores
    # below delegates to doIntervalQuery instead.

    assert intervalEnd > intervalBegin

    size = intervalEnd-intervalBegin

    # pre-allocated defaults; both lists are replaced by [-inf]*size below
    acc = size*[0.0]
    don = size*[0.0]

    interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
    pos_size = new_intp()
    intp_assign(pos_size,1)

    # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)

    num_hits = result[0]
    # NOTE(review): pdb.set_trace() as the assert message is only evaluated
    # on failure -- it drops into the debugger instead of printing a message
    assert num_hits <= size, pdb.set_trace()
    pos = result[1]
    scores = result[3]

    # -inf marks positions without a predicted acceptor site
    acc = [-inf]*(size)

    #print [p - intervalBegin for p in pos]

    for i in range(num_hits):
        # translate the absolute genomic position into an interval-relative index
        position = pos[i] - intervalBegin
        # hits outside (1, size) are silently dropped
        if not (position > 1 and position < size):
            continue

        assert 0 <= position <= size, pdb.set_trace()
        acc[position] = scores[i]

    # fetch donor scores (note: reads the 'don' interval files)
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)

    num_hits = result[0]
    assert num_hits <= size, pdb.set_trace()
    pos = result[1]
    scores = result[3]

    # -inf marks positions without a predicted donor site
    don = [-inf]*(size)

    #print [p - intervalBegin for p in pos]

    for i in range(num_hits):
        position = pos[i] - intervalBegin
        if not ( position > 1 and position < size ):
            continue

        assert 0 <= position <= size, pdb.set_trace()
        don[position] = scores[i]

    return acc, don
292
293
def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    """
    Fetch the predicted acceptor and donor splice scores (trained on the TAIR
    sequence and annotation) for the interval [intervalBegin, intervalEnd) of
    the given chromosome/strand.

    Returns the pair (acc, don) as produced by doIntervalQuery.
    """
    assert intervalEnd > intervalBegin

    # The original pre-computed size/acc/don defaults and an interval matrix
    # here, but all of it was dead code: doIntervalQuery performs the whole
    # lookup itself and its results overwrite acc/don unconditionally.

    # fetch acceptor scores; the filename template is filled in by
    # doIntervalQuery with (chr, strand)
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
    acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
    don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

    return acc, don
320
321
def process_map(reReads,fRead,dna_flat_files):
    """
    For all matches found by Vmatch calculate the DNA fragment on which an
    alignment should be performed during prediction.

    reReads -- list of remapped-read dicts (Vmatch hits) for one read
    fRead   -- the corresponding filtered-read dict

    Returns (onePositiveLabel, alternativeSeq) where onePositiveLabel is True
    iff at least one hit matched the annotated position, and alternativeSeq is
    a list of (chr, strand, start, stop, label) fragment descriptions.
    """
    fragment_size = 3000

    alternativeSeq = []
    onePositiveLabel = False

    for currentRRead in reReads:
        pos = currentRRead['pos']
        chr = currentRRead['chr']
        strand = currentRRead['strand']

        # only forward-strand hits are considered (mirrors the filtering
        # done in compile_d)
        if strand != '+':
            continue

        currentLabel = False

        CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])

        # vmatch found the correct position: the hit lies within 36bp of the
        # annotated read span
        if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
            currentLabel = True
            # use the annotated span, randomly extended on both sides
            genomicSeq_start = fRead['p_start'] - CUT_OFFSET
            genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
        else:
            # spurious hit: center a fixed-size fragment on the hit position
            genomicSeq_start = pos - (fragment_size/2)
            genomicSeq_stop = pos + (fragment_size/2)

        onePositiveLabel = onePositiveLabel or currentLabel

        alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )

    return onePositiveLabel,alternativeSeq
368
369
370 def check_remapped_reads(reReads,fRead,dna_flat_files):
371 """
372 For all matches found by Vmatch we calculate the fragment of the DNA we
373 would like to perform an aligment during prediction.
374 """
375
376 fragment_size = 3000
377
378 seq = fRead['seq']
379 strand = fRead['strand']
380 pos = fRead['p_start']
381 chr = fRead['chr']
382
383 print 'fRead:\t%d %d %d %d' %\
384 (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])
385
386 for currentRRead in reReads:
387 rId = currentRRead['id']
388
389 pos1 = currentRRead['pos1']
390 chr1 = currentRRead['chr1']
391 seq1 = currentRRead['seq1']
392
393 seq2 = currentRRead['seq2']
394 pos2 = 0
395
396 if not seq2 == '':
397 pos2 = currentRRead['pos2']
398 chr2 = currentRRead['chr2']
399 s2_pos = seq.find(seq2)
400
401 print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
402
403
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
    """
    This function expects an interval, chromosome and strand information and
    returns then the genomic sequence of this interval and the associated scores.
    """

    chrom = 'chr%d' % chr
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'

    # map every non-acgt character to the ambiguity symbol 'n'
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)

    # query the splice scores on a window padded by 100nt on each side
    intervalBegin = genomicSeq_start-100
    intervalEnd = genomicSeq_stop+100
    seq_pos_offset = genomicSeq_start  # currently unused

    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)

    # trim the padding again; the asymmetric acceptor trimming (-98 then [1:])
    # shifts the acceptor track so that the assertions below line up with the
    # 'ag' positions -- NOTE(review): presumably compensating for an offset in
    # the interval files, confirm before changing
    currentAcc = currentAcc[100:-98]
    currentAcc = currentAcc[1:]
    currentDon = currentDon[100:-100]

    length = len(genomicSeq)
    currentAcc = currentAcc[:length]

    # pad the donor track with -inf so both tracks span the full sequence
    currentDon = currentDon+[-inf]*(length-len(currentDon))

    # positions of 'ag' dinucleotides (acceptor) and of 'g' followed by
    # 't' or 'c' (donor-like) in the sequence
    ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
    gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]

    # sanity check: finite scores must occur exactly at those dinucleotide
    # positions (pdb.set_trace() as assert message fires only on failure)
    assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
    assert len(currentAcc) == len(currentDon)

    return genomicSeq, currentAcc, currentDon
445
446
def reverse_complement(seq):
    """
    Return the reverse complement of a lowercase DNA sequence.

    The mapping now includes the ambiguity symbol 'n' (complement 'n');
    the original raised KeyError on 'n' even though get_seq_and_scores
    substitutes every non-acgt base with 'n'. Also renamed the dict so it
    no longer shadows the builtin map().
    """
    complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}

    return "".join(complement[base] for base in reversed(seq))
455
def get_seq(begin,end,exon_end,chromosome=1,strand='+'):
    """
    Fetch the lowercase genomic sequence for [begin, end] from the
    A. thaliana flat files, extended by two extra bases on one side.

    exon_end   -- if true, extend two bases past 'end'; otherwise extend two
                  bases before 'begin' (presumably to capture the splice-site
                  dinucleotide -- confirm against the callers)
    chromosome -- chromosome number (new parameter; defaults to the
                  previously hard-coded chromosome 1)
    strand     -- '+' or '-' (new parameter; defaults to the previously
                  hard-coded forward strand)
    """
    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

    if exon_end:
        gene_start = begin
        gene_stop = end+2
    else:
        gene_start = begin-2
        gene_stop = end

    chrom = 'chr%d' % chromosome

    genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
    return genomicSeq.lower()
476
477
478 if __name__ == '__main__':
479 if len(sys.argv) == 1:
480 print info
481
482 assert len(sys.argv) == 6, help
483 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)