+ fixed nasty index bugs when creating the label of the ground truth
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import cPickle
10
11 import numpy
12 from numpy.matlib import mat,zeros,ones,inf
13
14 import qpalma
15 import qpalma.tools
16 from qpalma.tools.parseGff import parse_gff
17
18 from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
19
20 from Genefinding import *
21
22 from genome_utils import load_genomic
23
24 import qpalma.Configuration as Conf
25
# Banner for when the use of non-bracket reads is switched off.
warning = """
#########################################################
WARNING ! Disabled the use of non-bracket reads WARNING !
#########################################################

"""


# Command line usage, shown via assert when the argument count is wrong.
help = """

Usage of this script is:

./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file

"""

# Description of the produced dataset, printed when started without arguments.
info = """
The purpose of this script is to compile a full data set for QPalma out of the
following files:

- The information of the gff (gene annotation).

- The flat dna files with the sequence of the respective gene positions

- The solexa reads in the filtered version

- The solexa reads in the remapped version

This represents a dataset for use with QPalma
The entries are def'd as follows:

- 'id' a unique id for the read
- 'chr' the chromosome identifier
- 'strand' the identifier for the strand either '+' or '-'

- 'read_start'
- 'exon_stop'
- 'exon_start'
- 'read_stop'

- 'remappped_pos' - the position of the remapped read
"""

# some useful constants

# Valid characters of read/dna sequences; the extended alphabet additionally
# allows '[' and ']' (bracket annotations in the extended read format).
extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

# some dummy elements to be returned if current example violates assertions
nil_dna_frag = ('','','','','')
remapped_nil = ('')  # NOTE(review): ('') is a plain string, not a 1-tuple — confirm intent
process_nil = ('','','','','','','','')

nil_splice_scores = ('','')
80
81 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
82
83 assert os.path.exists(filtered_reads)
84 assert os.path.exists(remapped_reads)
85 assert not os.path.exists(dataset_file), 'The data_file already exists!'
86
87 joinp = os.path.join
88
89 # first read the gff file(s) and create pickled gene information
90 allGenes = [None]*6
91
92 for i in range(1,6):
93 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
94
95 print 'parsing filtered reads..'
96 frp = FilteredReadParser(filtered_reads)
97 all_filtered_reads = frp.parse()
98
99 print 'found %d filtered reads' % len(all_filtered_reads)
100
101 print 'parsing map reads'
102 rrp = MapParser(remapped_reads)
103 all_remapped_reads = rrp.parse()
104
105 if all_remapped_reads != None:
106 size_rem = len(all_remapped_reads)
107 else:
108 size_rem = 0
109
110 print 'found %d remapped reads' % size_rem
111
112 # this stores the new dataset
113 SeqInfo = []
114 Exons = []
115 OriginalEsts = []
116 Qualities = []
117 AlternativeSequences = []
118
119 # Iterate over all remapped reads in order to generate for each read an
120 # training / prediction example
121 instance_counter = 0
122
123 for fId,currentFRead in all_filtered_reads.items():
124
125 reId = str(1000000000000+int(fId))
126 try:
127 reReads = all_remapped_reads[reId]
128 except:
129 continue
130
131 try:
132 gene_id = currentFRead['gene_id']
133 chro = currentFRead['chr']
134 except:
135 continue
136
137 if currentFRead['strand'] != '+':
138 #print 'wrong strand'
139 continue
140
141 seq_info, exons =\
142 process_filtered_read(currentFRead,dna_flat_files,test)
143
144 original_est = currentFRead['seq']
145
146 alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
147
148 SeqInfo.append(seq_info)
149 Exons.append(exons)
150 OriginalEsts.append(original_est)
151
152 quality = currentFRead['prb']
153 cal_prb = currentFRead['cal_prb']
154 chastity = currentFRead['chastity']
155
156 Qualities.append( (quality,cal_prb,chastity) )
157 AlternativeSequences.append(alternative_seq)
158
159 instance_counter += 1
160
161 if instance_counter % 1000 == 0:
162 print 'processed %d examples' % instance_counter
163
164 if instance_counter == 20000:
165 break
166
167 print 'Full dataset has size %d' % len(SeqInfo)
168
169 dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
170 AlternativeSequences]
171
172 # saving new dataset
173 #io_pickle.save(dataset_file,dataset)
174
175 cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
176
177
def process_filtered_read(fRead,dna_flat_files,test):
   """
   Turn one filtered (ground truth) read into a training example: a tuple
   describing the genomic interval to cut out plus the two true exon borders.

   Returns (seq_info, currentExons) where seq_info is
   (chr,strand,up_cut,down_cut) and currentExons is a 2x2 integer matrix.
   """

   chr    = fRead['chr']
   strand = fRead['strand']

   genomic_start = fRead['p_start']
   genomic_stop  = fRead['p_stop']

   # randomly enlarge the cut-out region around the read
   cut_offset = random.randint(Conf.extension[0],Conf.extension[1])

   # never cut below position zero upstream
   up_cut = genomic_start if genomic_start <= cut_offset else genomic_start - cut_offset
   down_cut = genomic_stop + cut_offset

   seq_info = (chr,strand,up_cut,down_cut)

   # the two ground-truth exons: [p_start,exon_stop] and [exon_start,p_stop]
   currentExons = zeros((2,2),dtype=numpy.int)
   currentExons[0,0] = fRead['p_start']
   currentExons[0,1] = fRead['exon_stop']
   currentExons[1,0] = fRead['exon_start']
   currentExons[1,1] = fRead['p_stop']

   return seq_info, currentExons
214
215
def _getSpliceScores(chr,strand,intervalBegin,intervalEnd):
   """
   Use interval_query to fetch the predicted splice scores (trained on the
   TAIR sequence and annotation) for [intervalBegin,intervalEnd).

   Returns (acc, don): two lists of length intervalEnd-intervalBegin holding
   acceptor and donor scores; positions without a prediction are -inf.
   """

   assert intervalEnd > intervalBegin

   size = intervalEnd-intervalBegin

   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
   pos_size = new_intp()
   intp_assign(pos_size,1)

   def fetch_scores(site):
      # one query against the precomputed score file of the given splice-site
      # type ('acc' or 'don'); this used to be two copy-pasted code halves
      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/%s/contig_%d%s' % (site,chr,strand)
      result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)

      num_hits = result[0]
      # pdb.set_trace() as assert message drops into the debugger on failure
      assert num_hits <= size, pdb.set_trace()
      pos    = result[1]
      scores = result[3]

      # default is -inf; only positions with a prediction get a real score
      current = [-inf]*size

      for i in range(num_hits):
         position = pos[i] - intervalBegin
         # skip hits right at the interval borders
         if not (position > 1 and position < size):
            continue

         assert 0 <= position <= size, pdb.set_trace()
         current[position] = scores[i]

      return current

   # fetch acceptor scores
   acc = fetch_scores('acc')
   # fetch donor scores
   don = fetch_scores('don')

   return acc, don
276
277
def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
   """
   Use doIntervalQuery to fetch the predicted splice scores (trained on the
   TAIR sequence and annotation) for [intervalBegin,intervalEnd).

   Returns (acc, don) with the acceptor and donor scores for the interval.
   """

   assert intervalEnd > intervalBegin

   # fetch acceptor scores
   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

   # fetch donor scores
   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)

   return acc, don
304
305
def process_map(reReads,fRead,dna_flat_files):
   """
   For all matches found by Vmatch calculate the fragment of the DNA we
   would like to perform an alignment against during prediction.

   reReads       - list of remapped-read dicts (keys 'pos','chr','strand',
                   'length','offset',...)
   fRead         - the filtered (ground truth) read dict
   dna_flat_files - unused here; kept for a uniform signature

   Returns a list of tuples (chr,strand,start,stop,label); label is True iff
   the vmatch hit lies inside the annotated position of the read.
   """

   fragment_size = 3000

   alternativeSeq = []

   for currentRRead in reReads:
      pos    = currentRRead['pos']
      chrom  = currentRRead['chr']
      strand = currentRRead['strand']

      # only the forward strand is supported at the moment
      if strand != '+':
         continue

      length = currentRRead['length']
      offset = currentRRead['offset']

      # label of the ground truth: True when vmatch found the correct position
      currentLabel = fRead['chr'] == chrom and fRead['strand'] == strand\
         and fRead['p_start'] <= pos <= fRead['p_stop']

      if offset == 0:
         # first half of the read matches here: search downstream for the rest
         genomicSeq_start = pos
         genomicSeq_stop  = pos + fragment_size
      elif length + offset == Conf.read_size:
         # second half of the read matches here: search upstream for the rest
         genomicSeq_start = pos - fragment_size
         genomicSeq_stop  = pos + length
      else:
         # middle part of the read matches here: search in both directions
         genomicSeq_start = pos - fragment_size // 2
         genomicSeq_stop  = pos + fragment_size // 2

      alternativeSeq.append( (chrom,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )

   return alternativeSeq
364
365
def check_remapped_reads(reReads,fRead,dna_flat_files):
   """
   Debugging helper: print the ground-truth coordinates of a filtered read
   next to the coordinates of all its vmatch hits.

   reReads       - list of remapped-read dicts (keys 'id','pos1','chr1',
                   'seq1','pos2','chr2','seq2')
   fRead         - the filtered (ground truth) read dict
   dna_flat_files - unused here; kept for a signature parallel to process_map
   """

   # NOTE(review): fragment_size is unused in this function
   fragment_size = 3000

   seq = fRead['seq']
   strand = fRead['strand']
   pos = fRead['p_start']
   chr = fRead['chr']

   # ground truth: read start / first exon stop / second exon start / read stop
   print 'fRead:\t%d %d %d %d' %\
   (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])

   for currentRRead in reReads:
      rId = currentRRead['id']

      pos1 = currentRRead['pos1']
      chr1 = currentRRead['chr1']
      seq1 = currentRRead['seq1']

      seq2 = currentRRead['seq2']
      pos2 = 0

      # the second fragment is optional: an empty seq2 means vmatch matched
      # only one part of the read
      if not seq2 == '':
         pos2 = currentRRead['pos2']
         chr2 = currentRRead['chr2']
         # NOTE(review): s2_pos (offset of the 2nd fragment inside the read)
         # is computed but never used
         s2_pos = seq.find(seq2)

      print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
398
399
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
   """
   This function expects an interval, chromosome and strand information and
   returns then the genomic sequence of this interval and the associated
   acceptor and donor splice-site scores.
   """

   chrom = 'chr%d' % chr
   # start is shifted by one while one_based=False is passed — presumably to
   # reconcile the two coordinate conventions; confirm against load_genomic
   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
   genomicSeq = genomicSeq.lower()

   # check the obtained dna sequence
   assert genomicSeq != '', 'load_genomic returned empty sequence!'

   # replace any non-dna character by 'n'
   no_base = re.compile('[^acgt]')
   genomicSeq = no_base.sub('n',genomicSeq)

   # query the splice scores on a 100nt-larger interval and trim afterwards
   intervalBegin = genomicSeq_start-100
   intervalEnd = genomicSeq_stop+100
   seq_pos_offset = genomicSeq_start  # NOTE(review): unused

   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)

   # NOTE(review): these slice offsets compensate for coordinate shifts
   # between the score files and the sequence (the "nasty index bugs" fix);
   # verify against the score-file convention before touching them
   currentAcc = currentAcc[100:-98]
   currentAcc = currentAcc[1:]
   currentDon = currentDon[100:-100]

   length = len(genomicSeq)
   currentAcc = currentAcc[:length]

   # pad the donor scores with -inf up to the sequence length
   currentDon = currentDon+[-inf]*(length-len(currentDon))

   # sanity check: acceptor scores may only sit right after 'ag' dinucleotides
   # and donor scores only on 'gt'/'gc' dinucleotides
   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]

   # pdb.set_trace() as assert message drops into the debugger on failure
   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
   assert len(currentAcc) == len(currentDon)

   return genomicSeq, currentAcc, currentDon
441
442
def reverse_complement(seq):
   """
   Return the reverse complement of a lowercase dna sequence consisting of
   'a','c','g','t'. Raises KeyError on any other character.
   """

   # renamed from 'map' so the builtin is not shadowed
   complement = {'a':'t','c':'g','g':'c','t':'a'}

   return "".join(complement[elem] for elem in reversed(seq))
451
def get_seq(begin,end,exon_end):
   """
   Cut a lowercase sequence out of chromosome 1 (forward strand) of the
   A. thaliana genome, widening the interval by two positions downstream
   when exon_end is true and upstream otherwise.
   """

   dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

   if exon_end:
      gene_start, gene_stop = begin, end+2
   else:
      gene_start, gene_stop = begin-2, end

   chrom = 'chr%d' % 1
   strand = '+'

   fetched = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)

   return fetched.lower()
472
473
474 if __name__ == '__main__':
475 if len(sys.argv) == 1:
476 print info
477
478 assert len(sys.argv) == 6, help
479 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)