+ added information on cut positions of spliced reads
[qpalma.git] / scripts / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import cPickle
10
11 import numpy
12 from numpy.matlib import mat,zeros,ones,inf
13
14 import qpalma
15 import qpalma.tools
16 from qpalma.tools.parseGff import parse_gff
17
18 from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
19
20 from Genefinding import *
21
22 from genome_utils import load_genomic
23
24 import qpalma.Configuration as Conf
25
26 warning = """
27 #########################################################
28 WARNING ! Disabled the use of non-bracket reads WARNING !
29 #########################################################
30
31 """
32
33
34 help = """
35
36 Usage of this script is:
37
38 ./compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file
39
40 """
41
42 info = """
43 The purpose of this script is to compile a full data set for QPalma out of the
44 following files:
45
46 - The information of the gff (gene annotation).
47
48 - The flat dna files with the sequence of the respective gene positions
49
50 - The solexa reads in the filtered version
51
52 - The solexa reads in the remapped version
53
54 This represents a dataset for use with QPalma
55 The entries are def'd as follows:
56
57 - 'id' a unique id for the read
58 - 'chr' the chromosome identifier
59 - 'strand' the identifier for the strand either '+' or '-'
60
61 - 'read_start'
62 - 'exon_stop'
63 - 'exon_start'
64 - 'read_stop'
65
66 - 'remappped_pos' - the position of the remapped read
67 """
68
# some useful constants

# DNA alphabets used for sequence validation; the extended version also
# allows the '[' / ']' markers that bracket cut positions of spliced reads
extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']

# some dummy elements to be returned if current example violates assertions
nil_dna_frag = ('','','','','')
remapped_nil = ('')   # NOTE(review): ('') is a plain empty string, not a tuple
process_nil = ('','','','','','','','')

nil_splice_scores = ('','')
80
81 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
82
83 assert os.path.exists(filtered_reads)
84 assert os.path.exists(remapped_reads)
85 assert not os.path.exists(dataset_file), 'The data_file already exists!'
86
87 joinp = os.path.join
88
89 # first read the gff file(s) and create pickled gene information
90 allGenes = [None]*6
91
92 for i in range(1,6):
93 allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
94
95 print 'parsing filtered reads..'
96 frp = FilteredReadParser(filtered_reads)
97 all_filtered_reads = frp.parse()
98
99 print 'found %d filtered reads' % len(all_filtered_reads)
100
101 print 'parsing map reads'
102 rrp = MapParser(remapped_reads)
103 all_remapped_reads = rrp.parse()
104
105 if all_remapped_reads != None:
106 size_rem = len(all_remapped_reads)
107 else:
108 size_rem = 0
109
110 print 'found %d remapped reads' % size_rem
111
112 # this stores the new dataset
113 SeqInfo = []
114 Exons = []
115 OriginalEsts = []
116 Qualities = []
117 AlternativeSequences = []
118 SplitPositions = []
119
120 # Iterate over all remapped reads in order to generate for each read an
121 # training / prediction example
122 instance_counter = 0
123 noVmatchHitCtr = 0
124 posCtr = 0
125
126 for fId,currentFRead in all_filtered_reads.items():
127
128 if currentFRead['strand'] != '+':
129 #print 'wrong strand'
130 continue
131
132 reId = str(1000000000000+int(fId))
133 try:
134 reReads = all_remapped_reads[reId]
135 except:
136 noVmatchHitCtr += 1
137 continue
138
139 gene_id = currentFRead['gene_id']
140 chro = currentFRead['chr']
141
142 #try:
143 # gene_id = currentFRead['gene_id']
144 # chro = currentFRead['chr']
145 #except:
146 # continue
147
148 seq_info, exons, cut_pos =\
149 process_filtered_read(currentFRead,dna_flat_files,test)
150
151 original_est = currentFRead['seq']
152
153 onePositiveLabel,alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
154
155 if onePositiveLabel:
156 posCtr += 1
157
158 SeqInfo.append(seq_info)
159 Exons.append(exons)
160 OriginalEsts.append(original_est)
161
162 quality = currentFRead['prb']
163 cal_prb = currentFRead['cal_prb']
164 chastity = currentFRead['chastity']
165
166 Qualities.append( (quality,cal_prb,chastity) )
167 AlternativeSequences.append(alternative_seq)
168 SplitPositions.append(cut_pos)
169
170 instance_counter += 1
171
172 if instance_counter % 1000 == 0:
173 print 'processed %d examples' % instance_counter
174
175 if instance_counter == 100000:
176 break
177
178 print 'Full dataset has size %d' % len(SeqInfo)
179 print 'Number of reads which had no Vmatch hit: %d' % noVmatchHitCtr
180 print 'Number of reads which had correct Vmatch hit: %d' % posCtr
181
182 dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
183 AlternativeSequences, SplitPositions]
184
185 # saving new dataset
186 #io_pickle.save(dataset_file,dataset)
187
188 cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
189
190
def process_filtered_read(fRead, dna_flat_files, test):
    """
    Create one training example from a filtered read.

    Returns (seq_info, currentExons, splitpos) where seq_info is the tuple
    (chr, strand, up_cut, down_cut), currentExons is a 2x2 integer matrix with
    the two exon borders and splitpos is the cut position of the spliced read.
    The test flag is currently unused.
    """

    chr = fRead['chr']
    strand = fRead['strand']

    genomic_start = fRead['p_start']
    genomic_stop = fRead['p_stop']

    # randomly enlarge the interval within the configured extension range
    CUT_OFFSET = random.randint(Conf.extension[0], Conf.extension[1])

    # do not run over the chromosome start
    if genomic_start <= CUT_OFFSET:
        up_cut = genomic_start
    else:
        up_cut = genomic_start - CUT_OFFSET

    # NOTE(review): the downstream cut is not clipped against the chromosome
    # end here -- presumably the sequence loader handles that; confirm.
    down_cut = genomic_stop + CUT_OFFSET

    seq_info = (chr, strand, up_cut, down_cut)

    # now that we have p_start and p_stop properly
    # we can define the correct exon borders:
    # first exon  [p_start, exon_stop], second exon [exon_start, p_stop]
    currentExons = zeros((2, 2), dtype=numpy.int)

    currentExons[0, 0] = fRead['p_start']
    currentExons[0, 1] = fRead['exon_stop']

    currentExons[1, 0] = fRead['exon_start']
    currentExons[1, 1] = fRead['p_stop']

    return seq_info, currentExons, fRead['splitpos']
227
228
def _getSpliceScores(chr, strand, intervalBegin, intervalEnd):
    """
    Use interval_query to fetch the predicted splice scores (trained on the
    TAIR sequence and annotation) for the given interval.

    Returns (acc, don): two lists of length intervalEnd-intervalBegin with
    acceptor resp. donor scores; positions without a prediction hold -inf.
    """

    assert intervalEnd > intervalBegin

    size = intervalEnd - intervalBegin

    interval_matrix = createIntArrayFromList([intervalBegin, intervalEnd])
    pos_size = new_intp()
    intp_assign(pos_size, 1)

    def _fetch(site):
        # site is 'acc' or 'don'; scores are stored per contig and strand
        sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/%s/contig_%d%s' % (site, chr, strand)
        result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1, pos_size)

        num_hits = result[0]
        assert num_hits <= size, pdb.set_trace()
        pos = result[1]
        raw_scores = result[3]

        scores = [-inf] * size
        for i in range(num_hits):
            position = pos[i] - intervalBegin
            # skip hits on the very border of the interval
            if not (position > 1 and position < size):
                continue
            assert 0 <= position <= size, pdb.set_trace()
            scores[position] = raw_scores[i]
        return scores

    # fetch acceptor scores
    acc = _fetch('acc')

    # fetch donor scores
    don = _fetch('don')

    return acc, don
289
290
def getSpliceScores(chr, strand, intervalBegin, intervalEnd):
    """
    Use doIntervalQuery to fetch the predicted splice scores (trained on the
    TAIR sequence and annotation) for the given interval.

    Returns (acc, don) with the acceptor and donor score lists. The filename
    is a template; contig number and strand are filled in by the query.
    """

    assert intervalEnd > intervalBegin

    # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
    acc = doIntervalQuery(chr, strand, intervalBegin, intervalEnd, sscore_filename)

    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
    don = doIntervalQuery(chr, strand, intervalBegin, intervalEnd, sscore_filename)

    return acc, don
317
318
def process_map(reReads, fRead, dna_flat_files):
    """
    For all matches found by Vmatch calculate the fragment of the DNA on
    which an alignment should be performed during prediction.

    Returns (onePositiveLabel, alternativeSeq):
    - onePositiveLabel is True iff at least one hit lies within one read
      length of the true position of the filtered read,
    - alternativeSeq is a list of
      (chr, strand, genomicSeq_start, genomicSeq_stop, label) tuples.
    """

    fragment_size = 3000

    alternativeSeq = []

    onePositiveLabel = False

    for currentRRead in reReads:
        pos = currentRRead['pos']
        chr = currentRRead['chr']
        strand = currentRRead['strand']

        # only forward-strand hits are considered
        if strand != '+':
            continue

        currentLabel = False

        # random enlargement of the true read interval
        CUT_OFFSET = random.randint(Conf.extension[0], Conf.extension[1])

        # vmatch found the correct position: the hit lies within one read
        # length (36bp) of the true read interval
        if fRead['chr'] == chr and fRead['strand'] == strand \
        and fRead['p_start'] - 36 <= pos <= fRead['p_stop'] + 36:
            currentLabel = True
            genomicSeq_start = fRead['p_start'] - CUT_OFFSET
            genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
        else:
            # spurious hit: center the fragment on the hit position
            genomicSeq_start = pos - (fragment_size / 2)
            genomicSeq_stop = pos + (fragment_size / 2)

        onePositiveLabel = onePositiveLabel or currentLabel

        alternativeSeq.append((chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel))

    return onePositiveLabel, alternativeSeq
401
402
def check_remapped_reads(reReads,fRead,dna_flat_files):
    """
    Debugging helper: print the exon borders of the filtered read next to
    the positions of all its remapped (Vmatch) hits for visual comparison.
    Returns nothing.
    """

    fragment_size = 3000  # NOTE(review): unused in this function

    seq = fRead['seq']
    strand = fRead['strand']  # NOTE(review): unused
    pos = fRead['p_start']    # NOTE(review): unused
    chr = fRead['chr']        # NOTE(review): unused

    # true read borders: start / first exon end / second exon start / stop
    print 'fRead:\t%d %d %d %d' %\
    (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])

    for currentRRead in reReads:
        rId = currentRRead['id']  # NOTE(review): unused

        pos1 = currentRRead['pos1']
        chr1 = currentRRead['chr1']  # NOTE(review): unused
        seq1 = currentRRead['seq1']

        seq2 = currentRRead['seq2']
        pos2 = 0

        # a second (spliced) hit part exists only when seq2 is non-empty
        if not seq2 == '':
            pos2 = currentRRead['pos2']
            chr2 = currentRRead['chr2']  # NOTE(review): unused
            s2_pos = seq.find(seq2)      # NOTE(review): unused

        print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
435
436
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
    """
    This function expects an interval, chromosome and strand information and
    returns then the genomic sequence of this interval and the associated
    acceptor/donor splice scores (one entry per sequence position, -inf where
    no splice site was predicted).
    """

    chrom = 'chr%d' % chr
    # the -1 shifts the start into the 0-based coordinates expected when
    # one_based=False
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()

    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'

    # replace every non-acgt character by 'n'
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)

    # query the splice scores on a 100bp-widened interval
    intervalBegin = genomicSeq_start-100
    intervalEnd = genomicSeq_stop+100
    seq_pos_offset = genomicSeq_start  # NOTE(review): unused

    currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)

    # trim the 100bp flanks again; the asymmetric acceptor trimming
    # (-98, then dropping one leading entry) realigns the acceptor scores
    # with the sequence -- presumably compensating for the 'ag' dinucleotide
    # offset; TODO confirm against the score files
    currentAcc = currentAcc[100:-98]
    currentAcc = currentAcc[1:]
    currentDon = currentDon[100:-100]

    length = len(genomicSeq)
    currentAcc = currentAcc[:length]

    # pad the donor scores up to the sequence length
    currentDon = currentDon+[-inf]*(length-len(currentDon))

    # sanity check: acceptor scores must sit exactly on 'ag' dinucleotides,
    # donor scores on 'gt' / 'gc' dinucleotides
    ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
    gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]

    assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
    assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
    assert len(currentAcc) == len(currentDon)

    return genomicSeq, currentAcc, currentDon
478
479
def reverse_complement(seq):
    """
    Return the reverse complement of a lowercase DNA sequence.

    Only the bases 'a', 'c', 'g', 't' are supported; any other character
    (including 'n') raises a KeyError, exactly as before.
    """
    # renamed from 'map' -- the original shadowed the builtin
    complement = {'a':'t','c':'g','g':'c','t':'a'}

    new_seq = [complement[elem] for elem in seq]
    new_seq.reverse()

    return "".join(new_seq)
488
def get_seq(begin,end,exon_end):
    """
    Load a lowercase genomic fragment from chromosome 1 (forward strand) of
    the A. thaliana flat files, widened by two bases on the splice-site
    side: downstream when exon_end is true, upstream otherwise.
    """

    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

    # widen by the two bases of the splice-site dinucleotide
    if exon_end:
        gene_start, gene_stop = begin, end+2
    else:
        gene_start, gene_stop = begin-2, end

    chrom = 'chr%d' % 1
    strand = '+'

    fragment = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
    return fragment.lower()
509
510
511 if __name__ == '__main__':
512 if len(sys.argv) == 1:
513 print info
514
515 assert len(sys.argv) == 6, help
516 compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)