+ fixed bug in reads stemming from negative strand
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import resource
10
11 from qpalma.computeSpliceWeights import *
12 from qpalma.set_param_palma import *
13 from qpalma.computeSpliceAlignWithQuality import *
14
15 from numpy.matlib import mat,zeros,ones,inf
16 from numpy import inf,mean
17
18 #from qpalma.parsers import *
19
20 from ParaParser import ParaParser,IN_VECTOR
21
22 from qpalma.Lookup import LookupTable
23 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
24
25
26 def cpu():
27 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
28 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
29
30
31 class BracketWrapper:
32 fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
33
34 def __init__(self,filename):
35 self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
36 self.parser.parseFile(filename)
37
38 def __len__(self):
39 return self.parser.getSize(IN_VECTOR)
40
41 def __getitem__(self,key):
42 return self.parser.fetchEntry(key)
43
44 def __iter__(self):
45 self.counter = 0
46 self.size = self.parser.getSize(IN_VECTOR)
47 return self
48
49 def next(self):
50 if not self.counter < self.size:
51 raise StopIteration
52 return_val = self.parser.fetchEntry(self.counter)
53 self.counter += 1
54 return return_val
55
56
57 class PipelineHeuristic:
58 """
59 This class wraps the filter which decides whether an alignment found by
60 vmatch is spliced an should be then newly aligned using QPalma or not.
61 """
62
63 def __init__(self,run_fname,data_fname,param_fname,result_fname):
64 """
65 We need a run object holding information about the nr. of support points
66 etc.
67 """
68
69 run = cPickle.load(open(run_fname))
70 self.run = run
71
72 start = cpu()
73 self.all_remapped_reads = BracketWrapper(data_fname)
74 stop = cpu()
75
76 print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
77
78 #g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
79 #acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
80 #don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
81
82 #g_fmt = 'contig%d.dna.flat'
83 #s_fmt = 'contig_%d%s'
84
85 #self.chromo_range = range(0,1099)
86
87 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
88 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
89 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
90
91 g_fmt = 'chr%d.dna.flat'
92 s_fmt = 'contig_%d%s'
93
94 #self.chromo_range = range(1,6)
95 self.chromo_range = range(1,2)
96
97 num_chromo = 2
98 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
99 self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
100
101 start = cpu()
102 self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
103 stop = cpu()
104
105 print 'prefetched sequence and splice data in %f sec' % (stop-start)
106
107 self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
108 self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
109
110 start = cpu()
111
112 self.data_fname = data_fname
113
114 self.param = cPickle.load(open(param_fname))
115
116 # Set the parameters such as limits penalties for the Plifs
117 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
118
119 self.h = h
120 self.d = d
121 self.a = a
122 self.mmatrix = mmatrix
123 self.qualityPlifs = qualityPlifs
124
125 self.read_size = 38
126
127 # parameters of the heuristics to decide whether the read is spliced
128 #self.splice_thresh = 0.005
129 self.splice_thresh = 0.01
130 self.max_intron_size = 2000
131 self.max_mismatch = 2
132 #self.splice_stop_thresh = 0.99
133 self.splice_stop_thresh = 0.8
134 self.spliced_bias = 0.0
135
136 param_lines = [\
137 ('%f','splice_thresh',self.splice_thresh),\
138 ('%d','max_intron_size',self.max_intron_size),\
139 ('%d','max_mismatch',self.max_mismatch),\
140 ('%f','splice_stop_thresh',self.splice_stop_thresh),\
141 ('%f','spliced_bias',self.spliced_bias)]
142
143 param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
144 param_lines.append('# data: %s\n'%self.data_fname)
145 param_lines.append('# param: %s\n'%param_fname)
146
147 #pdb.set_trace()
148
149 for p_line in param_lines:
150 self.result_spliced_fh.write(p_line)
151 self.result_unspliced_fh.write(p_line)
152
153 #self.original_reads = {}
154
155 # we do not have this information
156 #for line in open(reads_pipeline_fn):
157 # line = line.strip()
158 # id,seq,q1,q2,q3 = line.split()
159 # id = int(id)
160 # self.original_reads[id] = seq
161
162 lengthSP = run['numLengthSuppPoints']
163 donSP = run['numDonSuppPoints']
164 accSP = run['numAccSuppPoints']
165 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
166 numq = run['numQualSuppPoints']
167 totalQualSP = run['totalQualSuppPoints']
168
169 currentPhi = zeros((run['numFeatures'],1))
170 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
171 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
172 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
173 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
174
175 totalQualityPenalties = self.param[-totalQualSP:]
176 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
177 self.currentPhi = currentPhi
178
179 # we want to identify spliced reads
180 # so true pos are spliced reads that are predicted "spliced"
181 self.true_pos = 0
182
183 # as false positives we count all reads that are not spliced but predicted
184 # as "spliced"
185 self.false_pos = 0
186
187 self.true_neg = 0
188 self.false_neg = 0
189
190 # total time spend for get seq and scores
191 self.get_time = 0.0
192 self.calcAlignmentScoreTime = 0.0
193 self.alternativeScoresTime = 0.0
194
195 self.count_time = 0.0
196 self.main_loop = 0.0
197 self.splice_site_time = 0.0
198 self.computeSpliceAlignWithQualityTime = 0.0
199 self.computeSpliceWeightsTime = 0.0
200 self.DotProdTime = 0.0
201 self.array_stuff = 0.0
202 stop = cpu()
203
204 self.init_time = stop-start
205
206 def filter(self):
207 """
208 This method...
209 """
210 run = self.run
211
212 ctr = 0
213 unspliced_ctr = 0
214 spliced_ctr = 0
215
216 print 'Starting filtering...'
217 _start = cpu()
218
219 #for readId,currentReadLocations in all_remapped_reads.items():
220 #for location in currentReadLocations[:1]:
221
222 for location,original_line in self.all_remapped_reads:
223
224 if ctr % 1000 == 0:
225 print ctr
226
227 id = location['id']
228 chr = location['chr']
229 pos = location['pos']
230 strand = location['strand']
231 seq = location['seq']
232 prb = location['prb']
233
234 id = int(id)
235
236 seq = seq.lower()
237
238 strand_map = {'D':'+', 'P':'-'}
239
240 strand = strand_map[strand]
241
242 if not chr in self.chromo_range:
243 continue
244
245 unb_seq = unbracket_seq(seq)
246
247 # forgot to do this
248 if strand == '-':
249 #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
250 unb_seq = reverse_complement(unb_seq)
251 seq = reverse_complement(seq)
252
253 effective_len = len(unb_seq)
254
255 genomicSeq_start = pos
256 #genomicSeq_stop = pos+effective_len-1
257 genomicSeq_stop = pos+effective_len
258
259 start = cpu()
260 currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
261 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
262
263 assert currentDNASeq_ == currentDNASeq
264 assert currentAcc_ == currentAcc_
265 assert currentDon_ == currentDon_
266
267 stop = cpu()
268 self.get_time += stop-start
269
270 dna = currentDNASeq
271 exons = zeros((2,1))
272 exons[0,0] = 0
273 exons[1,0] = effective_len
274 est = unb_seq
275 original_est = seq
276 quality = map(lambda x:ord(x)-64,prb)
277
278 currentVMatchAlignment = dna, exons, est, original_est, quality,\
279 currentAcc, currentDon
280
281 #pdb.set_trace()
282
283 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
284 #try:
285 # alternativeAlignmentScores = self.calcAlternativeAlignments(location)
286 #except:
287 # alternativeAlignmentScores = []
288
289
290 if alternativeAlignmentScores == []:
291 # no alignment necessary
292 maxAlternativeAlignmentScore = -inf
293 vMatchScore = 0.0
294 else:
295 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
296 # compute alignment for vmatch unspliced read
297 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
298
299 start = cpu()
300
301 #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
302 #print 'all candidates %s' % str(alternativeAlignmentScores)
303
304 new_id = id - 1000000300000
305
306 unspliced = False
307 # unspliced
308 if new_id > 0:
309 unspliced = True
310
311 # Seems that according to our learned parameters VMatch found a good
312 # alignment of the current read
313 if maxAlternativeAlignmentScore < vMatchScore:
314 unspliced_ctr += 1
315
316 self.result_unspliced_fh.write(original_line+'\n')
317
318 if unspliced:
319 self.true_neg += 1
320 else:
321 self.false_neg += 1
322
323 # We found an alternative alignment considering splice sites that scores
324 # higher than the VMatch alignment
325 else:
326 spliced_ctr += 1
327
328 self.result_spliced_fh.write(original_line+'\n')
329
330 if unspliced:
331 self.false_pos += 1
332 else:
333 self.true_pos += 1
334
335 ctr += 1
336 stop = cpu()
337 self.count_time = stop-start
338
339 _stop = cpu()
340 self.main_loop = _stop-_start
341
342 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
343 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
344 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
345
346
347 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
348
349 def signum(a):
350 if a>0:
351 return 1
352 elif a<0:
353 return -1
354 else:
355 return 0
356
357 proximal_acc = []
358 for idx in xrange(max_intron_size, max_intron_size+read_size/2):
359 if currentAcc[idx]>= splice_thresh:
360 proximal_acc.append((idx,currentAcc[idx]))
361
362 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
363 proximal_acc=proximal_acc[-2:]
364
365 distal_acc = []
366 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
367 if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
368 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
369
370 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
371 #distal_acc=distal_acc[-2:]
372
373
374 proximal_don = []
375 for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
376 if currentDon[idx] >= splice_thresh:
377 proximal_don.append((idx, currentDon[idx]))
378
379 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
380 proximal_don=proximal_don[-2:]
381
382 distal_don = []
383 for idx in xrange(1, max_intron_size):
384 if currentDon[idx] >= splice_thresh and idx>read_size:
385 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
386
387 distal_don.sort(lambda x,y: y[0]-x[0])
388 #distal_don=distal_don[-2:]
389
390 return proximal_acc,proximal_don,distal_acc,distal_don
391
392
393 def calcAlternativeAlignments(self,location):
394 """
395 Given an alignment proposed by Vmatch this function calculates possible
396 alternative alignments taking into account for example matched
397 donor/acceptor positions.
398 """
399
400 run = self.run
401
402 id = location['id']
403 chr = location['chr']
404 pos = location['pos']
405 strand = location['strand']
406 original_est = location['seq']
407 quality = location['prb']
408 quality = map(lambda x:ord(x)-64,quality)
409
410 original_est = original_est.lower()
411 est = unbracket_seq(original_est)
412 effective_len = len(est)
413
414 genomicSeq_start = pos - self.max_intron_size
415 genomicSeq_stop = pos + self.max_intron_size + len(est)
416
417 strand_map = {'D':'+', 'P':'-'}
418 strand = strand_map[strand]
419
420 if strand == '-':
421 est = reverse_complement(est)
422 original_est = reverse_complement(original_est)
423
424 start = cpu()
425 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
426 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
427 stop = cpu()
428 self.get_time += stop-start
429 dna = currentDNASeq
430
431 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
432
433 alternativeScores = []
434
435 # inlined
436 h = self.h
437 d = self.d
438 a = self.a
439 mmatrix = self.mmatrix
440 qualityPlifs = self.qualityPlifs
441 # inlined
442
443 # find an intron on the 3' end
444 _start = cpu()
445 for (don_pos,don_score) in proximal_don:
446 DonorScore = calculatePlif(d, [don_score])[0]
447
448 for (acc_pos,acc_score,acc_dna) in distal_acc:
449
450 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
451 AcceptorScore = calculatePlif(a, [acc_score])[0]
452
453 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
454
455 # construct a new "original_est"
456 original_est_cut=''
457
458 est_ptr=0
459 dna_ptr=self.max_intron_size
460 ptr=0
461 acc_dna_ptr=0
462 num_mismatch = 0
463
464 while ptr<len(original_est):
465 #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
466
467 if original_est[ptr]=='[':
468 dnaletter=original_est[ptr+1]
469 estletter=original_est[ptr+2]
470 if dna_ptr < don_pos:
471 original_est_cut+=original_est[ptr:ptr+4]
472 num_mismatch += 1
473 else:
474 if acc_dna[acc_dna_ptr]==estletter:
475 original_est_cut += estletter # EST letter
476 else:
477 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
478 num_mismatch += 1
479 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
480 acc_dna_ptr+=1
481 ptr+=4
482 else:
483 dnaletter=original_est[ptr]
484 estletter=dnaletter
485
486 if dna_ptr < don_pos:
487 original_est_cut+=estletter # EST letter
488 else:
489 if acc_dna[acc_dna_ptr]==estletter:
490 original_est_cut += estletter # EST letter
491 else:
492 num_mismatch += 1
493 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
494 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
495 acc_dna_ptr+=1
496
497 ptr+=1
498
499 if estletter=='-':
500 dna_ptr+=1
501 elif dnaletter=='-':
502 est_ptr+=1
503 else:
504 dna_ptr+=1
505 est_ptr+=1
506 if num_mismatch>self.max_mismatch:
507 continue
508
509 assert(dna_ptr<=len(dna))
510 assert(est_ptr<=len(est))
511
512 #print original_est, original_est_cut
513
514 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
515 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
516
517 alternativeScores.append(score)
518
519 if acc_score>=self.splice_stop_thresh:
520 break
521
522 _stop = cpu()
523 self.alternativeScoresTime += _stop-_start
524
525 # find an intron on the 5' end
526 _start = cpu()
527 for (acc_pos,acc_score) in proximal_acc:
528
529 AcceptorScore = calculatePlif(a, [acc_score])[0]
530
531 for (don_pos,don_score,don_dna) in distal_don:
532
533 DonorScore = calculatePlif(d, [don_score])[0]
534 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
535
536 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
537
538 # construct a new "original_est"
539 original_est_cut=''
540
541 est_ptr=0
542 dna_ptr=self.max_intron_size
543 ptr=0
544 num_mismatch = 0
545 don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
546 while ptr<len(original_est):
547
548 if original_est[ptr]=='[':
549 dnaletter=original_est[ptr+1]
550 estletter=original_est[ptr+2]
551 if dna_ptr > acc_pos:
552 original_est_cut+=original_est[ptr:ptr+4]
553 num_mismatch += 1
554 else:
555 if don_dna[don_dna_ptr]==estletter:
556 original_est_cut += estletter # EST letter
557 else:
558 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
559 num_mismatch += 1
560 #print '['+don_dna[don_dna_ptr]+estletter+']'
561 don_dna_ptr+=1
562 ptr+=4
563 else:
564 dnaletter=original_est[ptr]
565 estletter=dnaletter
566
567 if dna_ptr > acc_pos:
568 original_est_cut+=estletter # EST letter
569 else:
570 if don_dna[don_dna_ptr]==estletter:
571 original_est_cut += estletter # EST letter
572 else:
573 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
574 num_mismatch += 1
575 #print '('+don_dna[don_dna_ptr]+estletter+')'
576 don_dna_ptr+=1
577
578 ptr+=1
579
580 if estletter=='-':
581 dna_ptr+=1
582 elif dnaletter=='-':
583 est_ptr+=1
584 else:
585 dna_ptr+=1
586 est_ptr+=1
587
588 if num_mismatch>self.max_mismatch:
589 continue
590 assert(dna_ptr<=len(dna))
591 assert(est_ptr<=len(est))
592
593 #print original_est, original_est_cut
594
595 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
596 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
597
598 alternativeScores.append(score)
599
600 if don_score>=self.splice_stop_thresh:
601 break
602
603 _stop = cpu()
604 self.alternativeScoresTime += _stop-_start
605
606 return alternativeScores
607
608
609 def calcAlignmentScore(self,alignment):
610 """
611 Given an alignment (dna,exons,est) and the current parameter for QPalma
612 this function calculates the dot product of the feature representation of
613 the alignment and the parameter vector i.e the alignment score.
614 """
615
616 start = cpu()
617 run = self.run
618
619 # Lets start calculation
620 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
621
622 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
623
624 stop = cpu()
625 self.calcAlignmentScoreTime += stop-start
626
627 return score