+ changed output format of alignment file script to be compatible with est2gff
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import resource
11
12 from qpalma.DataProc import *
13 from qpalma.computeSpliceWeights import *
14 from qpalma.set_param_palma import *
15 from qpalma.computeSpliceAlignWithQuality import *
16 from qpalma.penalty_lookup_new import *
17 from qpalma.compute_donacc import *
18 from qpalma.TrainingParam import Param
19 from qpalma.Plif import Plf
20
21 from qpalma.Configuration import *
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy import inf,mean
25
26 from qpalma.parsers import *
27
28 from ParaParser import *
29
30 from Lookup import *
31
32 from qpalma.sequence_utils import *
33
34
35
36 class BracketWrapper:
37 fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
38 'offset', 'seq', 'prb', 'cal_prb', 'chastity']
39
40 def __init__(self,filename):
41 self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
42 self.parser.parseFile(filename)
43
44 def __len__(self):
45 return self.parser.getSize(IN_VECTOR)
46
47 def __getitem__(self,key):
48 return self.parser.fetchEntry(key)
49
50 def __iter__(self):
51 self.counter = 0
52 self.size = self.parser.getSize(IN_VECTOR)
53 return self
54
55 def next(self):
56 if not self.counter < self.size:
57 raise StopIteration
58 return_val = self.parser.fetchEntry(self.counter)
59 self.counter += 1
60 return return_val
61
62
63 class PipelineHeuristic:
64 """
65 This class wraps the filter which decides whether an alignment found by
66 vmatch is spliced an should be then newly aligned using QPalma or not.
67 """
68
69 def __init__(self,run_fname,data_fname,param_fname,result_fname):
70 """
71 We need a run object holding information about the nr. of support points
72 etc.
73 """
74
75 run = cPickle.load(open(run_fname))
76 self.run = run
77
78 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
79
80
81 start = cpu()
82
83 # old version
84 #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
85 self.all_remapped_reads = BracketWrapper(data_fname)
86
87 stop = cpu()
88
89 print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
90
91
92 start = cpu()
93 self.lt1 = Lookup(dna_flat_files)
94 stop = cpu()
95 print 'prefetched sequence and splice data in %f sec' % (stop-start)
96
97 self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
98 self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
99
100 start = cpu()
101
102 self.data_fname = data_fname
103
104 self.param = cPickle.load(open(param_fname))
105
106 # Set the parameters such as limits penalties for the Plifs
107 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
108
109 self.h = h
110 self.d = d
111 self.a = a
112 self.mmatrix = mmatrix
113 self.qualityPlifs = qualityPlifs
114
115 #self.read_size = 36
116
117 # parameters of the heuristics to decide whether the read is spliced
118 #self.splice_thresh = 0.005
119 self.splice_thresh = 0.01
120 self.max_intron_size = 2000
121 self.max_mismatch = 2
122 #self.splice_stop_thresh = 0.99
123 self.splice_stop_thresh = 0.8
124 self.spliced_bias = 0.0
125
126 param_lines = [\
127 ('%f','splice_thresh',self.splice_thresh),\
128 ('%d','max_intron_size',self.max_intron_size),\
129 ('%d','max_mismatch',self.max_mismatch),\
130 ('%f','splice_stop_thresh',self.splice_stop_thresh),\
131 ('%f','spliced_bias',self.spliced_bias)]
132
133 param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
134 param_lines.append('# data: %s\n'%self.data_fname)
135 param_lines.append('# param: %s\n'%param_fname)
136
137 #pdb.set_trace()
138
139 for p_line in param_lines:
140 self.result_spliced_fh.write(p_line)
141 self.result_unspliced_fh.write(p_line)
142
143 #self.original_reads = {}
144
145 # we do not have this information
146 #for line in open(reads_pipeline_fn):
147 # line = line.strip()
148 # id,seq,q1,q2,q3 = line.split()
149 # id = int(id)
150 # self.original_reads[id] = seq
151
152 lengthSP = run['numLengthSuppPoints']
153 donSP = run['numDonSuppPoints']
154 accSP = run['numAccSuppPoints']
155 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
156 numq = run['numQualSuppPoints']
157 totalQualSP = run['totalQualSuppPoints']
158
159 currentPhi = zeros((run['numFeatures'],1))
160 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
161 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
162 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
163 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
164
165 totalQualityPenalties = self.param[-totalQualSP:]
166 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
167 self.currentPhi = currentPhi
168
169 # we want to identify spliced reads
170 # so true pos are spliced reads that are predicted "spliced"
171 self.true_pos = 0
172
173 # as false positives we count all reads that are not spliced but predicted
174 # as "spliced"
175 self.false_pos = 0
176
177 self.true_neg = 0
178 self.false_neg = 0
179
180 # total time spend for get seq and scores
181 self.get_time = 0.0
182 self.calcAlignmentScoreTime = 0.0
183 self.alternativeScoresTime = 0.0
184
185 self.count_time = 0.0
186 self.main_loop = 0.0
187 self.splice_site_time = 0.0
188 self.computeSpliceAlignWithQualityTime = 0.0
189 self.computeSpliceWeightsTime = 0.0
190 self.DotProdTime = 0.0
191 self.array_stuff = 0.0
192 stop = cpu()
193
194 self.init_time = stop-start
195
196 def filter(self):
197 """
198 This method...
199 """
200 run = self.run
201
202
203 ctr = 0
204 unspliced_ctr = 0
205 spliced_ctr = 0
206
207 print 'Starting filtering...'
208 _start = cpu()
209
210 #for readId,currentReadLocations in all_remapped_reads.items():
211 #for location in currentReadLocations[:1]:
212
213 for location,original_line in self.all_remapped_reads:
214
215 if ctr % 1000 == 0:
216 print ctr
217
218
219 id = location['id']
220 chr = location['chr']
221 pos = location['pos']
222 strand = location['strand']
223 mismatch = location['mismatches']
224 length = location['length']
225 off = location['offset']
226 seq = location['seq']
227 prb = location['prb']
228 cal_prb = location['cal_prb']
229 chastity = location['chastity']
230
231 id = int(id)
232
233 seq = seq.lower()
234
235 strand_map = {'D':'+', 'P':'-'}
236
237 strand = strand_map[strand]
238
239 #if strand == '-':
240 # continue
241
242 if not chr in range(1,6):
243 continue
244
245 unb_seq = unbracket_est(seq)
246 effective_len = len(unb_seq)
247
248 genomicSeq_start = pos
249 genomicSeq_stop = pos+effective_len-1
250
251 start = cpu()
252 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
253 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
254 stop = cpu()
255 self.get_time += stop-start
256
257 dna = currentDNASeq
258 exons = zeros((2,1))
259 exons[0,0] = 0
260 exons[1,0] = effective_len
261 est = unb_seq
262 original_est = seq
263 quality = prb
264
265 #pdb.set_trace()
266
267 currentVMatchAlignment = dna, exons, est, original_est, quality,\
268 currentAcc, currentDon
269
270 try:
271 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
272 except:
273 alternativeAlignmentScores = []
274
275
276 if alternativeAlignmentScores == []:
277 # no alignment necessary
278 maxAlternativeAlignmentScore = -inf
279 vMatchScore = 0.0
280 else:
281 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
282 # compute alignment for vmatch unspliced read
283 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
284
285 start = cpu()
286
287 #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
288 #print 'all candidates %s' % str(alternativeAlignmentScores)
289
290 new_id = id - 1000000300000
291
292 unspliced = False
293 # unspliced
294 if new_id > 0:
295 unspliced = True
296
297 # Seems that according to our learned parameters VMatch found a good
298 # alignment of the current read
299 if maxAlternativeAlignmentScore < vMatchScore:
300 unspliced_ctr += 1
301
302 self.result_unspliced_fh.write(original_line+'\n')
303
304 if unspliced:
305 self.true_neg += 1
306 else:
307 self.false_neg += 1
308
309 # We found an alternative alignment considering splice sites that scores
310 # higher than the VMatch alignment
311 else:
312 spliced_ctr += 1
313
314 self.result_spliced_fh.write(original_line+'\n')
315
316 if unspliced:
317 self.false_pos += 1
318 else:
319 self.true_pos += 1
320
321 ctr += 1
322 stop = cpu()
323 self.count_time = stop-start
324
325 _stop = cpu()
326 self.main_loop = _stop-_start
327
328 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
329 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
330 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
331
332
333 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
334
335 def signum(a):
336 if a>0:
337 return 1
338 elif a<0:
339 return -1
340 else:
341 return 0
342
343 proximal_acc = []
344 for idx in xrange(max_intron_size, max_intron_size+read_size/2):
345 if currentAcc[idx]>= splice_thresh:
346 proximal_acc.append((idx,currentAcc[idx]))
347
348 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
349 proximal_acc=proximal_acc[-2:]
350
351 distal_acc = []
352 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
353 if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
354 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
355
356 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
357 #distal_acc=distal_acc[-2:]
358
359
360 proximal_don = []
361 for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
362 if currentDon[idx] >= splice_thresh:
363 proximal_don.append((idx, currentDon[idx]))
364
365 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
366 proximal_don=proximal_don[-2:]
367
368 distal_don = []
369 for idx in xrange(1, max_intron_size):
370 if currentDon[idx] >= splice_thresh and idx>read_size:
371 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
372
373 distal_don.sort(lambda x,y: y[0]-x[0])
374 #distal_don=distal_don[-2:]
375
376 return proximal_acc,proximal_don,distal_acc,distal_don
377
378 def calcAlternativeAlignments(self,location):
379 """
380 Given an alignment proposed by Vmatch this function calculates possible
381 alternative alignments taking into account for example matched
382 donor/acceptor positions.
383 """
384
385 run = self.run
386
387 id = location['id']
388 chr = location['chr']
389 pos = location['pos']
390 strand = location['strand']
391 original_est = location['seq']
392 quality = location['prb']
393 cal_prb = location['cal_prb']
394
395 original_est = original_est.lower()
396 est = unbracket_est(original_est)
397 effective_len = len(est)
398
399 genomicSeq_start = pos - self.max_intron_size
400 genomicSeq_stop = pos + self.max_intron_size + len(est)
401
402 strand_map = {'D':'+', 'P':'-'}
403 strand = strand_map[strand]
404
405 start = cpu()
406 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
407 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
408 stop = cpu()
409 self.get_time += stop-start
410 dna = currentDNASeq
411
412 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
413
414 alternativeScores = []
415
416 # inlined
417 h = self.h
418 d = self.d
419 a = self.a
420 mmatrix = self.mmatrix
421 qualityPlifs = self.qualityPlifs
422 # inlined
423
424 # find an intron on the 3' end
425 _start = cpu()
426 for (don_pos,don_score) in proximal_don:
427 DonorScore = calculatePlif(d, [don_score])[0]
428
429 for (acc_pos,acc_score,acc_dna) in distal_acc:
430
431 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
432 AcceptorScore = calculatePlif(a, [acc_score])[0]
433
434 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
435
436 # construct a new "original_est"
437 original_est_cut=''
438
439 est_ptr=0
440 dna_ptr=self.max_intron_size
441 ptr=0
442 acc_dna_ptr=0
443 num_mismatch = 0
444
445 while ptr<len(original_est):
446 #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
447
448 if original_est[ptr]=='[':
449 dnaletter=original_est[ptr+1]
450 estletter=original_est[ptr+2]
451 if dna_ptr < don_pos:
452 original_est_cut+=original_est[ptr:ptr+4]
453 num_mismatch += 1
454 else:
455 if acc_dna[acc_dna_ptr]==estletter:
456 original_est_cut += estletter # EST letter
457 else:
458 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
459 num_mismatch += 1
460 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
461 acc_dna_ptr+=1
462 ptr+=4
463 else:
464 dnaletter=original_est[ptr]
465 estletter=dnaletter
466
467 if dna_ptr < don_pos:
468 original_est_cut+=estletter # EST letter
469 else:
470 if acc_dna[acc_dna_ptr]==estletter:
471 original_est_cut += estletter # EST letter
472 else:
473 num_mismatch += 1
474 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
475 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
476 acc_dna_ptr+=1
477
478 ptr+=1
479
480 if estletter=='-':
481 dna_ptr+=1
482 elif dnaletter=='-':
483 est_ptr+=1
484 else:
485 dna_ptr+=1
486 est_ptr+=1
487 if num_mismatch>self.max_mismatch:
488 continue
489
490 assert(dna_ptr<=len(dna))
491 assert(est_ptr<=len(est))
492
493 #print original_est, original_est_cut
494
495 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
496 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
497
498 alternativeScores.append(score)
499
500 if acc_score>=self.splice_stop_thresh:
501 break
502
503 _stop = cpu()
504 self.alternativeScoresTime += _stop-_start
505
506 # find an intron on the 5' end
507 _start = cpu()
508 for (acc_pos,acc_score) in proximal_acc:
509
510 AcceptorScore = calculatePlif(a, [acc_score])[0]
511
512 for (don_pos,don_score,don_dna) in distal_don:
513
514 DonorScore = calculatePlif(d, [don_score])[0]
515 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
516
517 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
518
519 # construct a new "original_est"
520 original_est_cut=''
521
522 est_ptr=0
523 dna_ptr=self.max_intron_size
524 ptr=0
525 num_mismatch = 0
526 don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
527 while ptr<len(original_est):
528
529 if original_est[ptr]=='[':
530 dnaletter=original_est[ptr+1]
531 estletter=original_est[ptr+2]
532 if dna_ptr > acc_pos:
533 original_est_cut+=original_est[ptr:ptr+4]
534 num_mismatch += 1
535 else:
536 if don_dna[don_dna_ptr]==estletter:
537 original_est_cut += estletter # EST letter
538 else:
539 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
540 num_mismatch += 1
541 #print '['+don_dna[don_dna_ptr]+estletter+']'
542 don_dna_ptr+=1
543 ptr+=4
544 else:
545 dnaletter=original_est[ptr]
546 estletter=dnaletter
547
548 if dna_ptr > acc_pos:
549 original_est_cut+=estletter # EST letter
550 else:
551 if don_dna[don_dna_ptr]==estletter:
552 original_est_cut += estletter # EST letter
553 else:
554 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
555 num_mismatch += 1
556 #print '('+don_dna[don_dna_ptr]+estletter+')'
557 don_dna_ptr+=1
558
559 ptr+=1
560
561 if estletter=='-':
562 dna_ptr+=1
563 elif dnaletter=='-':
564 est_ptr+=1
565 else:
566 dna_ptr+=1
567 est_ptr+=1
568
569 if num_mismatch>self.max_mismatch:
570 continue
571 assert(dna_ptr<=len(dna))
572 assert(est_ptr<=len(est))
573
574 #print original_est, original_est_cut
575
576 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
577 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
578
579 alternativeScores.append(score)
580
581 if don_score>=self.splice_stop_thresh:
582 break
583
584 _stop = cpu()
585 self.alternativeScoresTime += _stop-_start
586
587 return alternativeScores
588
589
590 def calcAlignmentScore(self,alignment):
591 """
592 Given an alignment (dna,exons,est) and the current parameter for QPalma
593 this function calculates the dot product of the feature representation of
594 the alignment and the parameter vector i.e the alignment score.
595 """
596
597 start = cpu()
598 run = self.run
599
600 # Lets start calculation
601 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
602
603 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
604
605 stop = cpu()
606 self.calcAlignmentScoreTime += stop-start
607
608 return score
609
610
611 def cpu():
612 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
613 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
614
615
616 if __name__ == '__main__':
617 if len(sys.argv) != 6:
618 print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
619 sys.exit(1)
620
621 data_fname = sys.argv[1]
622 param_fname = sys.argv[2]
623 run_fname = sys.argv[3]
624
625 result_spliced_fname = sys.argv[4]
626 result_unspliced_fname = sys.argv[5]
627
628 jp = os.path.join
629
630 ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
631
632 start = cpu()
633 ph1.filter()
634 stop = cpu()
635 print 'total time elapsed: %f' % (stop-start)
636
637 print 'time spend for get seq: %f' % ph1.get_time
638 print 'time spend for calcAlignmentScoreTime: %f' % ph1.calcAlignmentScoreTime
639 print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
640 print 'time spend for count time: %f' % ph1.count_time
641 print 'time spend for init time: %f' % ph1.init_time
642 print 'time spend for main_loop time: %f' % ph1.main_loop
643 print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
644
645 print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
646 print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
647 print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
648 print 'time spend forarray_stuff time: %f'% ph1.array_stuff