+ saving results in two files (spliced/unspliced)
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import resource
11
12 from qpalma.DataProc import *
13 from qpalma.computeSpliceWeights import *
14 from qpalma.set_param_palma import *
15 from qpalma.computeSpliceAlignWithQuality import *
16 from qpalma.penalty_lookup_new import *
17 from qpalma.compute_donacc import *
18 from qpalma.TrainingParam import Param
19 from qpalma.Plif import Plf
20
21 #from qpalma.tools.splicesites import getDonAccScores
22 from qpalma.Configuration import *
23
24 from compile_dataset import getSpliceScores, get_seq_and_scores
25
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy import inf,mean
28
29 from qpalma.parsers import PipelineReadParser
30
31
32 def unbracket_est(est):
33 new_est = ''
34 e = 0
35
36 while True:
37 if e >= len(est):
38 break
39
40 if est[e] == '[':
41 new_est += est[e+2]
42 e += 4
43 else:
44 new_est += est[e]
45 e += 1
46
47 return "".join(new_est).lower()
48
49
50 class PipelineHeuristic:
51 """
52 This class wraps the filter which decides whether an alignment found by
53 vmatch is spliced an should be then newly aligned using QPalma or not.
54 """
55
56 def __init__(self,run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname):
57 """
58 We need a run object holding information about the nr. of support points
59 etc.
60 """
61
62 run = cPickle.load(open(run_fname))
63 self.run = run
64
65 self.result_spliced_fh = open(result_spliced_fname,'w+')
66 self.result_unspliced_fh = open(result_unspliced_fname,'w+')
67
68 start = cpu()
69
70 self.data_fname = data_fname
71
72 self.param = cPickle.load(open(param_fname))
73
74 # Set the parameters such as limits penalties for the Plifs
75 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
76
77 self.h = h
78 self.d = d
79 self.a = a
80 self.mmatrix = mmatrix
81 self.qualityPlifs = qualityPlifs
82
83 # when we look for alternative alignments with introns this value is the
84 # mean of intron size
85 self.intron_size = 250
86
87 self.read_size = 36
88
89 self.original_reads = {}
90
91 for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
92 line = line.strip()
93 id,seq,q1,q2,q3 = line.split()
94 id = int(id)
95 self.original_reads[id] = seq
96
97 lengthSP = run['numLengthSuppPoints']
98 donSP = run['numDonSuppPoints']
99 accSP = run['numAccSuppPoints']
100 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
101 numq = run['numQualSuppPoints']
102 totalQualSP = run['totalQualSuppPoints']
103
104 currentPhi = zeros((run['numFeatures'],1))
105 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
106 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
107 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
108 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
109
110 totalQualityPenalties = self.param[-totalQualSP:]
111 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
112 self.currentPhi = currentPhi
113
114 # we want to identify spliced reads
115 # so true pos are spliced reads that are predicted "spliced"
116 self.true_pos = 0
117
118 # as false positives we count all reads that are not spliced but predicted
119 # as "spliced"
120 self.false_pos = 0
121
122 self.true_neg = 0
123 self.false_neg = 0
124
125 # total time spend for get seq and scores
126 self.get_time = 0.0
127 self.calcAlignmentScoreTime = 0.0
128 self.alternativeScoresTime = 0.0
129
130 self.count_time = 0.0
131 self.read_parsing = 0.0
132 self.main_loop = 0.0
133 self.splice_site_time = 0.0
134 self.computeSpliceAlignWithQualityTime = 0.0
135 self.computeSpliceWeightsTime = 0.0
136 self.DotProdTime = 0.0
137 self.array_stuff = 0.0
138 stop = cpu()
139
140 self.init_time = stop-start
141
142 def filter(self):
143 """
144 This method...
145 """
146 run = self.run
147
148 start = cpu()
149
150 rrp = PipelineReadParser(self.data_fname)
151 all_remapped_reads = rrp.parse()
152
153 stop = cpu()
154
155 self.read_parsing = stop-start
156
157 ctr = 0
158 unspliced_ctr = 0
159 spliced_ctr = 0
160
161 print 'Starting filtering...'
162 _start = cpu()
163
164 for readId,currentReadLocations in all_remapped_reads.items():
165 for location in currentReadLocations[:1]:
166
167 id = location['id']
168 chr = location['chr']
169 pos = location['pos']
170 strand = location['strand']
171 mismatch = location['mismatches']
172 length = location['length']
173 off = location['offset']
174 seq = location['seq']
175 prb = location['prb']
176 cal_prb = location['cal_prb']
177 chastity = location['chastity']
178
179 id = int(id)
180
181 strand_map = {'+':'D', '-':'P'}
182
183
184 original_line = '%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n' %\
185 (id,chr,pos,strand_map[strand],int(mismatch),int(length),int(off),seq.upper(),prb,cal_prb,chastity)
186
187
188 if strand == '-':
189 continue
190
191 if ctr == 1000:
192 break
193
194 #if pos > 10000000:
195 # continue
196
197 unb_seq = unbracket_est(seq)
198 effective_len = len(unb_seq)
199
200 genomicSeq_start = pos
201 genomicSeq_stop = pos+effective_len-1
202
203 start = cpu()
204 #print genomicSeq_start,genomicSeq_stop
205 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
206 stop = cpu()
207 self.get_time += stop-start
208
209 dna = currentDNASeq
210 exons = zeros((2,1))
211 exons[0,0] = 0
212 exons[1,0] = effective_len
213 est = unb_seq
214 original_est = seq
215 quality = prb
216
217 #pdb.set_trace()
218
219 currentVMatchAlignment = dna, exons, est, original_est, quality,\
220 currentAcc, currentDon
221 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
222
223 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
224
225 start = cpu()
226 # found no alternatives
227 #if alternativeAlignmentScores == []:
228 # continue
229
230 if alternativeAlignmentScores == []:
231 maxAlternativeAlignmentScore = -inf
232 else:
233 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
234 #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
235 #print 'all candidates %s' % str(alternativeAlignmentScores)
236
237 new_id = id - 1000000300000
238
239 unspliced = False
240 # unspliced
241 if new_id > 0:
242 unspliced = True
243
244 # Seems that according to our learned parameters VMatch found a good
245 # alignment of the current read
246 if maxAlternativeAlignmentScore < vMatchScore:
247 unspliced_ctr += 1
248
249 self.result_unspliced_fh.write(original_line)
250
251 if unspliced:
252 self.true_neg += 1
253 else:
254 self.false_neg += 1
255
256 # We found an alternative alignment considering splice sites that scores
257 # higher than the VMatch alignment
258 else:
259 spliced_ctr += 1
260
261 self.result_spliced_fh.write(original_line)
262
263 if unspliced:
264 self.false_pos += 1
265 else:
266 self.true_pos += 1
267
268 ctr += 1
269 stop = cpu()
270 self.count_time = stop-start
271
272 _stop = cpu()
273 self.main_loop = _stop-_start
274
275 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
276 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
277 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
278
279
280 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
281
282 def signum(a):
283 if a>0:
284 return 1
285 elif a<0:
286 return -1
287 else:
288 return 0
289
290 proximal_acc = []
291 for idx in xrange(max_intron_size, max_intron_size+read_size):
292 if currentAcc[idx]>= splice_thresh:
293 proximal_acc.append((idx,currentAcc[idx]))
294
295 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
296 proximal_acc=proximal_acc[-2:]
297
298 distal_acc = []
299 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
300 if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
301 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
302
303 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
304 #distal_acc=distal_acc[-2:]
305
306
307 proximal_don = []
308 for idx in xrange(max_intron_size, max_intron_size+read_size):
309 if currentDon[idx] >= splice_thresh:
310 proximal_don.append((idx, currentDon[idx]))
311
312 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
313 proximal_don=proximal_don[-2:]
314
315 distal_don = []
316 for idx in xrange(1, max_intron_size):
317 if currentDon[idx] >= splice_thresh and idx>read_size:
318 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
319
320 distal_don.sort(lambda x,y: y[0]-x[0])
321 #distal_don=distal_don[-2:]
322
323 return proximal_acc,proximal_don,distal_acc,distal_don
324
325 def calcAlternativeAlignments(self,location):
326 """
327 Given an alignment proposed by Vmatch this function calculates possible
328 alternative alignments taking into account for example matched
329 donor/acceptor positions.
330 """
331
332 run = self.run
333 splice_thresh = 0.1
334 max_intron_size = 2000
335 bias = 0.0
336
337 id = location['id']
338 chr = location['chr']
339 pos = location['pos']
340 strand = location['strand']
341 original_est = location['seq']
342 quality = location['prb']
343 cal_prb = location['cal_prb']
344
345 est = unbracket_est(original_est)
346 effective_len = len(est)
347
348 genomicSeq_start = pos - max_intron_size
349 genomicSeq_stop = pos + max_intron_size + len(est)
350
351 start = cpu()
352 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
353 stop = cpu()
354 self.get_time += stop-start
355 dna = currentDNASeq
356
357 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
358
359 #print proximal_acc
360 #print proximal_don
361 #print distal_acc
362 #print distal_don
363
364 alternativeScores = []
365
366 # inlined
367 h = self.h
368 d = self.d
369 a = self.a
370 mmatrix = self.mmatrix
371 qualityPlifs = self.qualityPlifs
372 # inlined
373
374 _start = cpu()
375 for (don_pos,don_score) in proximal_don:
376 for (acc_pos,acc_score,acc_dna) in distal_acc:
377
378 DonorScore = calculatePlif(d, [don_score])[0]
379 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
380 AcceptorScore = calculatePlif(a, [acc_score])[0]
381
382 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
383
384 # construct a new "original_est"
385 original_est_cut=''
386
387 est_ptr=0
388 dna_ptr=max_intron_size
389 ptr=0
390 acc_dna_ptr=0
391 while ptr<len(original_est):
392
393 if original_est[ptr]=='[':
394 dnaletter=original_est[ptr+1]
395 estletter=original_est[ptr+2]
396 if dna_ptr < don_pos:
397 original_est_cut+=original_est[ptr:ptr+4]
398 else:
399 if acc_dna[acc_dna_ptr]==estletter:
400 original_est_cut += estletter # EST letter
401 else:
402 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
403 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
404 acc_dna_ptr+=1
405 ptr+=4
406 else:
407 dnaletter=original_est[ptr]
408 estletter=dnaletter
409
410 if dna_ptr < don_pos:
411 original_est_cut+=estletter # EST letter
412 else:
413 if acc_dna[acc_dna_ptr]==estletter:
414 original_est_cut += estletter # EST letter
415 else:
416 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
417 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
418 acc_dna_ptr+=1
419
420 ptr+=1
421
422 if estletter=='-':
423 dna_ptr+=1
424 elif dnaletter=='-':
425 est_ptr+=1
426 else:
427 dna_ptr+=1
428 est_ptr+=1
429
430 assert(dna_ptr<=len(dna))
431 assert(est_ptr<=len(est))
432
433 #print original_est, original_est_cut
434
435 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
436 score += AcceptorScore + IntronScore + DonorScore + bias
437
438 alternativeScores.append(score)
439
440 _stop = cpu()
441 self.alternativeScoresTime += _stop-_start
442
443 # acceptor
444 _start = cpu()
445 for (acc_pos,acc_score) in proximal_acc:
446 for (don_pos,don_score,don_dna) in distal_don:
447
448 DonorScore = calculatePlif(d, [don_score])[0]
449 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
450 AcceptorScore = calculatePlif(a, [acc_score])[0]
451
452 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
453
454 # construct a new "original_est"
455 original_est_cut=''
456
457 est_ptr=0
458 dna_ptr=max_intron_size
459 ptr=0
460 don_dna_ptr=len(don_dna)-(acc_pos-max_intron_size)-1
461 while ptr<len(original_est):
462
463 if original_est[ptr]=='[':
464 dnaletter=original_est[ptr+1]
465 estletter=original_est[ptr+2]
466 if dna_ptr > acc_pos:
467 original_est_cut+=original_est[ptr:ptr+4]
468 else:
469 if don_dna[don_dna_ptr]==estletter:
470 original_est_cut += estletter # EST letter
471 else:
472 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
473 #print '['+don_dna[don_dna_ptr]+estletter+']'
474 don_dna_ptr+=1
475 ptr+=4
476 else:
477 dnaletter=original_est[ptr]
478 estletter=dnaletter
479
480 if dna_ptr > acc_pos:
481 original_est_cut+=estletter # EST letter
482 else:
483 if don_dna[don_dna_ptr]==estletter:
484 original_est_cut += estletter # EST letter
485 else:
486 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
487 #print '('+don_dna[don_dna_ptr]+estletter+')'
488 don_dna_ptr+=1
489
490 ptr+=1
491
492 if estletter=='-':
493 dna_ptr+=1
494 elif dnaletter=='-':
495 est_ptr+=1
496 else:
497 dna_ptr+=1
498 est_ptr+=1
499
500 assert(dna_ptr<=len(dna))
501 assert(est_ptr<=len(est))
502
503 #print original_est, original_est_cut
504
505 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
506 score += AcceptorScore + IntronScore + DonorScore + bias
507
508 alternativeScores.append(score)
509
510 _stop = cpu()
511 self.alternativeScoresTime += _stop-_start
512
513 return alternativeScores
514
515
516 def calcAlignmentScore(self,alignment):
517 """
518 Given an alignment (dna,exons,est) and the current parameter for QPalma
519 this function calculates the dot product of the feature representation of
520 the alignment and the parameter vector i.e the alignment score.
521 """
522
523 start = cpu()
524 run = self.run
525
526 # Lets start calculation
527 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
528
529 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
530
531 stop = cpu()
532 self.calcAlignmentScoreTime += stop-start
533
534 return score
535
536
537 def cpu():
538 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
539 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
540
541
542 if __name__ == '__main__':
543 #run_fname = sys.argv[1]
544 #data_fname = sys.argv[2]
545 #param_filename = sys.argv[3]
546
547 dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
548 jp = os.path.join
549
550 run_fname = jp(dir,'run_object.pickle')
551 #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'
552
553 data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
554 #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
555
556 param_fname = jp(dir,'param_500.pickle')
557
558 result_spliced_fname = 'splicedReads.heuristic'
559 result_unspliced_fname = 'unsplicedReads.heuristic'
560
561 ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
562
563 start = cpu()
564 ph1.filter()
565 stop = cpu()
566
567 print 'total time elapsed: %f' % (stop-start)
568 print 'time spend for get seq: %f' % ph1.get_time
569 print 'time spend for calcAlignmentScoreTime: %f' % ph1.calcAlignmentScoreTime
570 print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
571 print 'time spend for count time: %f' % ph1.count_time
572 print 'time spend for init time: %f' % ph1.init_time
573 print 'time spend for read_parsing time: %f' % ph1.read_parsing
574 print 'time spend for main_loop time: %f' % ph1.main_loop
575 print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
576
577 print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
578 print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
579 print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
580 print 'time spend forarray_stuff time: %f'% ph1.array_stuff
581 #import cProfile
582 #cProfile.run('ph1.filter()')
583
584 #import hotshot
585 #p = hotshot.Profile('profile.log')
586 #p.runcall(ph1.filter)