git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8663 e1793c9e...
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import resource
11
12 from qpalma.DataProc import *
13 from qpalma.computeSpliceWeights import *
14 from qpalma.set_param_palma import *
15 from qpalma.computeSpliceAlignWithQuality import *
16 from qpalma.penalty_lookup_new import *
17 from qpalma.compute_donacc import *
18 from qpalma.TrainingParam import Param
19 from qpalma.Plif import Plf
20
21 from qpalma.tools.splicesites import getDonAccScores
22 from qpalma.Configuration import *
23
24 from compile_dataset import getSpliceScores, get_seq_and_scores
25
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy import inf,mean
28
29 from qpalma.parsers import PipelineReadParser
30
31
32 def unbracket_est(est):
33 new_est = ''
34 e = 0
35
36 while True:
37 if e >= len(est):
38 break
39
40 if est[e] == '[':
41 new_est += est[e+2]
42 e += 4
43 else:
44 new_est += est[e]
45 e += 1
46
47 return "".join(new_est).lower()
48
49
50 class PipelineHeuristic:
51 """
52 This class wraps the filter which decides whether an alignment found by
53 vmatch is spliced an should be then newly aligned using QPalma or not.
54 """
55
56 def __init__(self,run_fname,data_fname,param_fname):
57 """
58 We need a run object holding information about the nr. of support points
59 etc.
60 """
61
62 run = cPickle.load(open(run_fname))
63 self.run = run
64
65 start = cpu()
66
67 self.data_fname = data_fname
68
69 self.param = cPickle.load(open(param_fname))
70
71 # Set the parameters such as limits penalties for the Plifs
72 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
73
74 self.h = h
75 self.d = d
76 self.a = a
77 self.mmatrix = mmatrix
78 self.qualityPlifs = qualityPlifs
79
80 # when we look for alternative alignments with introns this value is the
81 # mean of intron size
82 self.intron_size = 250
83
84 self.read_size = 36
85
86 self.original_reads = {}
87
88 for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
89 line = line.strip()
90 id,seq,q1,q2,q3 = line.split()
91 id = int(id)
92 self.original_reads[id] = seq
93
94 lengthSP = run['numLengthSuppPoints']
95 donSP = run['numDonSuppPoints']
96 accSP = run['numAccSuppPoints']
97 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
98 numq = run['numQualSuppPoints']
99 totalQualSP = run['totalQualSuppPoints']
100
101 currentPhi = zeros((run['numFeatures'],1))
102 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
103 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
104 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
105 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
106
107 totalQualityPenalties = self.param[-totalQualSP:]
108 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
109 self.currentPhi = currentPhi
110
111 # we want to identify spliced reads
112 # so true pos are spliced reads that are predicted "spliced"
113 self.true_pos = 0
114
115 # as false positives we count all reads that are not spliced but predicted
116 # as "spliced"
117 self.false_pos = 0
118
119 self.true_neg = 0
120 self.false_neg = 0
121
122 # total time spend for get seq and scores
123 self.get_time = 0.0
124 self.calcAlignmentScoreTime = 0.0
125 self.alternativeScoresTime = 0.0
126
127 self.count_time = 0.0
128 self.read_parsing = 0.0
129 self.main_loop = 0.0
130 self.splice_site_time = 0.0
131 self.computeSpliceAlignWithQualityTime = 0.0
132 self.computeSpliceWeightsTime = 0.0
133 self.DotProdTime = 0.0
134 self.array_stuff = 0.0
135 stop = cpu()
136
137 self.init_time = stop-start
138
139 def filter(self):
140 """
141 This method...
142 """
143 run = self.run
144
145 start = cpu()
146
147 rrp = PipelineReadParser(self.data_fname)
148 all_remapped_reads = rrp.parse()
149
150 stop = cpu()
151
152 self.read_parsing = stop-start
153
154 ctr = 0
155 unspliced_ctr = 0
156 spliced_ctr = 0
157
158 print 'Starting filtering...'
159 _start = cpu()
160
161 for readId,currentReadLocations in all_remapped_reads.items():
162 for location in currentReadLocations[:1]:
163
164 id = location['id']
165 chr = location['chr']
166 pos = location['pos']
167 strand = location['strand']
168 mismatch = location['mismatches']
169 length = location['length']
170 off = location['offset']
171 seq = location['seq']
172 prb = location['prb']
173 cal_prb = location['cal_prb']
174 chastity = location['chastity']
175
176 id = int(id)
177
178 if strand == '-':
179 continue
180
181 if ctr == 1000:
182 break
183
184 #if pos > 10000000:
185 # continue
186
187 unb_seq = unbracket_est(seq)
188 effective_len = len(unb_seq)
189
190 genomicSeq_start = pos
191 genomicSeq_stop = pos+effective_len-1
192
193 start = cpu()
194 #print genomicSeq_start,genomicSeq_stop
195 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
196 stop = cpu()
197 self.get_time += stop-start
198
199 dna = currentDNASeq
200 exons = zeros((2,1))
201 exons[0,0] = 0
202 exons[1,0] = effective_len
203 est = unb_seq
204 original_est = seq
205 quality = prb
206
207 #pdb.set_trace()
208
209 currentVMatchAlignment = dna, exons, est, original_est, quality,\
210 currentAcc, currentDon
211 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
212
213 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
214
215 start = cpu()
216 # found no alternatives
217 if alternativeAlignmentScores == []:
218 continue
219
220 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
221 #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
222 #print 'all candidates %s' % str(alternativeAlignmentScores)
223
224 new_id = id - 1000000300000
225
226 unspliced = False
227 # unspliced
228 if new_id > 0:
229 unspliced = True
230
231 # Seems that according to our learned parameters VMatch found a good
232 # alignment of the current read
233 if maxAlternativeAlignmentScore < vMatchScore:
234 unspliced_ctr += 1
235
236 if unspliced:
237 self.true_neg += 1
238 else:
239 self.false_neg += 1
240
241 # We found an alternative alignment considering splice sites that scores
242 # higher than the VMatch alignment
243 else:
244 spliced_ctr += 1
245
246 if unspliced:
247 self.false_pos += 1
248 else:
249 self.true_pos += 1
250
251 ctr += 1
252 stop = cpu()
253 self.count_time = stop-start
254
255 _stop = cpu()
256 self.main_loop = _stop-_start
257
258 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
259 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
260 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
261
262
263 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
264
265 def signum(a):
266 if a>0:
267 return 1
268 elif a<0:
269 return -1
270 else:
271 return 0
272
273 proximal_acc = []
274 for idx in xrange(max_intron_size, max_intron_size+read_size):
275 if currentAcc[idx]>= splice_thresh:
276 proximal_acc.append((idx,currentAcc[idx]))
277
278 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
279 proximal_acc=proximal_acc[-2:]
280
281 distal_acc = []
282 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
283 if currentAcc[idx]>= splice_thresh:
284 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
285
286 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
287 #distal_acc=distal_acc[-2:]
288
289
290 proximal_don = []
291 for idx in xrange(max_intron_size, max_intron_size+read_size):
292 if currentDon[idx] >= splice_thresh:
293 proximal_don.append((idx, currentDon[idx]))
294
295 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
296 proximal_don=proximal_don[-2:]
297
298 distal_don = []
299 for idx in xrange(1, max_intron_size):
300 if currentDon[idx] >= splice_thresh:
301 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
302
303 #distal_don.sort(lambda x,y: signum(x[1]-y[1]))
304 #distal_don=distal_don[-2:]
305
306 return proximal_acc,proximal_don,distal_acc,distal_don
307
308 def calcAlternativeAlignments(self,location):
309 """
310 Given an alignment proposed by Vmatch this function calculates possible
311 alternative alignments taking into account for example matched
312 donor/acceptor positions.
313 """
314
315 run = self.run
316 splice_thresh = 0.1
317 max_intron_size = 2000
318
319 id = location['id']
320 chr = location['chr']
321 pos = location['pos']
322 strand = location['strand']
323 original_est = location['seq']
324 quality = location['prb']
325 cal_prb = location['cal_prb']
326
327 est = unbracket_est(original_est)
328 effective_len = len(est)
329
330 genomicSeq_start = pos - max_intron_size
331 genomicSeq_stop = pos + max_intron_size + len(est)
332
333 start = cpu()
334 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
335 stop = cpu()
336 self.get_time += stop-start
337 dna = currentDNASeq
338
339 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc,currentDon, dna, max_intron_size, len(est), splice_thresh)
340
341 print proximal_acc
342 print proximal_don
343 print distal_acc
344 print distal_don
345
346 alternativeScores = []
347
348 # inlined
349 h = self.h
350 d = self.d
351 a = self.a
352 mmatrix = self.mmatrix
353 qualityPlifs = self.qualityPlifs
354 # inlined
355
356 # compute dummy scores
357 #IntronScore = calculatePlif(h, [math.fabs(max_acc_pos-30)])[0]
358 #dummyAcceptorScore = calculatePlif(a, [max_acc_score])[0]
359
360 _start = cpu()
361 for (don_pos,don_score) in proximal_don:
362
363 for (acc_pos,acc_score,acc_dna) in distal_acc:
364
365 DonorScore = calculatePlif(d, [don_score])[0]
366 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
367 AcceptorScore = calculatePlif(a, [acc_score])[0]
368
369 print 'splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
370
371 # construct a new "original_est"
372 original_est_cut=''
373
374 est_ptr=0
375 dna_ptr=max_intron_size
376 ptr=0
377 acc_dna_ptr=0
378 while ptr<len(original_est):
379
380 if original_est[ptr]=='[':
381 dnaletter=original_est[ptr+1]
382 estletter=original_est[ptr+2]
383 if dna_ptr < don_pos:
384 original_est_cut+=original_est[ptr:ptr+4]
385 else:
386 if acc_dna[acc_dna_ptr]==estletter:
387 original_est_cut += estletter # EST letter
388 else:
389 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
390 print '['+acc_dna[acc_dna_ptr]+estletter+']'
391 acc_dna_ptr+=1
392 ptr+=4
393 else:
394 dnaletter=original_est[ptr]
395 estletter=dnaletter
396
397 original_est_cut+=estletter # EST letter
398 ptr+=1
399
400 if estletter=='-':
401 dna_ptr+=1
402 elif dnaletter=='-':
403 est_ptr+=1
404 else:
405 dna_ptr+=1
406 est_ptr+=1
407
408 assert(dna_ptr<=len(dna))
409 assert(est_ptr<=len(est))
410
411 print original_est, original_est_cut
412
413 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
414 score += AcceptorScore + IntronScore + DonorScore
415
416 alternativeScores.append(score)
417
418 _stop = cpu()
419 self.alternativeScoresTime += _stop-_start
420
421 _start = cpu()
422 for (acc_pos,acc_score) in alt_acc:
423 # remove mismatching positions in the second exon
424 original_est_cut=''
425
426 est_ptr=0
427 dna_ptr=0
428 ptr=0
429 while ptr<len(original_est):
430
431 if original_est[ptr]=='[':
432 dnaletter=original_est[ptr+1]
433 estletter=original_est[ptr+2]
434 if est_ptr>=acc_pos:
435 original_est_cut+=original_est[ptr:ptr+4]
436 else:
437 original_est_cut+=estletter # EST letter
438 ptr+=4
439 else:
440 dnaletter=original_est[ptr]
441 estletter=dnaletter
442
443 original_est_cut+=estletter # EST letter
444 ptr+=1
445
446 if estletter=='-':
447 dna_ptr+=1
448 elif dnaletter=='-':
449 est_ptr+=1
450 else:
451 dna_ptr+=1
452 est_ptr+=1
453
454 assert(dna_ptr<=len(dna))
455 assert(est_ptr<=len(est))
456
457 #print original_est,original_est_cut
458
459 AcceptorScore = calculatePlif(d, [acc_score])[0]
460 #print "Acceptor"
461 #print AcceptorScore,acc_score,acc_pos
462
463 #if acc_score<0.1:
464 # print currentAcc[0:50]
465 # print currentDon[0:50]
466
467 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
468 score += AcceptorScore + IntronScore + dummyDonorScore
469
470 #print 'diff %f,%f,%f' % ((trueWeight.T * self.currentPhi)[0,0] - score,(trueWeight.T * self.currentPhi)[0,0], score)
471 alternativeScores.append(score)
472
473 _stop = cpu()
474 self.alternativeScoresTime += _stop-_start
475
476 return alternativeScores
477
478
479 def calcAlignmentScore(self,alignment):
480 """
481 Given an alignment (dna,exons,est) and the current parameter for QPalma
482 this function calculates the dot product of the feature representation of
483 the alignment and the parameter vector i.e the alignment score.
484 """
485
486 start = cpu()
487 run = self.run
488
489 # Lets start calculation
490 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
491
492 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
493
494 stop = cpu()
495 self.calcAlignmentScoreTime += stop-start
496
497 return score
498
499
500 def cpu():
501 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
502 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
503
504
505 if __name__ == '__main__':
506 #run_fname = sys.argv[1]
507 #data_fname = sys.argv[2]
508 #param_filename = sys.argv[3]
509
510 dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
511 jp = os.path.join
512
513 run_fname = jp(dir,'run_object.pickle')
514 #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'
515
516 data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
517 #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
518
519 param_fname = jp(dir,'param_500.pickle')
520
521 ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
522
523 start = cpu()
524 ph1.filter()
525 stop = cpu()
526
527 print 'total time elapsed: %f' % (stop-start)
528 print 'time spend for get seq: %f' % ph1.get_time
529 print 'time spend for calcAlignmentScoreTime: %f' % ph1.calcAlignmentScoreTime
530 print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
531 print 'time spend for count time: %f' % ph1.count_time
532 print 'time spend for init time: %f' % ph1.init_time
533 print 'time spend for read_parsing time: %f' % ph1.read_parsing
534 print 'time spend for main_loop time: %f' % ph1.main_loop
535 print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
536
537 print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
538 print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
539 print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
540 print 'time spend forarray_stuff time: %f'% ph1.array_stuff
541 #import cProfile
542 #cProfile.run('ph1.filter()')
543
544 #import hotshot
545 #p = hotshot.Profile('profile.log')
546 #p.runcall(ph1.filter)