47dae451475e0fd9c11482746907aa53d74bbd7a
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import resource
10
11 from qpalma.computeSpliceWeights import *
12 from qpalma.set_param_palma import *
13 from qpalma.computeSpliceAlignWithQuality import *
14
15 from numpy.matlib import mat,zeros,ones,inf
16 from numpy import inf,mean
17
18 from ParaParser import ParaParser,IN_VECTOR
19
20 from qpalma.Lookup import LookupTable
21 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
22
23
24 def cpu():
25 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
26 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
27
28
29 class BracketWrapper:
30 fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
31
32 def __init__(self,filename):
33 self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
34 self.parser.parseFile(filename)
35
36 def __len__(self):
37 return self.parser.getSize(IN_VECTOR)
38
39 def __getitem__(self,key):
40 return self.parser.fetchEntry(key)
41
42 def __iter__(self):
43 self.counter = 0
44 self.size = self.parser.getSize(IN_VECTOR)
45 return self
46
47 def next(self):
48 if not self.counter < self.size:
49 raise StopIteration
50 return_val = self.parser.fetchEntry(self.counter)
51 self.counter += 1
52 return return_val
53
54
55 class PipelineHeuristic:
56 """
57 This class wraps the filter which decides whether an alignment found by
58 vmatch is spliced an should be then newly aligned using QPalma or not.
59 """
60
61 def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
62 """
63 We need a run object holding information about the nr. of support points
64 etc.
65 """
66
67 run = cPickle.load(open(run_fname))
68 self.run = run
69
70 start = cpu()
71 self.all_remapped_reads = BracketWrapper(data_fname)
72 stop = cpu()
73
74 print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
75
76 self.chromo_range = settings['allowed_fragments']
77
78 accessWrapper = DataAccessWrapper(settings)
79 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
80
81 start = cpu()
82 self.lt1 = LookupTable(settings)
83 stop = cpu()
84
85 print 'prefetched sequence and splice data in %f sec' % (stop-start)
86
87 self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
88 self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
89
90 start = cpu()
91
92 self.data_fname = data_fname
93
94 self.param = cPickle.load(open(param_fname))
95
96 # Set the parameters such as limits penalties for the Plifs
97 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
98
99 self.h = h
100 self.d = d
101 self.a = a
102 self.mmatrix = mmatrix
103 self.qualityPlifs = qualityPlifs
104
105 self.read_size = 38
106 self.prb_offset = 64
107 #self.prb_offset = 50
108
109 # parameters of the heuristics to decide whether the read is spliced
110 #self.splice_thresh = 0.005
111 self.splice_thresh = 0.01
112 self.max_intron_size = 2000
113 self.max_mismatch = 2
114 #self.splice_stop_thresh = 0.99
115 self.splice_stop_thresh = 0.8
116 self.spliced_bias = 0.0
117
118 param_lines = [\
119 ('%f','splice_thresh',self.splice_thresh),\
120 ('%d','max_intron_size',self.max_intron_size),\
121 ('%d','max_mismatch',self.max_mismatch),\
122 ('%f','splice_stop_thresh',self.splice_stop_thresh),\
123 ('%f','spliced_bias',self.spliced_bias)]
124
125 param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
126 param_lines.append('# data: %s\n'%self.data_fname)
127 param_lines.append('# param: %s\n'%param_fname)
128
129 #pdb.set_trace()
130
131 for p_line in param_lines:
132 self.result_spliced_fh.write(p_line)
133 self.result_unspliced_fh.write(p_line)
134
135 #self.original_reads = {}
136
137 # we do not have this information
138 #for line in open(reads_pipeline_fn):
139 # line = line.strip()
140 # id,seq,q1,q2,q3 = line.split()
141 # id = int(id)
142 # self.original_reads[id] = seq
143
144 lengthSP = run['numLengthSuppPoints']
145 donSP = run['numDonSuppPoints']
146 accSP = run['numAccSuppPoints']
147 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
148 numq = run['numQualSuppPoints']
149 totalQualSP = run['totalQualSuppPoints']
150
151 currentPhi = zeros((run['numFeatures'],1))
152 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
153 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
154 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
155 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
156
157 totalQualityPenalties = self.param[-totalQualSP:]
158 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
159 self.currentPhi = currentPhi
160
161 # we want to identify spliced reads
162 # so true pos are spliced reads that are predicted "spliced"
163 self.true_pos = 0
164
165 # as false positives we count all reads that are not spliced but predicted
166 # as "spliced"
167 self.false_pos = 0
168
169 self.true_neg = 0
170 self.false_neg = 0
171
172 # total time spend for get seq and scores
173 self.get_time = 0.0
174 self.calcAlignmentScoreTime = 0.0
175 self.alternativeScoresTime = 0.0
176
177 self.count_time = 0.0
178 self.main_loop = 0.0
179 self.splice_site_time = 0.0
180 self.computeSpliceAlignWithQualityTime = 0.0
181 self.computeSpliceWeightsTime = 0.0
182 self.DotProdTime = 0.0
183 self.array_stuff = 0.0
184 stop = cpu()
185
186 self.init_time = stop-start
187
188 def filter(self):
189 """
190 This method...
191 """
192 run = self.run
193
194 ctr = 0
195 unspliced_ctr = 0
196 spliced_ctr = 0
197
198 print 'Starting filtering...'
199 _start = cpu()
200
201 for location,original_line in self.all_remapped_reads:
202
203 if ctr % 1000 == 0:
204 print ctr
205
206 id = location['id']
207 chr = location['chr']
208 pos = location['pos']
209 strand = location['strand']
210 seq = location['seq']
211 prb = location['prb']
212
213 id = int(id)
214
215 seq = seq.lower()
216
217 strand_map = {'D':'+', 'P':'-'}
218
219 strand = strand_map[strand]
220
221 if not chr in self.chromo_range:
222 continue
223
224 unb_seq = unbracket_seq(seq)
225
226 # forgot to do this
227 if strand == '-':
228 unb_seq = reverse_complement(unb_seq)
229 seq = reverse_complement(seq)
230
231 effective_len = len(unb_seq)
232
233 genomicSeq_start = pos
234 #genomicSeq_stop = pos+effective_len-1
235 genomicSeq_stop = pos+effective_len
236
237 start = cpu()
238 #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
239 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
240
241 # just performed some tests to check LookupTable results
242 #assert currentDNASeq_ == currentDNASeq
243 #assert currentAcc_ == currentAcc_
244 #assert currentDon_ == currentDon_
245
246 stop = cpu()
247 self.get_time += stop-start
248
249 dna = currentDNASeq
250 exons = zeros((2,1))
251 exons[0,0] = 0
252 exons[1,0] = effective_len
253 est = unb_seq
254 original_est = seq
255 quality = map(lambda x:ord(x)-self.prb_offset,prb)
256
257 currentVMatchAlignment = dna, exons, est, original_est, quality,\
258 currentAcc, currentDon
259
260 #pdb.set_trace()
261
262 #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
263
264 try:
265 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
266 except:
267 alternativeAlignmentScores = []
268
269 if alternativeAlignmentScores == []:
270 # no alignment necessary
271 maxAlternativeAlignmentScore = -inf
272 vMatchScore = 0.0
273 else:
274 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
275 # compute alignment for vmatch unspliced read
276 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
277 #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
278
279
280 start = cpu()
281
282 #print 'all candidates %s' % str(alternativeAlignmentScores)
283
284 new_id = id - 1000000300000
285
286 unspliced = False
287 # unspliced
288 if new_id > 0:
289 unspliced = True
290
291 # Seems that according to our learned parameters VMatch found a good
292 # alignment of the current read
293 if maxAlternativeAlignmentScore < vMatchScore:
294 unspliced_ctr += 1
295
296 self.result_unspliced_fh.write(original_line+'\n')
297
298 if unspliced:
299 self.true_neg += 1
300 else:
301 self.false_neg += 1
302
303 # We found an alternative alignment considering splice sites that scores
304 # higher than the VMatch alignment
305 else:
306 spliced_ctr += 1
307
308 self.result_spliced_fh.write(original_line+'\n')
309
310 if unspliced:
311 self.false_pos += 1
312 else:
313 self.true_pos += 1
314
315 ctr += 1
316 stop = cpu()
317 self.count_time = stop-start
318
319 _stop = cpu()
320 self.main_loop = _stop-_start
321
322 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
323 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
324 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
325
326
327 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
328
329 def signum(a):
330 if a>0:
331 return 1
332 elif a<0:
333 return -1
334 else:
335 return 0
336
337 proximal_acc = []
338 for idx in xrange(max_intron_size, max_intron_size+read_size/2):
339 if currentAcc[idx]>= splice_thresh:
340 proximal_acc.append((idx,currentAcc[idx]))
341
342 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
343 proximal_acc=proximal_acc[-2:]
344
345 distal_acc = []
346 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
347 if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
348 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
349
350 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
351 #distal_acc=distal_acc[-2:]
352
353 proximal_don = []
354 for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
355 if currentDon[idx] >= splice_thresh:
356 proximal_don.append((idx, currentDon[idx]))
357
358 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
359 proximal_don=proximal_don[-2:]
360
361 distal_don = []
362 for idx in xrange(1, max_intron_size):
363 if currentDon[idx] >= splice_thresh and idx>read_size:
364 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
365
366 distal_don.sort(lambda x,y: y[0]-x[0])
367 #distal_don=distal_don[-2:]
368
369 return proximal_acc,proximal_don,distal_acc,distal_don
370
371
372 def calcAlternativeAlignments(self,location):
373 """
374 Given an alignment proposed by Vmatch this function calculates possible
375 alternative alignments taking into account for example matched
376 donor/acceptor positions.
377 """
378
379 run = self.run
380
381 id = location['id']
382 chr = location['chr']
383 pos = location['pos']
384 strand = location['strand']
385 original_est = location['seq']
386 quality = location['prb']
387 quality = map(lambda x:ord(x)-self.prb_offset,quality)
388
389 original_est = original_est.lower()
390 est = unbracket_seq(original_est)
391 effective_len = len(est)
392
393 genomicSeq_start = pos - self.max_intron_size
394 genomicSeq_stop = pos + self.max_intron_size + len(est)
395
396 strand_map = {'D':'+', 'P':'-'}
397 strand = strand_map[strand]
398
399 if strand == '-':
400 est = reverse_complement(est)
401 original_est = reverse_complement(original_est)
402
403 start = cpu()
404 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
405 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
406 stop = cpu()
407 self.get_time += stop-start
408 dna = currentDNASeq
409
410 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
411
412 alternativeScores = []
413
414 #pdb.set_trace()
415
416 # inlined
417 h = self.h
418 d = self.d
419 a = self.a
420 mmatrix = self.mmatrix
421 qualityPlifs = self.qualityPlifs
422 # inlined
423
424 # find an intron on the 3' end
425 _start = cpu()
426 for (don_pos,don_score) in proximal_don:
427 DonorScore = calculatePlif(d, [don_score])[0]
428
429 for (acc_pos,acc_score,acc_dna) in distal_acc:
430
431 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
432 AcceptorScore = calculatePlif(a, [acc_score])[0]
433
434 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
435
436 # construct a new "original_est"
437 original_est_cut=''
438
439 est_ptr=0
440 dna_ptr=self.max_intron_size
441 ptr=0
442 acc_dna_ptr=0
443 num_mismatch = 0
444
445 while ptr<len(original_est):
446 #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
447
448 if original_est[ptr]=='[':
449 dnaletter=original_est[ptr+1]
450 estletter=original_est[ptr+2]
451 if dna_ptr < don_pos:
452 original_est_cut+=original_est[ptr:ptr+4]
453 num_mismatch += 1
454 else:
455 if acc_dna[acc_dna_ptr]==estletter:
456 original_est_cut += estletter # EST letter
457 else:
458 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
459 num_mismatch += 1
460 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
461 acc_dna_ptr+=1
462 ptr+=4
463 else:
464 dnaletter=original_est[ptr]
465 estletter=dnaletter
466
467 if dna_ptr < don_pos:
468 original_est_cut+=estletter # EST letter
469 else:
470 if acc_dna[acc_dna_ptr]==estletter:
471 original_est_cut += estletter # EST letter
472 else:
473 num_mismatch += 1
474 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
475 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
476 acc_dna_ptr+=1
477
478 ptr+=1
479
480 if estletter=='-':
481 dna_ptr+=1
482 elif dnaletter=='-':
483 est_ptr+=1
484 else:
485 dna_ptr+=1
486 est_ptr+=1
487 if num_mismatch>self.max_mismatch:
488 continue
489
490 assert(dna_ptr<=len(dna))
491 assert(est_ptr<=len(est))
492
493 #print original_est, original_est_cut
494
495 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
496 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
497
498 alternativeScores.append(score)
499
500 if acc_score>=self.splice_stop_thresh:
501 break
502
503 #pdb.set_trace()
504
505 _stop = cpu()
506 self.alternativeScoresTime += _stop-_start
507
508 # find an intron on the 5' end
509 _start = cpu()
510 for (acc_pos,acc_score) in proximal_acc:
511
512 AcceptorScore = calculatePlif(a, [acc_score])[0]
513
514 for (don_pos,don_score,don_dna) in distal_don:
515
516 DonorScore = calculatePlif(d, [don_score])[0]
517 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
518
519 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
520
521 # construct a new "original_est"
522 original_est_cut=''
523
524 est_ptr=0
525 dna_ptr=self.max_intron_size
526 ptr=0
527 num_mismatch = 0
528 don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
529 while ptr<len(original_est):
530
531 if original_est[ptr]=='[':
532 dnaletter=original_est[ptr+1]
533 estletter=original_est[ptr+2]
534 if dna_ptr > acc_pos:
535 original_est_cut+=original_est[ptr:ptr+4]
536 num_mismatch += 1
537 else:
538 if don_dna[don_dna_ptr]==estletter:
539 original_est_cut += estletter # EST letter
540 else:
541 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
542 num_mismatch += 1
543 #print '['+don_dna[don_dna_ptr]+estletter+']'
544 don_dna_ptr+=1
545 ptr+=4
546 else:
547 dnaletter=original_est[ptr]
548 estletter=dnaletter
549
550 if dna_ptr > acc_pos:
551 original_est_cut+=estletter # EST letter
552 else:
553 if don_dna[don_dna_ptr]==estletter:
554 original_est_cut += estletter # EST letter
555 else:
556 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
557 num_mismatch += 1
558 #print '('+don_dna[don_dna_ptr]+estletter+')'
559 don_dna_ptr+=1
560
561 ptr+=1
562
563 if estletter=='-':
564 dna_ptr+=1
565 elif dnaletter=='-':
566 est_ptr+=1
567 else:
568 dna_ptr+=1
569 est_ptr+=1
570
571 if num_mismatch>self.max_mismatch:
572 continue
573 assert(dna_ptr<=len(dna))
574 assert(est_ptr<=len(est))
575
576 #print original_est, original_est_cut
577
578 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
579 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
580
581 alternativeScores.append(score)
582
583 if don_score>=self.splice_stop_thresh:
584 break
585
586 _stop = cpu()
587 self.alternativeScoresTime += _stop-_start
588
589 return alternativeScores
590
591
592 def calcAlignmentScore(self,alignment):
593 """
594 Given an alignment (dna,exons,est) and the current parameter for QPalma
595 this function calculates the dot product of the feature representation of
596 the alignment and the parameter vector i.e the alignment score.
597 """
598
599 start = cpu()
600 run = self.run
601
602 # Lets start calculation
603 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
604
605 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
606
607 stop = cpu()
608 self.calcAlignmentScoreTime += stop-start
609
610 return score