+ cleaned up directories
[qpalma.git] / qpalma / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Gunnar R├Ątsch, Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 import cPickle
13 import sys
14 import pdb
15 import os
16 import os.path
17 import resource
18
19 from qpalma.computeSpliceWeights import *
20 from qpalma.set_param_palma import *
21 from qpalma.computeSpliceAlignWithQuality import *
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy import inf,mean
25
26 from ParaParser import ParaParser,IN_VECTOR
27
28 from qpalma.Lookup import LookupTable
29 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
30
31
32 def cpu():
33 return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
34 resource.getrusage(resource.RUSAGE_SELF).ru_stime)
35
36
37 class BracketWrapper:
38 fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
39
40 def __init__(self,filename):
41 self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
42 self.parser.parseFile(filename)
43
44 def __len__(self):
45 return self.parser.getSize(IN_VECTOR)
46
47 def __getitem__(self,key):
48 return self.parser.fetchEntry(key)
49
50 def __iter__(self):
51 self.counter = 0
52 self.size = self.parser.getSize(IN_VECTOR)
53 return self
54
55 def next(self):
56 if not self.counter < self.size:
57 raise StopIteration
58 return_val = self.parser.fetchEntry(self.counter)
59 self.counter += 1
60 return return_val
61
62
63 class PipelineHeuristic:
64 """
65 This class wraps the filter which decides whether an alignment found by
66 vmatch is spliced an should be then newly aligned using QPalma or not.
67 """
68
69 def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
70 """
71 We need a run object holding information about the nr. of support points
72 etc.
73 """
74
75 run = cPickle.load(open(run_fname))
76 self.run = run
77
78 start = cpu()
79 self.all_remapped_reads = BracketWrapper(data_fname)
80 stop = cpu()
81
82 print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
83
84 self.chromo_range = settings['allowed_fragments']
85
86 accessWrapper = DataAccessWrapper(settings)
87 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
88
89 start = cpu()
90 self.lt1 = LookupTable(settings)
91 stop = cpu()
92
93 print 'prefetched sequence and splice data in %f sec' % (stop-start)
94
95 self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
96 self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
97
98 start = cpu()
99
100 self.data_fname = data_fname
101
102 self.param = cPickle.load(open(param_fname))
103
104 # Set the parameters such as limits penalties for the Plifs
105 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
106
107 self.h = h
108 self.d = d
109 self.a = a
110 self.mmatrix = mmatrix
111 self.qualityPlifs = qualityPlifs
112
113 self.read_size = 38
114 self.prb_offset = 64
115 #self.prb_offset = 50
116
117 # parameters of the heuristics to decide whether the read is spliced
118 #self.splice_thresh = 0.005
119 self.splice_thresh = 0.01
120 self.max_intron_size = 2000
121 self.max_mismatch = 2
122 #self.splice_stop_thresh = 0.99
123 self.splice_stop_thresh = 0.8
124 self.spliced_bias = 0.0
125
126 param_lines = [\
127 ('%f','splice_thresh',self.splice_thresh),\
128 ('%d','max_intron_size',self.max_intron_size),\
129 ('%d','max_mismatch',self.max_mismatch),\
130 ('%f','splice_stop_thresh',self.splice_stop_thresh),\
131 ('%f','spliced_bias',self.spliced_bias)]
132
133 param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
134 param_lines.append('# data: %s\n'%self.data_fname)
135 param_lines.append('# param: %s\n'%param_fname)
136
137 #pdb.set_trace()
138
139 for p_line in param_lines:
140 self.result_spliced_fh.write(p_line)
141 self.result_unspliced_fh.write(p_line)
142
143 #self.original_reads = {}
144
145 # we do not have this information
146 #for line in open(reads_pipeline_fn):
147 # line = line.strip()
148 # id,seq,q1,q2,q3 = line.split()
149 # id = int(id)
150 # self.original_reads[id] = seq
151
152 lengthSP = run['numLengthSuppPoints']
153 donSP = run['numDonSuppPoints']
154 accSP = run['numAccSuppPoints']
155 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
156 numq = run['numQualSuppPoints']
157 totalQualSP = run['totalQualSuppPoints']
158
159 currentPhi = zeros((run['numFeatures'],1))
160 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
161 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
162 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
163 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
164
165 totalQualityPenalties = self.param[-totalQualSP:]
166 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
167 self.currentPhi = currentPhi
168
169 # we want to identify spliced reads
170 # so true pos are spliced reads that are predicted "spliced"
171 self.true_pos = 0
172
173 # as false positives we count all reads that are not spliced but predicted
174 # as "spliced"
175 self.false_pos = 0
176
177 self.true_neg = 0
178 self.false_neg = 0
179
180 # total time spend for get seq and scores
181 self.get_time = 0.0
182 self.calcAlignmentScoreTime = 0.0
183 self.alternativeScoresTime = 0.0
184
185 self.count_time = 0.0
186 self.main_loop = 0.0
187 self.splice_site_time = 0.0
188 self.computeSpliceAlignWithQualityTime = 0.0
189 self.computeSpliceWeightsTime = 0.0
190 self.DotProdTime = 0.0
191 self.array_stuff = 0.0
192 stop = cpu()
193
194 self.init_time = stop-start
195
196 def filter(self):
197 """
198 This method...
199 """
200 run = self.run
201
202 ctr = 0
203 unspliced_ctr = 0
204 spliced_ctr = 0
205
206 print 'Starting filtering...'
207 _start = cpu()
208
209 for location,original_line in self.all_remapped_reads:
210
211 if ctr % 1000 == 0:
212 print ctr
213
214 id = location['id']
215 chr = location['chr']
216 pos = location['pos']
217 strand = location['strand']
218 seq = location['seq']
219 prb = location['prb']
220
221 id = int(id)
222
223 seq = seq.lower()
224
225 strand_map = {'D':'+', 'P':'-'}
226
227 strand = strand_map[strand]
228
229 if not chr in self.chromo_range:
230 continue
231
232 unb_seq = unbracket_seq(seq)
233
234 # forgot to do this
235 if strand == '-':
236 unb_seq = reverse_complement(unb_seq)
237 seq = reverse_complement(seq)
238
239 effective_len = len(unb_seq)
240
241 genomicSeq_start = pos
242 #genomicSeq_stop = pos+effective_len-1
243 genomicSeq_stop = pos+effective_len
244
245 start = cpu()
246 #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
247 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
248
249 # just performed some tests to check LookupTable results
250 #assert currentDNASeq_ == currentDNASeq
251 #assert currentAcc_ == currentAcc_
252 #assert currentDon_ == currentDon_
253
254 stop = cpu()
255 self.get_time += stop-start
256
257 dna = currentDNASeq
258 exons = zeros((2,1))
259 exons[0,0] = 0
260 exons[1,0] = effective_len
261 est = unb_seq
262 original_est = seq
263 quality = map(lambda x:ord(x)-self.prb_offset,prb)
264
265 currentVMatchAlignment = dna, exons, est, original_est, quality,\
266 currentAcc, currentDon
267
268 #pdb.set_trace()
269
270 #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
271
272 try:
273 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
274 except:
275 alternativeAlignmentScores = []
276
277 if alternativeAlignmentScores == []:
278 # no alignment necessary
279 maxAlternativeAlignmentScore = -inf
280 vMatchScore = 0.0
281 else:
282 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
283 # compute alignment for vmatch unspliced read
284 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
285 #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
286
287
288 start = cpu()
289
290 #print 'all candidates %s' % str(alternativeAlignmentScores)
291
292 new_id = id - 1000000300000
293
294 unspliced = False
295 # unspliced
296 if new_id > 0:
297 unspliced = True
298
299 # Seems that according to our learned parameters VMatch found a good
300 # alignment of the current read
301 if maxAlternativeAlignmentScore < vMatchScore:
302 unspliced_ctr += 1
303
304 self.result_unspliced_fh.write(original_line+'\n')
305
306 if unspliced:
307 self.true_neg += 1
308 else:
309 self.false_neg += 1
310
311 # We found an alternative alignment considering splice sites that scores
312 # higher than the VMatch alignment
313 else:
314 spliced_ctr += 1
315
316 self.result_spliced_fh.write(original_line+'\n')
317
318 if unspliced:
319 self.false_pos += 1
320 else:
321 self.true_pos += 1
322
323 ctr += 1
324 stop = cpu()
325 self.count_time = stop-start
326
327 _stop = cpu()
328 self.main_loop = _stop-_start
329
330 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
331 print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
332 print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
333
334
335 def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
336
337 def signum(a):
338 if a>0:
339 return 1
340 elif a<0:
341 return -1
342 else:
343 return 0
344
345 proximal_acc = []
346 for idx in xrange(max_intron_size, max_intron_size+read_size/2):
347 if currentAcc[idx]>= splice_thresh:
348 proximal_acc.append((idx,currentAcc[idx]))
349
350 proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
351 proximal_acc=proximal_acc[-2:]
352
353 distal_acc = []
354 for idx in xrange(max_intron_size+read_size, len(currentAcc)):
355 if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
356 distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
357
358 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
359 #distal_acc=distal_acc[-2:]
360
361 proximal_don = []
362 for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
363 if currentDon[idx] >= splice_thresh:
364 proximal_don.append((idx, currentDon[idx]))
365
366 proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
367 proximal_don=proximal_don[-2:]
368
369 distal_don = []
370 for idx in xrange(1, max_intron_size):
371 if currentDon[idx] >= splice_thresh and idx>read_size:
372 distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
373
374 distal_don.sort(lambda x,y: y[0]-x[0])
375 #distal_don=distal_don[-2:]
376
377 return proximal_acc,proximal_don,distal_acc,distal_don
378
379
380 def calcAlternativeAlignments(self,location):
381 """
382 Given an alignment proposed by Vmatch this function calculates possible
383 alternative alignments taking into account for example matched
384 donor/acceptor positions.
385 """
386
387 run = self.run
388
389 id = location['id']
390 chr = location['chr']
391 pos = location['pos']
392 strand = location['strand']
393 original_est = location['seq']
394 quality = location['prb']
395 quality = map(lambda x:ord(x)-self.prb_offset,quality)
396
397 original_est = original_est.lower()
398 est = unbracket_seq(original_est)
399 effective_len = len(est)
400
401 genomicSeq_start = pos - self.max_intron_size
402 genomicSeq_stop = pos + self.max_intron_size + len(est)
403
404 strand_map = {'D':'+', 'P':'-'}
405 strand = strand_map[strand]
406
407 if strand == '-':
408 est = reverse_complement(est)
409 original_est = reverse_complement(original_est)
410
411 start = cpu()
412 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
413 currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
414 stop = cpu()
415 self.get_time += stop-start
416 dna = currentDNASeq
417
418 proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
419
420 alternativeScores = []
421
422 #pdb.set_trace()
423
424 # inlined
425 h = self.h
426 d = self.d
427 a = self.a
428 mmatrix = self.mmatrix
429 qualityPlifs = self.qualityPlifs
430 # inlined
431
432 # find an intron on the 3' end
433 _start = cpu()
434 for (don_pos,don_score) in proximal_don:
435 DonorScore = calculatePlif(d, [don_score])[0]
436
437 for (acc_pos,acc_score,acc_dna) in distal_acc:
438
439 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
440 AcceptorScore = calculatePlif(a, [acc_score])[0]
441
442 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
443
444 # construct a new "original_est"
445 original_est_cut=''
446
447 est_ptr=0
448 dna_ptr=self.max_intron_size
449 ptr=0
450 acc_dna_ptr=0
451 num_mismatch = 0
452
453 while ptr<len(original_est):
454 #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
455
456 if original_est[ptr]=='[':
457 dnaletter=original_est[ptr+1]
458 estletter=original_est[ptr+2]
459 if dna_ptr < don_pos:
460 original_est_cut+=original_est[ptr:ptr+4]
461 num_mismatch += 1
462 else:
463 if acc_dna[acc_dna_ptr]==estletter:
464 original_est_cut += estletter # EST letter
465 else:
466 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
467 num_mismatch += 1
468 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
469 acc_dna_ptr+=1
470 ptr+=4
471 else:
472 dnaletter=original_est[ptr]
473 estletter=dnaletter
474
475 if dna_ptr < don_pos:
476 original_est_cut+=estletter # EST letter
477 else:
478 if acc_dna[acc_dna_ptr]==estletter:
479 original_est_cut += estletter # EST letter
480 else:
481 num_mismatch += 1
482 original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
483 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
484 acc_dna_ptr+=1
485
486 ptr+=1
487
488 if estletter=='-':
489 dna_ptr+=1
490 elif dnaletter=='-':
491 est_ptr+=1
492 else:
493 dna_ptr+=1
494 est_ptr+=1
495 if num_mismatch>self.max_mismatch:
496 continue
497
498 assert(dna_ptr<=len(dna))
499 assert(est_ptr<=len(est))
500
501 #print original_est, original_est_cut
502
503 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
504 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
505
506 alternativeScores.append(score)
507
508 if acc_score>=self.splice_stop_thresh:
509 break
510
511 #pdb.set_trace()
512
513 _stop = cpu()
514 self.alternativeScoresTime += _stop-_start
515
516 # find an intron on the 5' end
517 _start = cpu()
518 for (acc_pos,acc_score) in proximal_acc:
519
520 AcceptorScore = calculatePlif(a, [acc_score])[0]
521
522 for (don_pos,don_score,don_dna) in distal_don:
523
524 DonorScore = calculatePlif(d, [don_score])[0]
525 IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
526
527 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
528
529 # construct a new "original_est"
530 original_est_cut=''
531
532 est_ptr=0
533 dna_ptr=self.max_intron_size
534 ptr=0
535 num_mismatch = 0
536 don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
537 while ptr<len(original_est):
538
539 if original_est[ptr]=='[':
540 dnaletter=original_est[ptr+1]
541 estletter=original_est[ptr+2]
542 if dna_ptr > acc_pos:
543 original_est_cut+=original_est[ptr:ptr+4]
544 num_mismatch += 1
545 else:
546 if don_dna[don_dna_ptr]==estletter:
547 original_est_cut += estletter # EST letter
548 else:
549 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
550 num_mismatch += 1
551 #print '['+don_dna[don_dna_ptr]+estletter+']'
552 don_dna_ptr+=1
553 ptr+=4
554 else:
555 dnaletter=original_est[ptr]
556 estletter=dnaletter
557
558 if dna_ptr > acc_pos:
559 original_est_cut+=estletter # EST letter
560 else:
561 if don_dna[don_dna_ptr]==estletter:
562 original_est_cut += estletter # EST letter
563 else:
564 original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
565 num_mismatch += 1
566 #print '('+don_dna[don_dna_ptr]+estletter+')'
567 don_dna_ptr+=1
568
569 ptr+=1
570
571 if estletter=='-':
572 dna_ptr+=1
573 elif dnaletter=='-':
574 est_ptr+=1
575 else:
576 dna_ptr+=1
577 est_ptr+=1
578
579 if num_mismatch>self.max_mismatch:
580 continue
581 assert(dna_ptr<=len(dna))
582 assert(est_ptr<=len(est))
583
584 #print original_est, original_est_cut
585
586 score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
587 score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
588
589 alternativeScores.append(score)
590
591 if don_score>=self.splice_stop_thresh:
592 break
593
594 _stop = cpu()
595 self.alternativeScoresTime += _stop-_start
596
597 return alternativeScores
598
599
600 def calcAlignmentScore(self,alignment):
601 """
602 Given an alignment (dna,exons,est) and the current parameter for QPalma
603 this function calculates the dot product of the feature representation of
604 the alignment and the parameter vector i.e the alignment score.
605 """
606
607 start = cpu()
608 run = self.run
609
610 # Lets start calculation
611 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
612
613 score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
614
615 stop = cpu()
616 self.calcAlignmentScoreTime += stop-start
617
618 return score