2 # -*- coding: utf-8 -*-
11 from qpalma
.computeSpliceWeights
import *
12 from qpalma
.set_param_palma
import *
13 from qpalma
.computeSpliceAlignWithQuality
import *
15 from numpy
.matlib
import mat
,zeros
,ones
,inf
16 from numpy
import inf
,mean
18 from ParaParser
import ParaParser
,IN_VECTOR
20 from qpalma
.Lookup
import LookupTable
21 from qpalma
.sequence_utils
import SeqSpliceInfo
,DataAccessWrapper
,reverse_complement
,unbracket_seq
25 return (resource
.getrusage(resource
.RUSAGE_SELF
).ru_utime
+\
26 resource
.getrusage(resource
.RUSAGE_SELF
).ru_stime
)
30 fields
= ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
32 def __init__(self
,filename
):
33 self
.parser
= ParaParser("%lu%d%d%s%s%s",self
.fields
,len(self
.fields
),IN_VECTOR
)
34 self
.parser
.parseFile(filename
)
37 return self
.parser
.getSize(IN_VECTOR
)
39 def __getitem__(self
,key
):
40 return self
.parser
.fetchEntry(key
)
44 self
.size
= self
.parser
.getSize(IN_VECTOR
)
48 if not self
.counter
< self
.size
:
50 return_val
= self
.parser
.fetchEntry(self
.counter
)
55 class PipelineHeuristic
:
57 This class wraps the filter which decides whether an alignment found by
58 vmatch is spliced an should be then newly aligned using QPalma or not.
61 def __init__(self
,run_fname
,data_fname
,param_fname
,result_fname
):
63 We need a run object holding information about the nr. of support points
67 run
= cPickle
.load(open(run_fname
))
71 self
.all_remapped_reads
= BracketWrapper(data_fname
)
74 print 'parsed %d reads in %f sec' % (len(self
.all_remapped_reads
),stop
-start
)
76 #g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
77 #acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
78 #don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
80 #g_fmt = 'contig%d.dna.flat'
81 #s_fmt = 'contig_%d%s'
83 #self.chromo_range = range(0,1099)
85 g_dir
= '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
86 acc_dir
= '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
87 don_dir
= '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
89 g_fmt
= 'chr%d.dna.flat'
92 self
.chromo_range
= range(1,6)
93 #self.chromo_range = range(1,2)
96 accessWrapper
= DataAccessWrapper(g_dir
,acc_dir
,don_dir
,g_fmt
,s_fmt
)
97 self
.seqInfo
= SeqSpliceInfo(accessWrapper
,range(1,num_chromo
))
100 self
.lt1
= LookupTable(g_dir
,acc_dir
,don_dir
,g_fmt
,s_fmt
,self
.chromo_range
)
103 print 'prefetched sequence and splice data in %f sec' % (stop
-start
)
105 self
.result_spliced_fh
= open('%s.spliced'%result
_fname
,'w+')
106 self
.result_unspliced_fh
= open('%s.unspliced'%result
_fname
,'w+')
110 self
.data_fname
= data_fname
112 self
.param
= cPickle
.load(open(param_fname
))
114 # Set the parameters such as limits penalties for the Plifs
115 [h
,d
,a
,mmatrix
,qualityPlifs
] = set_param_palma(self
.param
,True,run
)
120 self
.mmatrix
= mmatrix
121 self
.qualityPlifs
= qualityPlifs
125 #self.prb_offset = 50
127 # parameters of the heuristics to decide whether the read is spliced
128 #self.splice_thresh = 0.005
129 self
.splice_thresh
= 0.01
130 self
.max_intron_size
= 2000
131 self
.max_mismatch
= 2
132 #self.splice_stop_thresh = 0.99
133 self
.splice_stop_thresh
= 0.8
134 self
.spliced_bias
= 0.0
137 ('%f','splice_thresh',self
.splice_thresh
),\
138 ('%d','max_intron_size',self
.max_intron_size
),\
139 ('%d','max_mismatch',self
.max_mismatch
),\
140 ('%f','splice_stop_thresh',self
.splice_stop_thresh
),\
141 ('%f','spliced_bias',self
.spliced_bias
)]
143 param_lines
= [('# %s: '+form
+'\n')%(name
,val
) for form
,name
,val
in param_lines
]
144 param_lines
.append('# data: %s\n'%self
.data_fname
)
145 param_lines
.append('# param: %s\n'%param_fname
)
149 for p_line
in param_lines
:
150 self
.result_spliced_fh
.write(p_line
)
151 self
.result_unspliced_fh
.write(p_line
)
153 #self.original_reads = {}
155 # we do not have this information
156 #for line in open(reads_pipeline_fn):
157 # line = line.strip()
158 # id,seq,q1,q2,q3 = line.split()
160 # self.original_reads[id] = seq
162 lengthSP
= run
['numLengthSuppPoints']
163 donSP
= run
['numDonSuppPoints']
164 accSP
= run
['numAccSuppPoints']
165 mmatrixSP
= run
['matchmatrixRows']*run
['matchmatrixCols']
166 numq
= run
['numQualSuppPoints']
167 totalQualSP
= run
['totalQualSuppPoints']
169 currentPhi
= zeros((run
['numFeatures'],1))
170 currentPhi
[0:lengthSP
] = mat(h
.penalties
[:]).reshape(lengthSP
,1)
171 currentPhi
[lengthSP
:lengthSP
+donSP
] = mat(d
.penalties
[:]).reshape(donSP
,1)
172 currentPhi
[lengthSP
+donSP
:lengthSP
+donSP
+accSP
] = mat(a
.penalties
[:]).reshape(accSP
,1)
173 currentPhi
[lengthSP
+donSP
+accSP
:lengthSP
+donSP
+accSP
+mmatrixSP
] = mmatrix
[:]
175 totalQualityPenalties
= self
.param
[-totalQualSP
:]
176 currentPhi
[lengthSP
+donSP
+accSP
+mmatrixSP
:] = totalQualityPenalties
[:]
177 self
.currentPhi
= currentPhi
179 # we want to identify spliced reads
180 # so true pos are spliced reads that are predicted "spliced"
183 # as false positives we count all reads that are not spliced but predicted
190 # total time spend for get seq and scores
192 self
.calcAlignmentScoreTime
= 0.0
193 self
.alternativeScoresTime
= 0.0
195 self
.count_time
= 0.0
197 self
.splice_site_time
= 0.0
198 self
.computeSpliceAlignWithQualityTime
= 0.0
199 self
.computeSpliceWeightsTime
= 0.0
200 self
.DotProdTime
= 0.0
201 self
.array_stuff
= 0.0
204 self
.init_time
= stop
-start
216 print 'Starting filtering...'
219 #for readId,currentReadLocations in all_remapped_reads.items():
220 #for location in currentReadLocations[:1]:
222 for location
,original_line
in self
.all_remapped_reads
:
228 chr = location
['chr']
229 pos
= location
['pos']
230 strand
= location
['strand']
231 seq
= location
['seq']
232 prb
= location
['prb']
238 strand_map
= {'D':'+', 'P':'-'}
240 strand
= strand_map
[strand
]
242 if not chr in self
.chromo_range
:
245 unb_seq
= unbracket_seq(seq
)
249 #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
250 unb_seq
= reverse_complement(unb_seq
)
251 seq
= reverse_complement(seq
)
253 effective_len
= len(unb_seq
)
255 genomicSeq_start
= pos
256 #genomicSeq_stop = pos+effective_len-1
257 genomicSeq_stop
= pos
+effective_len
260 #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
261 currentDNASeq
, currentAcc
, currentDon
= self
.lt1
.get_seq_and_scores(chr,strand
,genomicSeq_start
,genomicSeq_stop
)
263 # just performed some tests to check LookupTable results
264 #assert currentDNASeq_ == currentDNASeq
265 #assert currentAcc_ == currentAcc_
266 #assert currentDon_ == currentDon_
269 self
.get_time
+= stop
-start
274 exons
[1,0] = effective_len
277 quality
= map(lambda x
:ord(x
)-self
.prb_offset
,prb
)
279 currentVMatchAlignment
= dna
, exons
, est
, original_est
, quality
,\
280 currentAcc
, currentDon
284 #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
287 alternativeAlignmentScores
= self
.calcAlternativeAlignments(location
)
289 alternativeAlignmentScores
= []
291 if alternativeAlignmentScores
== []:
292 # no alignment necessary
293 maxAlternativeAlignmentScore
= -inf
296 maxAlternativeAlignmentScore
= max(alternativeAlignmentScores
)
297 # compute alignment for vmatch unspliced read
298 vMatchScore
= self
.calcAlignmentScore(currentVMatchAlignment
)
299 #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
304 #print 'all candidates %s' % str(alternativeAlignmentScores)
306 new_id
= id - 1000000300000
313 # Seems that according to our learned parameters VMatch found a good
314 # alignment of the current read
315 if maxAlternativeAlignmentScore
< vMatchScore
:
318 self
.result_unspliced_fh
.write(original_line
+'\n')
325 # We found an alternative alignment considering splice sites that scores
326 # higher than the VMatch alignment
330 self
.result_spliced_fh
.write(original_line
+'\n')
339 self
.count_time
= stop
-start
342 self
.main_loop
= _stop
-_start
344 print 'Unspliced/Splice: %d %d'%(unspliced_ctr
,spliced_ctr
)
345 print 'True pos / false pos : %d %d'%(self
.true_pos
,self
.false_pos
)
346 print 'True neg / false neg : %d %d'%(self
.true_neg
,self
.false_neg
)
349 def findHighestScoringSpliceSites(self
, currentAcc
, currentDon
, DNA
, max_intron_size
, read_size
, splice_thresh
):
360 for idx
in xrange(max_intron_size
, max_intron_size
+read_size
/2):
361 if currentAcc
[idx
]>= splice_thresh
:
362 proximal_acc
.append((idx
,currentAcc
[idx
]))
364 proximal_acc
.sort(lambda x
,y
: signum(x
[1]-y
[1]))
365 proximal_acc
=proximal_acc
[-2:]
368 for idx
in xrange(max_intron_size
+read_size
, len(currentAcc
)):
369 if currentAcc
[idx
]>= splice_thresh
and idx
+read_size
<len(currentAcc
):
370 distal_acc
.append((idx
, currentAcc
[idx
], DNA
[idx
+1:idx
+read_size
]))
372 #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
373 #distal_acc=distal_acc[-2:]
376 for idx
in xrange(max_intron_size
+read_size
/2, max_intron_size
+read_size
):
377 if currentDon
[idx
] >= splice_thresh
:
378 proximal_don
.append((idx
, currentDon
[idx
]))
380 proximal_don
.sort(lambda x
,y
: signum(x
[1]-y
[1]))
381 proximal_don
=proximal_don
[-2:]
384 for idx
in xrange(1, max_intron_size
):
385 if currentDon
[idx
] >= splice_thresh
and idx
>read_size
:
386 distal_don
.append((idx
, currentDon
[idx
], DNA
[idx
-read_size
:idx
]))
388 distal_don
.sort(lambda x
,y
: y
[0]-x
[0])
389 #distal_don=distal_don[-2:]
391 return proximal_acc
,proximal_don
,distal_acc
,distal_don
394 def calcAlternativeAlignments(self
,location
):
396 Given an alignment proposed by Vmatch this function calculates possible
397 alternative alignments taking into account for example matched
398 donor/acceptor positions.
404 chr = location
['chr']
405 pos
= location
['pos']
406 strand
= location
['strand']
407 original_est
= location
['seq']
408 quality
= location
['prb']
409 quality
= map(lambda x
:ord(x
)-self
.prb_offset
,quality
)
411 original_est
= original_est
.lower()
412 est
= unbracket_seq(original_est
)
413 effective_len
= len(est
)
415 genomicSeq_start
= pos
- self
.max_intron_size
416 genomicSeq_stop
= pos
+ self
.max_intron_size
+ len(est
)
418 strand_map
= {'D':'+', 'P':'-'}
419 strand
= strand_map
[strand
]
422 est
= reverse_complement(est
)
423 original_est
= reverse_complement(original_est
)
426 #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
427 currentDNASeq
, currentAcc
, currentDon
= self
.lt1
.get_seq_and_scores(chr, strand
, genomicSeq_start
, genomicSeq_stop
)
429 self
.get_time
+= stop
-start
432 proximal_acc
,proximal_don
,distal_acc
,distal_don
= self
.findHighestScoringSpliceSites(currentAcc
, currentDon
, dna
, self
.max_intron_size
, len(est
), self
.splice_thresh
)
434 alternativeScores
= []
442 mmatrix
= self
.mmatrix
443 qualityPlifs
= self
.qualityPlifs
446 # find an intron on the 3' end
448 for (don_pos
,don_score
) in proximal_don
:
449 DonorScore
= calculatePlif(d
, [don_score
])[0]
451 for (acc_pos
,acc_score
,acc_dna
) in distal_acc
:
453 IntronScore
= calculatePlif(h
, [acc_pos
-don_pos
])[0]
454 AcceptorScore
= calculatePlif(a
, [acc_score
])[0]
456 #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
458 # construct a new "original_est"
462 dna_ptr
=self
.max_intron_size
467 while ptr
<len(original_est
):
468 #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
470 if original_est
[ptr
]=='[':
471 dnaletter
=original_est
[ptr
+1]
472 estletter
=original_est
[ptr
+2]
473 if dna_ptr
< don_pos
:
474 original_est_cut
+=original_est
[ptr
:ptr
+4]
477 if acc_dna
[acc_dna_ptr
]==estletter
:
478 original_est_cut
+= estletter
# EST letter
480 original_est_cut
+= '['+acc_dna
[acc_dna_ptr
]+estletter
+']' # EST letter
482 #print '['+acc_dna[acc_dna_ptr]+estletter+']'
486 dnaletter
=original_est
[ptr
]
489 if dna_ptr
< don_pos
:
490 original_est_cut
+=estletter
# EST letter
492 if acc_dna
[acc_dna_ptr
]==estletter
:
493 original_est_cut
+= estletter
# EST letter
496 original_est_cut
+= '['+acc_dna
[acc_dna_ptr
]+estletter
+']' # EST letter
497 #print '('+acc_dna[acc_dna_ptr]+estletter+')'
509 if num_mismatch
>self
.max_mismatch
:
512 assert(dna_ptr
<=len(dna
))
513 assert(est_ptr
<=len(est
))
515 #print original_est, original_est_cut
517 score
= computeSpliceAlignScoreWithQuality(original_est_cut
, quality
, qualityPlifs
, run
, self
.currentPhi
)
518 score
+= AcceptorScore
+ IntronScore
+ DonorScore
+ self
.spliced_bias
520 alternativeScores
.append(score
)
522 if acc_score
>=self
.splice_stop_thresh
:
528 self
.alternativeScoresTime
+= _stop
-_start
530 # find an intron on the 5' end
532 for (acc_pos
,acc_score
) in proximal_acc
:
534 AcceptorScore
= calculatePlif(a
, [acc_score
])[0]
536 for (don_pos
,don_score
,don_dna
) in distal_don
:
538 DonorScore
= calculatePlif(d
, [don_score
])[0]
539 IntronScore
= calculatePlif(h
, [acc_pos
-don_pos
])[0]
541 #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
543 # construct a new "original_est"
547 dna_ptr
=self
.max_intron_size
550 don_dna_ptr
=len(don_dna
)-(acc_pos
-self
.max_intron_size
)-1
551 while ptr
<len(original_est
):
553 if original_est
[ptr
]=='[':
554 dnaletter
=original_est
[ptr
+1]
555 estletter
=original_est
[ptr
+2]
556 if dna_ptr
> acc_pos
:
557 original_est_cut
+=original_est
[ptr
:ptr
+4]
560 if don_dna
[don_dna_ptr
]==estletter
:
561 original_est_cut
+= estletter
# EST letter
563 original_est_cut
+= '['+don_dna
[don_dna_ptr
]+estletter
+']' # EST letter
565 #print '['+don_dna[don_dna_ptr]+estletter+']'
569 dnaletter
=original_est
[ptr
]
572 if dna_ptr
> acc_pos
:
573 original_est_cut
+=estletter
# EST letter
575 if don_dna
[don_dna_ptr
]==estletter
:
576 original_est_cut
+= estletter
# EST letter
578 original_est_cut
+= '['+don_dna
[don_dna_ptr
]+estletter
+']' # EST letter
580 #print '('+don_dna[don_dna_ptr]+estletter+')'
593 if num_mismatch
>self
.max_mismatch
:
595 assert(dna_ptr
<=len(dna
))
596 assert(est_ptr
<=len(est
))
598 #print original_est, original_est_cut
600 score
= computeSpliceAlignScoreWithQuality(original_est_cut
, quality
, qualityPlifs
, run
, self
.currentPhi
)
601 score
+= AcceptorScore
+ IntronScore
+ DonorScore
+ self
.spliced_bias
603 alternativeScores
.append(score
)
605 if don_score
>=self
.splice_stop_thresh
:
609 self
.alternativeScoresTime
+= _stop
-_start
611 return alternativeScores
614 def calcAlignmentScore(self
,alignment
):
616 Given an alignment (dna,exons,est) and the current parameter for QPalma
617 this function calculates the dot product of the feature representation of
618 the alignment and the parameter vector i.e the alignment score.
624 # Lets start calculation
625 dna
, exons
, est
, original_est
, quality
, acc_supp
, don_supp
= alignment
627 score
= computeSpliceAlignScoreWithQuality(original_est
, quality
, self
.qualityPlifs
, run
, self
.currentPhi
)
630 self
.calcAlignmentScoreTime
+= stop
-start