+ refined creation of alternative alignments
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12 from qpalma.DataProc import *
13 from qpalma.computeSpliceWeights import *
14 from qpalma.set_param_palma import *
15 from qpalma.computeSpliceAlignWithQuality import *
16 from qpalma.penalty_lookup_new import *
17 from qpalma.compute_donacc import *
18 from qpalma.TrainingParam import Param
19 from qpalma.Plif import Plf
20
21 from qpalma.tools.splicesites import getDonAccScores
22 from qpalma.Configuration import *
23
24 from compile_dataset import getSpliceScores, get_seq_and_scores
25
26
27 from numpy.matlib import mat,zeros,ones,inf
28 from numpy import inf
29
30 from qpalma.parsers import PipelineReadParser
31
32
33 def unbracket_est(est):
34 new_est = ''
35 e = 0
36
37 while True:
38 if e >= len(est):
39 break
40
41 if est[e] == '[':
42 new_est += est[e+2]
43 e += 4
44 else:
45 new_est += est[e]
46 e += 1
47
48 return "".join(new_est).lower()
49
50
51 class PipelineHeuristic:
52 """
53 This class wraps the filter which decides whether an alignment found by
54 vmatch is spliced an should be then newly aligned using QPalma or not.
55 """
56
57 def __init__(self,run_fname,data_fname,param_fname):
58 """
59 We need a run object holding information about the nr. of support points
60 etc.
61 """
62
63 run = cPickle.load(open(run_fname))
64 self.run = run
65
66 self.data_fname = data_fname
67
68 self.param = cPickle.load(open(param_fname))
69
70 # Set the parameters such as limits penalties for the Plifs
71 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
72
73 self.h = h
74 self.d = d
75 self.a = a
76 self.mmatrix = mmatrix
77 self.qualityPlifs = qualityPlifs
78
79 # when we look for alternative alignments with introns this value is the
80 # mean of intron size
81 self.intron_size = 90
82
83 self.read_size = 36
84
85
86 def filter(self):
87 """
88 This method...
89 """
90 run = self.run
91
92 rrp = PipelineReadParser(self.data_fname)
93 all_remapped_reads = rrp.parse()
94
95 ctr = 0
96 unspliced_ctr = 0
97 spliced_ctr = 0
98
99 print 'Starting filtering...'
100
101 for readId,currentReadLocations in all_remapped_reads.items():
102 for location in currentReadLocations:
103 chr = location['chr']
104 pos = location['pos']
105 strand = location['strand']
106 mismatch = location['mismatches']
107 length = location['length']
108 off = location['offset']
109 seq = location['seq']
110 prb = location['prb']
111 cal_prb = location['cal_prb']
112 chastity = location['chastity']
113 is_spliced = location['is_spliced']
114
115 if strand == '-':
116 continue
117
118 if ctr == 100:
119 break
120
121 #if pos > 10000000:
122 # continue
123
124 unb_seq = unbracket_est(seq)
125 effective_len = len(unb_seq)
126
127 genomicSeq_start = pos
128 genomicSeq_stop = pos+effective_len-1
129
130 #print genomicSeq_start,genomicSeq_stop
131 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
132
133 dna = currentDNASeq
134 exons = zeros((2,1))
135 exons[0,0] = 0
136 exons[1,0] = effective_len
137 est = unb_seq
138 original_est = seq
139 quality = prb
140
141 #pdb.set_trace()
142
143 currentVMatchAlignment = dna, exons, est, original_est, quality,\
144 currentAcc, currentDon
145 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
146
147 alternativeAlignmentScores = self.calcAlternativeAlignments(location)
148
149 # found no alternatives
150 if alternativeAlignmentScores == []:
151 continue
152
153 maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
154 #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
155 #print 'all candidates %s' % str(alternativeAlignmentScores)
156
157 # Seems that according to our learned parameters VMatch found a good
158 # alignment of the current read
159 if maxAlternativeAlignmentScore < vMatchScore:
160 unspliced_ctr += 1
161 # We found an alternative alignment considering splice sites that scores
162 # higher than the VMatch alignment
163 else:
164 spliced_ctr += 1
165
166 ctr += 1
167
168 print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
169
170
171 def findHighestScoringSpliceSites(self,currentAcc,currentDon):
172 max_don = -inf
173 don_pos = []
174 for idx,score in enumerate(currentDon):
175 if score > -inf and idx > 1 and idx < self.read_size:
176 don_pos.append(idx)
177
178 if len(don_pos) == 2:
179 break
180
181 max_acc = -inf
182 acc_pos = []
183 for idx,score in enumerate(currentAcc):
184 if score > -inf and idx > self.intron_size:
185 acc_pos = idx
186 break
187
188 return don_pos,acc_pos
189
190
191 def calcAlternativeAlignments(self,location):
192 """
193 Given an alignment proposed by Vmatch this function calculates possible
194 alternative alignments taking into account for example matched
195 donor/acceptor positions.
196 """
197 run = self.run
198
199 chr = location['chr']
200 pos = location['pos']
201 strand = location['strand']
202 seq = location['seq']
203 orig_seq = location['orig_seq']
204 prb = location['prb']
205 cal_prb = location['cal_prb']
206 is_spliced = location['is_spliced']
207
208 unb_seq = unbracket_est(seq)
209 effective_len = len(unb_seq)
210
211 genomicSeq_start = pos
212 genomicSeq_stop = pos+self.intron_size*2+self.read_size*2
213
214 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
215 dna = currentDNASeq
216
217 alt_don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
218
219 #print alt_don_pos,acc_pos
220
221 alternativeScores = []
222
223 for don_pos in alt_don_pos:
224 exons = zeros((2,2),dtype=numpy.int)
225 exons[0,0] = 0
226 exons[0,1] = don_pos
227 exons[1,0] = acc_pos+1
228 exons[1,1] = acc_pos+1+(self.read_size-don_pos)
229 est = unb_seq
230 original_est = seq
231 quality = prb
232
233 _dna = dna[:int(exons[1,1])]
234 _dna = _dna[:exons[1,0]] + orig_seq[don_pos:]
235
236 #pdb.set_trace()
237
238 _currentAcc = currentAcc[:int(exons[1,1])]
239 _currentDon = currentDon[:int(exons[1,1])]
240
241 currentVMatchAlignment = _dna, exons, est, original_est, quality,\
242 _currentAcc, _currentDon
243
244 #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
245 alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
246
247 return alternativeScores
248
249
250 def calcAlignmentScore(self,alignment):
251 """
252 Given an alignment (dna,exons,est) and the current parameter for QPalma
253 this function calculates the dot product of the feature representation of
254 the alignment and the parameter vector i.e the alignment score.
255 """
256 run = self.run
257
258 h = self.h
259 d = self.d
260 a = self.a
261 mmatrix = self.mmatrix
262 qualityPlifs = self.qualityPlifs
263
264 lengthSP = run['numLengthSuppPoints']
265 donSP = run['numDonSuppPoints']
266 accSP = run['numAccSuppPoints']
267 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
268 numq = run['numQualSuppPoints']
269 totalQualSP = run['totalQualSuppPoints']
270
271 currentPhi = zeros((run['numFeatures'],1))
272
273 # Lets start calculation
274 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
275
276 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
277 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
278 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
279 quality, qualityPlifs, run)
280
281 # Calculate the weights
282 trueWeightDon, trueWeightAcc, trueWeightIntron =\
283 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
284
285 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
286
287 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
288 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
289 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
290 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
291
292 totalQualityPenalties = self.param[-totalQualSP:]
293 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
294
295 # Calculate w'phi(x,y) the total score of the alignment
296 return (trueWeight.T * currentPhi)[0,0]
297
298
299 if __name__ == '__main__':
300 #run_fname = sys.argv[1]
301 #data_fname = sys.argv[2]
302 #param_filename = sys.argv[3]
303
304 dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
305 jp = os.path.join
306
307 run_fname = jp(dir,'run_object.pickle')
308 #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
309 data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_spliced_flag'
310
311 param_fname = jp(dir,'param_500.pickle')
312
313 ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
314 ph1.filter()