+ finished first running version of the spliced/unspliced heuristic
[qpalma.git] / scripts / PipelineHeuristic.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pydb
7 import pdb
8 import os
9 import os.path
10 import math
11
12 from qpalma.DataProc import *
13 from qpalma.computeSpliceWeights import *
14 from qpalma.set_param_palma import *
15 from qpalma.computeSpliceAlignWithQuality import *
16 from qpalma.penalty_lookup_new import *
17 from qpalma.compute_donacc import *
18 from qpalma.TrainingParam import Param
19 from qpalma.Plif import Plf
20
21 from qpalma.tools.splicesites import getDonAccScores
22 from qpalma.Configuration import *
23
24 from compile_dataset import getSpliceScores, get_seq_and_scores
25
26
27 from numpy.matlib import mat,zeros,ones,inf
28 from numpy import inf
29
30 from qpalma.parsers import PipelineReadParser
31
32
33 def unbracket_est(est):
34 new_est = ''
35 e = 0
36
37 while True:
38 if e >= len(est):
39 break
40
41 if est[e] == '[':
42 new_est += est[e+2]
43 e += 4
44 else:
45 new_est += est[e]
46 e += 1
47
48 return "".join(new_est).lower()
49
50
51 class PipelineHeuristic:
52 """
53 This class wraps the filter which decides whether an alignment found by
54 vmatch is spliced an should be then newly aligned using QPalma or not.
55 """
56
57 def __init__(self,run_fname,data_fname,param_fname):
58 """
59 We need a run object holding information about the nr. of support points
60 etc.
61 """
62
63 run = cPickle.load(open(run_fname))
64 self.run = run
65
66 self.data_fname = data_fname
67
68 self.param = cPickle.load(open(param_fname))
69
70 # Set the parameters such as limits penalties for the Plifs
71 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
72
73 self.h = h
74 self.d = d
75 self.a = a
76 self.mmatrix = mmatrix
77 self.qualityPlifs = qualityPlifs
78
79 # when we look for alternative alignments with introns this value is the
80 # mean of intron size
81 self.intron_size = 250
82
83 self.read_size = 36
84
85
86 def filter(self):
87 """
88 This method...
89 """
90 run = self.run
91
92 rrp = PipelineReadParser(self.data_fname)
93 all_remapped_reads = rrp.parse()
94
95
96 for readId,currentReadLocations in all_remapped_reads.items():
97 for location in currentReadLocations:
98 chr = location['chr']
99 pos = location['pos']
100 strand = location['strand']
101 mismatch = location['mismatches']
102 length = location['length']
103 off = location['offset']
104 seq = location['seq']
105 prb = location['prb']
106 cal_prb = location['cal_prb']
107 chastity = location['chastity']
108 is_spliced = location['is_spliced']
109
110 unb_seq = unbracket_est(seq)
111 effective_len = len(unb_seq)
112
113 genomicSeq_start = pos
114 genomicSeq_stop = pos+effective_len-1
115
116 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
117
118 dna = currentDNASeq
119 exons = zeros((2,1))
120 exons[0,0] = 0
121 exons[1,0] = effective_len
122 est = unb_seq
123 original_est = seq
124 quality = prb
125
126 pdb.set_trace()
127
128 currentVMatchAlignment = dna, exons, est, original_est, quality,\
129 currentAcc, currentDon
130 vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
131
132 print 'vMatchScore is %f' % vMatchScore
133
134
135 alternativeAlignmentScore = self.calcAlternativeAlignments(location)
136
137 print 'alternative score is %f' % alternativeAlignmentScore
138
139 """
140 maxScore = 0.0
141 maxAlignment = None
142 for currentAlternative in alternativeAlignments:
143 if currentScore > maxScore:
144 maxScore = alternativeScores
145 maxAlignment = currentAlternative
146
147 currentScore = calcAlignmentScore(currentAlternative)
148
149 # Seems that according to our learned parameters VMatch found a good
150 # alignment of the current read
151 if maxScore < vMatchScore:
152 pass
153 # We found an alternative alignment considering splice sites that scores
154 # higher than the VMatch alignment
155 else:
156 pass
157 """
158
159 def findHighestScoringSpliceSites(self,currentAcc,currentDon):
160 pdb.set_trace()
161 max_don = -inf
162 don_pos = 0
163 for idx,score in enumerate(currentDon):
164 if score > -inf and idx < (len(currentDon)*0.5):
165 don_pos = idx
166 break
167
168 max_acc = -inf
169 acc_pos = 0
170 for idx,score in enumerate(currentAcc):
171 if score > -inf and idx > don_pos:
172 acc_pos = idx
173 break
174
175 return don_pos,acc_pos
176
177
178 def calcAlternativeAlignments(self,location):
179 """
180 Given an alignment proposed by Vmatch this function calculates possible
181 alternative alignments taking into account for example matched
182 donor/acceptor positions.
183 """
184 run = self.run
185
186 chr = location['chr']
187 pos = location['pos']
188 strand = location['strand']
189 seq = location['seq']
190 prb = location['prb']
191 cal_prb = location['cal_prb']
192 is_spliced = location['is_spliced']
193
194 unb_seq = unbracket_est(seq)
195 effective_len = len(unb_seq)
196
197 genomicSeq_start = pos
198 genomicSeq_stop = pos+self.intron_size+self.read_size*2
199
200 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
201
202 don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
203
204 print don_pos,acc_pos
205
206 """
207 donor_elem = dna[exons[0,1]:exons[0,1]+2]
208 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
209 """
210
211 dna = currentDNASeq
212 exons = zeros((2,2),dtype=numpy.int)
213 exons[0,0] = 0
214 exons[0,1] = don_pos
215 exons[1,0] = acc_pos+1
216 exons[1,1] = acc_pos+1+(self.read_size-don_pos)
217 est = unb_seq
218 original_est = seq
219 quality = prb
220
221 dna = dna[:int(exons[1,1])]
222 currentAcc = currentAcc[:int(exons[1,1])]
223 currentDon = currentDon[:int(exons[1,1])]
224
225 currentVMatchAlignment = dna, exons, est, original_est, quality,\
226 currentAcc, currentDon
227
228 alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
229
230 return alternativeScore
231
232
233 def calcAlignmentScore(self,alignment):
234 """
235 Given an alignment (dna,exons,est) and the current parameter for QPalma
236 this function calculates the dot product of the feature representation of
237 the alignment and the parameter vector i.e the alignment score.
238 """
239 run = self.run
240
241 h = self.h
242 d = self.d
243 a = self.a
244 mmatrix = self.mmatrix
245 qualityPlifs = self.qualityPlifs
246
247 lengthSP = run['numLengthSuppPoints']
248 donSP = run['numDonSuppPoints']
249 accSP = run['numAccSuppPoints']
250 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
251 numq = run['numQualSuppPoints']
252 totalQualSP = run['totalQualSuppPoints']
253
254 currentPhi = zeros((run['numFeatures'],1))
255
256 # Lets start calculation
257 dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
258
259 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
260 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
261 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
262 quality, qualityPlifs, run)
263
264 # Calculate the weights
265 trueWeightDon, trueWeightAcc, trueWeightIntron =\
266 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
267
268 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
269
270 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
271 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
272 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
273 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
274
275 totalQualityPenalties = self.param[-totalQualSP:]
276 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
277
278 # Calculate w'phi(x,y) the total score of the alignment
279 return (trueWeight.T * currentPhi)[0,0]
280
281
282 if __name__ == '__main__':
283 #run_fname = sys.argv[1]
284 #data_fname = sys.argv[2]
285 #param_filename = sys.argv[3]
286
287 dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
288 jp = os.path.join
289
290 run_fname = jp(dir,'run_object.pickle')
291 data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
292
293 param_fname = jp(dir,'param_500.pickle')
294
295 ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
296 ph1.filter()