+ got rid of some legacy code
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11 import pdb
12 from Plif import Plf,base_coord,linspace
13
14 import sequence_utils
15
16 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
17 """
18 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
19 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
20 (ausser dem letzten und ersten)
21 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
22 bsp:
23 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
24 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
25 """
26
27 SpliceAlign = []
28
29 if exons.shape == (2,2):
30 numberOfExons = 2
31 exonSizes = [-1]*numberOfExons
32
33 exonSizes[0] = exons[0,1] - exons[0,0]
34 exonSizes[1] = exons[1,1] - exons[1,0]
35
36 # SpliceAlign vector describes alignment:
37 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
38
39 if exons[0,0] > 0:
40 SpliceAlign.extend([4]*(exons[0,0]))
41
42 for idx in range(numberOfExons):
43 exonLength = exonSizes[idx]
44 SpliceAlign.extend([0]*exonLength)
45
46 if idx < numberOfExons-1:
47 intronLength = exons[idx+1,0] - exons[idx,1]
48 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
49
50 if len(dna) > exons[-1,1]:
51 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
52
53
54 elif exons.shape == (2,1):
55 exonSize = exons[1,0] - exons[0,0]
56
57 if exons[0,0] > 0:
58 SpliceAlign.extend([4]*(exons[0,0]))
59
60 SpliceAlign.extend([0]*(exonSize))
61
62 if len(dna) > exons[1,0]:
63 SpliceAlign.extend([4]*(len(dna)-exons[1,0]))
64
65 else:
66 pass
67
68 assert len(SpliceAlign) == len(dna), pdb.set_trace()
69
70 #
71 # start of label feature generation
72 #
73
74 sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
75 # counts the occurences of a,c,g,t,n in this order
76
77 # counts the occurrences of a,c,g,t,n with their quality scores
78 trueWeightQualityMat = [0.0]*5 # 5 rows
79 for row in range(5):
80 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
81
82 for row in range(5):
83 for col in range(6):
84 trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
85
86 dna_part = []
87 est_part = []
88
89 orig_pos = 0
90 while True:
91 if not orig_pos < len(original_est):
92 break
93
94 orig_char = original_est[orig_pos]
95
96 if orig_char == '[':
97 dna_part.append( original_est[orig_pos+1] )
98 est_part.append( original_est[orig_pos+2] )
99 orig_pos += 4
100 else:
101 dna_part.append( orig_char )
102 est_part.append( orig_char )
103 orig_pos += 1
104
105 for orig_char in dna_part:
106 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
107
108 for orig_char in est_part:
109 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
110
111 trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
112
113 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
114
115 ctr = 0
116 quality_part = []
117 for elem in est_part:
118 if elem == '-':
119 quality_part.append(-inf)
120 else:
121 quality_part.append(quality[ctr])
122 ctr += 1
123
124 assert len(est_part) == len(quality_part)
125
126 for idx in range(len(dna_part)):
127 dnanum = map[dna_part[idx]]
128 estnum = map[est_part[idx]]
129 value = quality_part[idx]
130
131 if estnum == 0:
132 trueWeightMatch[dnanum] += 1.0
133 assert value == -inf
134
135 else:
136 assert value != -inf
137 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
138
139 Lower = len([elem for elem in cur_plf.limits if elem <= value])
140
141 if Lower == 0:
142 trueWeightQualityMat[estnum-1][dnanum][0] += 1
143 elif Lower == len(cur_plf.limits):
144 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
145 else:
146 # because we count from 0 in python
147 Lower -= 1
148 Upper = Lower+1 ; # x-werte bleiben fest
149 #print value,Lower,Upper
150 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
151 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
152
153 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
154 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
155
156 trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
157
158 ctr = 0
159 for row in range(5):
160 for col in range(6):
161 for p_idx in range(run['numQualSuppPoints']):
162 #print ctr, row, col, p_idx
163 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
164 ctr += 1
165
166 #pdb.set_trace()
167 featureSum = 0
168 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
169 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
170
171 #assert int(featureSum) == len(est_part),pdb.set_trace()
172 # 'feature sum is not equal read size!!'
173
174 if exons.shape == (2,2):
175 dna_orig = dna[exons[0,0]:exons[0,1]]
176 dna_orig += dna[exons[1,0]:exons[1,1]]
177 elif exons.shape == (2,1):
178 #dna_orig = dna[exons[0,0]:exons[1,0]]
179 pass
180 else:
181 pass
182
183 dna_calc = "".join(dna_part)
184
185 #assert dna_orig == dna_calc, pdb.set_trace()
186
187 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
188
189
190 def computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, run, weightvector):
191 """
192 """
193
194 lengthSP = run['numLengthSuppPoints']
195 donSP = run['numDonSuppPoints']
196 accSP = run['numAccSuppPoints']
197 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
198 numq = run['numQualSuppPoints']*5*6
199
200 lengthP = weightvector[0:lengthSP]
201 donP = weightvector[lengthSP:lengthSP+donSP]
202 accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
203 mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
204 numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
205 numQualSuppPoints=run['numQualSuppPoints']
206
207 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11 }
208 original_est_mapped = [ map[e] for e in original_est ]
209
210 if True:
211 score=0.0
212 est_ptr=0
213 dna_ptr=0
214 ptr=0
215 while ptr<len(original_est_mapped):
216
217 if original_est_mapped[ptr]==10: #'[':
218 dnaletter=original_est_mapped[ptr+1]
219 estletter=original_est_mapped[ptr+2]
220 ptr+=4
221 else:
222 dnaletter=original_est_mapped[ptr]
223 estletter=dnaletter
224 ptr+=1
225
226 if estletter==0:# '-':
227 score+= mmatrixP[dna_letter]
228 else:
229 value = quality[est_ptr]
230 cur_plf = qualityPlifs[(estletter-1)*6+dnaletter]
231
232 Lower = 0
233 for elem in cur_plf.limits:
234 if elem<=value:
235 Lower+=1
236
237 if Lower == 0:
238 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+0] ;
239
240 elif Lower == len(cur_plf.limits):
241 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints + numQualSuppPoints-1] ;
242
243 else:
244 # because we count from 0 in python
245 Lower -= 1
246 Upper = Lower+1 ; # x-werte bleiben fest
247 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
248 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
249
250 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Upper]*weightup ;
251 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Lower]*weightlow ;
252
253 if estletter==0:# '-':
254 dna_ptr+=1
255 elif dnaletter==0:#'-':
256 est_ptr+=1
257 else:
258 dna_ptr+=1
259 est_ptr+=1
260
261 return score