git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8589 e1793c9e...
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8
9 import Helpers
10
11 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
12 """
13 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
14 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
15 (ausser dem letzten und ersten)
16 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
17 bsp:
18 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
19 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
20 """
21
22 SpliceAlign = []
23
24 if exons.shape == (2,2):
25 numberOfExons = 2
26 exonSizes = [-1]*numberOfExons
27
28 exonSizes[0] = exons[0,1] - exons[0,0]
29 exonSizes[1] = exons[1,1] - exons[1,0]
30
31 # SpliceAlign vector describes alignment:
32 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
33
34 if exons[0,0] > 0:
35 SpliceAlign.extend([4]*(exons[0,0]))
36
37 for idx in range(numberOfExons):
38 exonLength = exonSizes[idx]
39 SpliceAlign.extend([0]*exonLength)
40
41 if idx < numberOfExons-1:
42 intronLength = exons[idx+1,0] - exons[idx,1]
43 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
44
45 if len(dna) > exons[-1,1]:
46 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
47
48
49 elif exons.shape == (2,1):
50 exonSize = exons[1,0] - exons[0,0]
51 SpliceAlign.extend([0]*(exonSize))
52
53 else:
54 pass
55
56 assert len(SpliceAlign) == len(dna), pdb.set_trace()
57
58 #
59 # start of label feature generation
60 #
61
62 sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
63 # counts the occurences of a,c,g,t,n in this order
64
65 # counts the occurrences of a,c,g,t,n with their quality scores
66 trueWeightQualityMat = [0.0]*5 # 5 rows
67 for row in range(5):
68 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
69
70 for row in range(5):
71 for col in range(6):
72 trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
73
74 dna_part = []
75 est_part = []
76
77 orig_pos = 0
78 while True:
79 if not orig_pos < len(original_est):
80 break
81
82 orig_char = original_est[orig_pos]
83
84 if orig_char == '[':
85 dna_part.append( original_est[orig_pos+1] )
86 est_part.append( original_est[orig_pos+2] )
87 orig_pos += 4
88 else:
89 dna_part.append( orig_char )
90 est_part.append( orig_char )
91 orig_pos += 1
92
93 for orig_char in dna_part:
94 assert orig_char in Helpers.alphabet, pdb.set_trace()
95
96 for orig_char in est_part:
97 assert orig_char in Helpers.alphabet, pdb.set_trace()
98
99 trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
100
101 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
102
103 ctr = 0
104 quality_part = []
105 for elem in est_part:
106 if elem == '-':
107 quality_part.append(-inf)
108 else:
109 quality_part.append(quality[ctr])
110 ctr += 1
111
112 assert len(est_part) == len(quality_part)
113
114 for idx in range(len(dna_part)):
115 dnanum = map[dna_part[idx]]
116 estnum = map[est_part[idx]]
117 value = quality_part[idx]
118
119 if estnum == 0:
120 trueWeightMatch[dnanum] += 1.0
121 assert value == -inf
122
123 else:
124 assert value != -inf
125 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
126
127 Lower = len([elem for elem in cur_plf.limits if elem <= value])
128
129 if Lower == 0:
130 trueWeightQualityMat[estnum-1][dnanum][0] += 1
131 elif Lower == len(cur_plf.limits):
132 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
133 else:
134 # because we count from 0 in python
135 Lower -= 1
136 Upper = Lower+1 ; # x-werte bleiben fest
137 #print value,Lower,Upper
138 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
139 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
140
141 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
142 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
143
144 trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
145
146 ctr = 0
147 for row in range(5):
148 for col in range(6):
149 for p_idx in range(run['numQualSuppPoints']):
150 #print ctr, row, col, p_idx
151 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
152 ctr += 1
153
154 #pdb.set_trace()
155 featureSum = 0
156 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
157 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
158
159 #assert int(featureSum) == len(est_part),pdb.set_trace()
160 # 'feature sum is not equal read size!!'
161
162 if exons.shape == (2,2):
163 dna_orig = dna[exons[0,0]:exons[0,1]]
164 dna_orig += dna[exons[1,0]:exons[1,1]]
165 elif exons.shape == (2,1):
166 #dna_orig = dna[exons[0,0]:exons[1,0]]
167 pass
168 else:
169 pass
170
171 dna_calc = "".join(dna_part)
172
173 #assert dna_orig == dna_calc, pdb.set_trace()
174
175 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
176
177
178 def computeSpliceAlignScoreWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run, weightvector):
179 """
180 """
181
182 lengthSP = run['numLengthSuppPoints']
183 donSP = run['numDonSuppPoints']
184 accSP = run['numAccSuppPoints']
185 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
186 numq = run['numQualSuppPoints']
187
188 lengthP = weightvector[0:lengthSP]
189 donP = weightvector[lengthSP:lengthSP+donSP]
190 accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
191 mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
192 numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
193 numQualSuppPoints=run['numQualSuppPoints']
194
195 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11}
196 original_est_mapped = [ map[e] for e in original_est ]
197
198 if exons.shape == (2,2):
199 numberOfExons = 2
200 exonSizes = [-1]*numberOfExons
201
202 exonSizes[0] = exons[0,1] - exons[0,0]
203 exonSizes[1] = exons[1,1] - exons[1,0]
204
205 score=0.0
206
207 elif exons.shape == (2,1):
208 score=0.0
209 est_ptr=0
210 dna_ptr=0
211 ptr=0
212 while ptr<len(original_est_mapped):
213
214 if original_est[ptr]==10: #'[':
215 dnaletter=original_est_mapped[ptr+1]
216 estletter=original_est_mapped
217 ptr+=4
218 else:
219 dnaletter=original_est[ptr+2][ptr]
220 estletter=dnaletter
221 ptr+=1
222 if est_letter=='-':
223 score+= mmatrixP[dna_letter]
224 dna_ptr+=1
225 else:
226 est_qual = quality[est_ptr]
227
228 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
229
230 Lower = len([elem for elem in cur_plf.limits if elem <= value])
231
232 if Lower == 0:
233 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+0] ;
234
235 elif Lower == len(cur_plf.limits):
236 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+numQualSuppPoints-1] ;
237
238 else:
239 # because we count from 0 in python
240 Lower -= 1
241 Upper = Lower+1 ; # x-werte bleiben fest
242 #print value,Lower,Upper
243 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
244 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
245
246 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Upper]*weightup ;
247 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Lower]*weightlow ;
248
249 if est_letter=='-':
250 dna_ptr+=1
251 elif dna_letter=='-':
252 est_ptr+=1
253 else:
254 dna_ptr+=1
255 est_ptr+=1
256
257 assert(dna_ptr<len(dna))
258 assert(est_ptr<len(est))
259
260 exonSize = exons[1,0] - exons[0,0]
261 SpliceAlign.extend([0]*(exonSize))
262
263 else:
264 assert(False)
265
266 return score