+ added settings parser module
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8
9 import sequence_utils
10
11 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
12 """
13 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
14 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
15 (ausser dem letzten und ersten)
16 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
17 bsp:
18 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
19 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
20 """
21
22 SpliceAlign = []
23
24 if exons.shape == (2,2):
25 numberOfExons = 2
26 exonSizes = [-1]*numberOfExons
27
28 exonSizes[0] = exons[0,1] - exons[0,0]
29 exonSizes[1] = exons[1,1] - exons[1,0]
30
31 # SpliceAlign vector describes alignment:
32 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
33
34 if exons[0,0] > 0:
35 SpliceAlign.extend([4]*(exons[0,0]))
36
37 for idx in range(numberOfExons):
38 exonLength = exonSizes[idx]
39 SpliceAlign.extend([0]*exonLength)
40
41 if idx < numberOfExons-1:
42 intronLength = exons[idx+1,0] - exons[idx,1]
43 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
44
45 if len(dna) > exons[-1,1]:
46 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
47
48
49 elif exons.shape == (2,1):
50 exonSize = exons[1,0] - exons[0,0]
51
52 if exons[0,0] > 0:
53 SpliceAlign.extend([4]*(exons[0,0]))
54
55 SpliceAlign.extend([0]*(exonSize))
56
57 if len(dna) > exons[1,0]:
58 SpliceAlign.extend([4]*(len(dna)-exons[1,0]))
59
60 else:
61 pass
62
63 assert len(SpliceAlign) == len(dna), pdb.set_trace()
64
65 #
66 # start of label feature generation
67 #
68
69 sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
70 # counts the occurences of a,c,g,t,n in this order
71
72 # counts the occurrences of a,c,g,t,n with their quality scores
73 trueWeightQualityMat = [0.0]*5 # 5 rows
74 for row in range(5):
75 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
76
77 for row in range(5):
78 for col in range(6):
79 trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
80
81 dna_part = []
82 est_part = []
83
84 orig_pos = 0
85 while True:
86 if not orig_pos < len(original_est):
87 break
88
89 orig_char = original_est[orig_pos]
90
91 if orig_char == '[':
92 dna_part.append( original_est[orig_pos+1] )
93 est_part.append( original_est[orig_pos+2] )
94 orig_pos += 4
95 else:
96 dna_part.append( orig_char )
97 est_part.append( orig_char )
98 orig_pos += 1
99
100 for orig_char in dna_part:
101 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
102
103 for orig_char in est_part:
104 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
105
106 trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
107
108 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
109
110 ctr = 0
111 quality_part = []
112 for elem in est_part:
113 if elem == '-':
114 quality_part.append(-inf)
115 else:
116 quality_part.append(quality[ctr])
117 ctr += 1
118
119 assert len(est_part) == len(quality_part)
120
121 for idx in range(len(dna_part)):
122 dnanum = map[dna_part[idx]]
123 estnum = map[est_part[idx]]
124 value = quality_part[idx]
125
126 if estnum == 0:
127 trueWeightMatch[dnanum] += 1.0
128 assert value == -inf
129
130 else:
131 assert value != -inf
132 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
133
134 Lower = len([elem for elem in cur_plf.limits if elem <= value])
135
136 if Lower == 0:
137 trueWeightQualityMat[estnum-1][dnanum][0] += 1
138 elif Lower == len(cur_plf.limits):
139 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
140 else:
141 # because we count from 0 in python
142 Lower -= 1
143 Upper = Lower+1 ; # x-werte bleiben fest
144 #print value,Lower,Upper
145 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
146 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
147
148 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
149 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
150
151 trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
152
153 ctr = 0
154 for row in range(5):
155 for col in range(6):
156 for p_idx in range(run['numQualSuppPoints']):
157 #print ctr, row, col, p_idx
158 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
159 ctr += 1
160
161 #pdb.set_trace()
162 featureSum = 0
163 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
164 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
165
166 #assert int(featureSum) == len(est_part),pdb.set_trace()
167 # 'feature sum is not equal read size!!'
168
169 if exons.shape == (2,2):
170 dna_orig = dna[exons[0,0]:exons[0,1]]
171 dna_orig += dna[exons[1,0]:exons[1,1]]
172 elif exons.shape == (2,1):
173 #dna_orig = dna[exons[0,0]:exons[1,0]]
174 pass
175 else:
176 pass
177
178 dna_calc = "".join(dna_part)
179
180 #assert dna_orig == dna_calc, pdb.set_trace()
181
182 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
183
184
185 def computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, run, weightvector):
186 """
187 """
188
189 lengthSP = run['numLengthSuppPoints']
190 donSP = run['numDonSuppPoints']
191 accSP = run['numAccSuppPoints']
192 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
193 numq = run['numQualSuppPoints']*5*6
194
195 lengthP = weightvector[0:lengthSP]
196 donP = weightvector[lengthSP:lengthSP+donSP]
197 accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
198 mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
199 numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
200 numQualSuppPoints=run['numQualSuppPoints']
201
202 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11 }
203 original_est_mapped = [ map[e] for e in original_est ]
204
205 if True:
206 score=0.0
207 est_ptr=0
208 dna_ptr=0
209 ptr=0
210 while ptr<len(original_est_mapped):
211
212 if original_est_mapped[ptr]==10: #'[':
213 dnaletter=original_est_mapped[ptr+1]
214 estletter=original_est_mapped[ptr+2]
215 ptr+=4
216 else:
217 dnaletter=original_est_mapped[ptr]
218 estletter=dnaletter
219 ptr+=1
220
221 if estletter==0:# '-':
222 score+= mmatrixP[dna_letter]
223 else:
224 value = quality[est_ptr]
225 cur_plf = qualityPlifs[(estletter-1)*6+dnaletter]
226
227 Lower = 0
228 for elem in cur_plf.limits:
229 if elem<=value:
230 Lower+=1
231
232 if Lower == 0:
233 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+0] ;
234
235 elif Lower == len(cur_plf.limits):
236 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints + numQualSuppPoints-1] ;
237
238 else:
239 # because we count from 0 in python
240 Lower -= 1
241 Upper = Lower+1 ; # x-werte bleiben fest
242 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
243 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
244
245 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Upper]*weightup ;
246 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Lower]*weightlow ;
247
248 if estletter==0:# '-':
249 dna_ptr+=1
250 elif dnaletter==0:#'-':
251 est_ptr+=1
252 else:
253 dna_ptr+=1
254 est_ptr+=1
255
256 return score