1 # This program is free software; you can redistribute it and/or modify
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
9 import numpy
10 from numpy.matlib import mat,zeros,ones,inf
11 import pdb
12 from Plif import Plf,base_coord,linspace
14 import sequence_utils
16 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
17 """
18 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
19 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
20 (ausser dem letzten und ersten)
21 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
22 bsp:
23 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
24 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
25 """
27 SpliceAlign = []
29 if exons.shape == (2,2):
30 numberOfExons = 2
31 exonSizes = [-1]*numberOfExons
33 exonSizes[0] = exons[0,1] - exons[0,0]
34 exonSizes[1] = exons[1,1] - exons[1,0]
36 # SpliceAlign vector describes alignment:
37 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
39 if exons[0,0] > 0:
40 SpliceAlign.extend([4]*(exons[0,0]))
42 for idx in range(numberOfExons):
43 exonLength = exonSizes[idx]
44 SpliceAlign.extend([0]*exonLength)
46 if idx < numberOfExons-1:
47 intronLength = exons[idx+1,0] - exons[idx,1]
48 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
50 if len(dna) > exons[-1,1]:
51 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
54 elif exons.shape == (2,1):
55 exonSize = exons[1,0] - exons[0,0]
57 if exons[0,0] > 0:
58 SpliceAlign.extend([4]*(exons[0,0]))
60 SpliceAlign.extend([0]*(exonSize))
62 if len(dna) > exons[1,0]:
63 SpliceAlign.extend([4]*(len(dna)-exons[1,0]))
65 else:
66 pass
68 assert len(SpliceAlign) == len(dna), pdb.set_trace()
70 #
71 # start of label feature generation
72 #
74 sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
75 # counts the occurences of a,c,g,t,n in this order
77 # counts the occurrences of a,c,g,t,n with their quality scores
78 trueWeightQualityMat = [0.0]*5 # 5 rows
79 for row in range(5):
80 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
82 for row in range(5):
83 for col in range(6):
84 trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
86 dna_part = []
87 est_part = []
89 orig_pos = 0
90 while True:
91 if not orig_pos < len(original_est):
92 break
94 orig_char = original_est[orig_pos]
96 if orig_char == '[':
97 dna_part.append( original_est[orig_pos+1] )
98 est_part.append( original_est[orig_pos+2] )
99 orig_pos += 4
100 else:
101 dna_part.append( orig_char )
102 est_part.append( orig_char )
103 orig_pos += 1
105 for orig_char in dna_part:
106 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
108 for orig_char in est_part:
109 assert orig_char in sequence_utils.alphabet, pdb.set_trace()
111 trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
113 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
115 ctr = 0
116 quality_part = []
117 for elem in est_part:
118 if elem == '-':
119 quality_part.append(-inf)
120 else:
121 quality_part.append(quality[ctr])
122 ctr += 1
124 assert len(est_part) == len(quality_part)
126 for idx in range(len(dna_part)):
127 dnanum = map[dna_part[idx]]
128 estnum = map[est_part[idx]]
129 value = quality_part[idx]
131 if estnum == 0:
132 trueWeightMatch[dnanum] += 1.0
133 assert value == -inf
135 else:
136 assert value != -inf
137 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
139 Lower = len([elem for elem in cur_plf.limits if elem <= value])
141 if Lower == 0:
142 trueWeightQualityMat[estnum-1][dnanum][0] += 1
143 elif Lower == len(cur_plf.limits):
144 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
145 else:
146 # because we count from 0 in python
147 Lower -= 1
148 Upper = Lower+1 ; # x-werte bleiben fest
149 #print value,Lower,Upper
150 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
151 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
153 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
154 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
156 trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
158 ctr = 0
159 for row in range(5):
160 for col in range(6):
161 for p_idx in range(run['numQualSuppPoints']):
162 #print ctr, row, col, p_idx
163 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
164 ctr += 1
166 #pdb.set_trace()
167 featureSum = 0
168 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
169 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
171 #assert int(featureSum) == len(est_part),pdb.set_trace()
172 # 'feature sum is not equal read size!!'
174 if exons.shape == (2,2):
175 dna_orig = dna[exons[0,0]:exons[0,1]]
176 dna_orig += dna[exons[1,0]:exons[1,1]]
177 elif exons.shape == (2,1):
178 #dna_orig = dna[exons[0,0]:exons[1,0]]
179 pass
180 else:
181 pass
183 dna_calc = "".join(dna_part)
185 #assert dna_orig == dna_calc, pdb.set_trace()
187 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
190 def computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, run, weightvector):
191 """
192 """
194 lengthSP = run['numLengthSuppPoints']
195 donSP = run['numDonSuppPoints']
196 accSP = run['numAccSuppPoints']
197 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
198 numq = run['numQualSuppPoints']*5*6
200 lengthP = weightvector[0:lengthSP]
201 donP = weightvector[lengthSP:lengthSP+donSP]
202 accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
203 mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
204 numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
205 numQualSuppPoints=run['numQualSuppPoints']
207 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11 }
208 original_est_mapped = [ map[e] for e in original_est ]
210 if True:
211 score=0.0
212 est_ptr=0
213 dna_ptr=0
214 ptr=0
215 while ptr<len(original_est_mapped):
217 if original_est_mapped[ptr]==10: #'[':
218 dnaletter=original_est_mapped[ptr+1]
219 estletter=original_est_mapped[ptr+2]
220 ptr+=4
221 else:
222 dnaletter=original_est_mapped[ptr]
223 estletter=dnaletter
224 ptr+=1
226 if estletter==0:# '-':
227 score+= mmatrixP[dna_letter]
228 else:
229 value = quality[est_ptr]
230 cur_plf = qualityPlifs[(estletter-1)*6+dnaletter]
232 Lower = 0
233 for elem in cur_plf.limits:
234 if elem<=value:
235 Lower+=1
237 if Lower == 0:
238 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+0] ;
240 elif Lower == len(cur_plf.limits):
241 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints + numQualSuppPoints-1] ;
243 else:
244 # because we count from 0 in python
245 Lower -= 1
246 Upper = Lower+1 ; # x-werte bleiben fest
247 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
248 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
250 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Upper]*weightup ;
251 score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Lower]*weightlow ;
253 if estletter==0:# '-':
254 dna_ptr+=1
255 elif dnaletter==0:#'-':
256 est_ptr+=1
257 else:
258 dna_ptr+=1
259 est_ptr+=1
261 return score