+ refactored code further
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8
9 import Helpers
10
11 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
12 """
13 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
14 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
15 (ausser dem letzten und ersten)
16 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
17 bsp:
18 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
19 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
20 """
21
22 numberOfExons = (exons.shape)[0] # how many rows ?
23 exonSizes = [-1]*numberOfExons
24
25 #assert numberOfExons == 3
26
27 for idx in range(numberOfExons):
28 exonSizes[idx] = exons[idx,1] - exons[idx,0]
29
30 # SpliceAlign vector describes alignment:
31 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
32 SpliceAlign = []
33
34 if exons[0,0] > 0:
35 SpliceAlign.extend([4]*(exons[0,0]))
36
37 for idx in range(numberOfExons):
38 exonLength = exonSizes[idx]
39 SpliceAlign.extend([0]*exonLength)
40
41 if idx < numberOfExons-1:
42 intronLength = exons[idx+1,0] - exons[idx,1]
43 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
44
45 if len(dna) > exons[-1,1]:
46 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
47
48 assert len(SpliceAlign) == len(dna), pdb.set_trace()
49
50 # number of matches: just have to look at the underlying est
51 # est = dna(find(SpliceAlign==0)) # exon nucleotides
52 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
53
54 #
55 # start of label feature generation
56 #
57
58 sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
59 # counts the occurences of a,c,g,t,n in this order
60
61 # counts the occurrences of a,c,g,t,n with their quality scores
62 trueWeightQualityMat = [0.0]*5 # 5 rows
63 for row in range(5):
64 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
65
66 for row in range(5):
67 for col in range(6):
68 trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
69
70 dna_part = []
71 est_part = []
72
73 orig_pos = 0
74 while True:
75 if not orig_pos < len(original_est):
76 break
77
78 orig_char = original_est[orig_pos]
79
80 if orig_char == '[':
81 dna_part.append( original_est[orig_pos+1] )
82 est_part.append( original_est[orig_pos+2] )
83 orig_pos += 4
84 else:
85 dna_part.append( orig_char )
86 est_part.append( orig_char )
87 orig_pos += 1
88
89 for orig_char in dna_part:
90 assert orig_char in Helpers.alphabet, pdb.set_trace()
91
92 for orig_char in est_part:
93 assert orig_char in Helpers.alphabet, pdb.set_trace()
94
95 trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
96
97 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
98
99 ctr = 0
100 quality_part = []
101 for elem in est_part:
102 if elem == '-':
103 quality_part.append(-inf)
104 else:
105 quality_part.append(quality[ctr])
106 ctr += 1
107
108 assert len(est_part) == len(quality_part)
109
110 for idx in range(len(dna_part)):
111 dnanum = map[dna_part[idx]]
112 estnum = map[est_part[idx]]
113 value = quality_part[idx]
114
115 if estnum == 0:
116 trueWeightMatch[dnanum] += 1.0
117 assert value == -inf
118
119 else:
120 assert value != -inf
121 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
122
123 Lower = len([elem for elem in cur_plf.limits if elem <= value])
124
125 if Lower == 0:
126 trueWeightQualityMat[estnum-1][dnanum][0] += 1
127 elif Lower == len(cur_plf.limits):
128 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
129 else:
130 # because we count from 0 in python
131 Lower -= 1
132 Upper = Lower+1 ; # x-werte bleiben fest
133 #print value,Lower,Upper
134 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
135 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
136
137 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
138 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
139
140 trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
141
142 ctr = 0
143 for row in range(5):
144 for col in range(6):
145 for p_idx in range(run['numQualSuppPoints']):
146 #print ctr, row, col, p_idx
147 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
148 ctr += 1
149
150 #pdb.set_trace()
151 featureSum = 0
152 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
153 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
154
155 #assert int(featureSum) == len(est_part),pdb.set_trace()
156 # 'feature sum is not equal read size!!'
157
158 dna_orig = dna[exons[0,0]:exons[0,1]]
159 dna_orig += dna[exons[1,0]:exons[1,1]]
160 dna_calc = "".join(dna_part)
161
162 #assert dna_orig == dna_calc, pdb.set_trace()
163
164 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc