+ fixed a bug in the filtering (forgot to disable remove_ambiguities)
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8 import Configuration as Conf
9
10 def computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs):
11 """
12 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
13 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
14 (ausser dem letzten und ersten)
15 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
16 bsp:
17 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
18 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
19 """
20
21 numberOfExons = (exons.shape)[0] # how many rows ?
22 exonSizes = [-1]*numberOfExons
23
24 #assert numberOfExons == 3
25
26 for idx in range(numberOfExons):
27 exonSizes[idx] = exons[idx,1] - exons[idx,0]
28
29 # SpliceAlign vector describes alignment:
30 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
31 SpliceAlign = []
32
33 if exons[0,0] > 0:
34 SpliceAlign.extend([4]*(exons[0,0]))
35
36 for idx in range(numberOfExons):
37 exonLength = exonSizes[idx]
38 SpliceAlign.extend([0]*exonLength)
39
40 if idx < numberOfExons-1:
41 intronLength = exons[idx+1,0] - exons[idx,1]
42 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
43
44 if len(dna) > exons[-1,1]:
45 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
46
47 assert len(SpliceAlign) == len(dna), pdb.set_trace()
48
49 # number of matches: just have to look at the underlying est
50 # est = dna(find(SpliceAlign==0)) # exon nucleotides
51 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
52
53 #
54 # start of label feature generation
55 #
56
57 sizeMatchmatrix = Conf.sizeMatchmatrix
58 # counts the occurences of a,c,g,t,n in this order
59
60 # counts the occurrences of a,c,g,t,n with their quality scores
61 trueWeightQualityMat = [0.0]*5 # 5 rows
62 for row in range(5):
63 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
64
65 for row in range(5):
66 for col in range(6):
67 trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
68
69 dna_part = []
70 est_part = []
71
72 orig_pos = 0
73 while True:
74 if not orig_pos < len(original_est):
75 break
76
77 orig_char = original_est[orig_pos]
78
79 if orig_char == '[':
80 dna_part.append( original_est[orig_pos+1] )
81 est_part.append( original_est[orig_pos+2] )
82 orig_pos += 4
83 else:
84 dna_part.append( orig_char )
85 est_part.append( orig_char )
86 orig_pos += 1
87
88 for orig_char in dna_part:
89 assert orig_char in Conf.alphabet, pdb.set_trace()
90
91 for orig_char in est_part:
92 assert orig_char in Conf.alphabet, pdb.set_trace()
93
94 trueWeightMatch = zeros((sizeMatchmatrix[0]*sizeMatchmatrix[1],1)) # Scorematrix fuer Wahrheit
95
96 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
97
98 ctr = 0
99 quality_part = []
100 for elem in est_part:
101 if elem == '-':
102 quality_part.append(-inf)
103 else:
104 quality_part.append(quality[ctr])
105 ctr += 1
106
107 assert len(est_part) == len(quality_part)
108
109 for idx in range(len(dna_part)):
110 dnanum = map[dna_part[idx]]
111 estnum = map[est_part[idx]]
112 value = quality_part[idx]
113
114 if estnum == 0:
115 trueWeightMatch[dnanum] += 1.0
116 assert value == -inf
117
118 else:
119 assert value != -inf
120 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
121
122 Lower = len([elem for elem in cur_plf.limits if elem <= value])
123
124 if Lower == 0:
125 trueWeightQualityMat[estnum-1][dnanum][0] += 1
126 elif Lower == len(cur_plf.limits):
127 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
128 else:
129 # because we count from 0 in python
130 Lower -= 1
131 Upper = Lower+1 ; # x-werte bleiben fest
132 #print value,Lower,Upper
133 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
134 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
135
136 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
137 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
138
139 trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
140
141 ctr = 0
142 for row in range(5):
143 for col in range(6):
144 for p_idx in range(Conf.numQualSuppPoints):
145 #print ctr, row, col, p_idx
146 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
147 ctr += 1
148
149 #pdb.set_trace()
150 featureSum = 0
151 featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
152 featureSum += sum(trueWeightMatch.flatten().tolist()[0])
153
154 #assert int(featureSum) == len(est_part),pdb.set_trace()
155 # 'feature sum is not equal read size!!'
156
157 dna_orig = dna[exons[0,0]:exons[0,1]]
158 dna_orig += dna[exons[1,0]:exons[1,1]]
159 dna_calc = "".join(dna_part)
160
161 #assert dna_orig == dna_calc, pdb.set_trace()
162
163 return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc