+ fixed bug in the computeSpliceAlign... function
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8 import Configuration as Conf
9
10 def computeSpliceAlignWithQuality(dna, exons, est=None, quality=None,
11 qualityPlifs=None):
12 """
13 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
14 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
15 (ausser dem letzten und ersten)
16 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
17 bsp:
18 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
19 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
20 """
21
22 numberOfExons = (exons.shape)[0] # how many rows ?
23 exonSizes = [-1]*numberOfExons
24
25 #assert numberOfExons == 3
26
27 for idx in range(numberOfExons):
28 exonSizes[idx] = exons[idx,1] - exons[idx,0]
29
30 # SpliceAlign vector describes alignment:
31 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
32 SpliceAlign = []
33
34 if exons[0,0] > 0:
35 SpliceAlign.extend([4]*(exons[0,0]))
36
37 for idx in range(numberOfExons):
38 exonLength = exonSizes[idx]
39 SpliceAlign.extend([0]*exonLength)
40
41 if idx < numberOfExons-1:
42 intronLength = exons[idx+1,0] - exons[idx,1]
43 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
44
45 if len(dna) > exons[-1,1]:
46 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
47
48 assert len(SpliceAlign) == len(dna), pdb.set_trace()
49
50 # number of matches: just have to look at the underlying est
51 # est = dna(find(SpliceAlign==0)) # exon nucleotides
52 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
53
54 #est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
55
56 #length_est = sum(exon_sizes) ;
57 totalESTLength = 0
58 for elem in exonSizes:
59 totalESTLength += elem
60
61 assert totalESTLength == len(est)
62
63 sizeMatchmatrix = Conf.sizeMatchmatrix
64 # counts the occurences of a,c,g,t,n in this order
65 numChar = [0]*sizeMatchmatrix[0]
66
67 # counts the occurrences of a,c,g,t,n with their quality scores
68 trueWeightQualityMat = [0.0]*5 # 5 rows
69 for row in range(5):
70 trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
71
72 for elem in est:
73 if elem == 'a':
74 numChar[0] += 1
75 if elem == 'c':
76 numChar[1] += 1
77 if elem == 'g':
78 numChar[2] += 1
79 if elem == 't':
80 numChar[3] += 1
81 if elem == 'n':
82 numChar[4] += 1
83
84 for row in range(5):
85 for col in range(6):
86 trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
87
88 first_exon = dna[exons[0,0]:exons[0,1]]
89 second_exon = dna[exons[1,0]:exons[1,1]]
90 dna_part = first_exon + second_exon
91
92 assert len(dna_part) == len(est)
93 assert len(est) == sum(numChar)
94
95 #print numChar
96 #print est
97
98 map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
99
100 if Conf.mode == 'using_quality_scores':
101
102 for idx in range(len(dna_part)):
103 dnanum = map[dna_part[idx]]
104 estnum = map[est[idx]]
105 value = quality[idx]
106
107 cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
108
109 Lower = len([elem for elem in cur_plf.limits if elem <= value])
110
111 if Lower == 0:
112 trueWeightQualityMat[estnum-1][dnanum][0] += 1
113 elif Lower == len(cur_plf.limits):
114 trueWeightQualityMat[estnum-1][dnanum][-1] += 1
115 else:
116 # because we count from 0 in python
117 Lower -= 1
118 Upper = Lower+1 ; # x-werte bleiben fest
119 #print value,Lower,Upper
120 weightup = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
121 weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
122
123 trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
124 trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
125
126 trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
127
128 ctr = 0
129 for row in range(5):
130 for col in range(6):
131 for p_idx in range(Conf.numQualSuppPoints):
132 #print ctr, row, col, p_idx
133 trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
134 ctr += 1
135
136 #assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
137
138 totalNumChar = 0
139 for idx in range(sizeMatchmatrix[0]):
140 totalNumChar += numChar[idx]
141
142 assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
143
144 # writing in weight match matrix
145 # matrix is saved columnwise
146 if Conf.mode == 'normal':
147 trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
148 for idx in range(1,sizeMatchmatrix[0]):
149 trueWeightMatch[idx,idx] = numChar[idx-1]
150
151 if Conf.mode == 'using_quality_scores':
152 trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
153 #for idx in range(1,sizeMatchmatrix[0]):
154 # trueWeightMatch[idx] = numChar[idx-1]
155
156 trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)
157
158 return SpliceAlign, trueWeightMatch, trueWeightQuality