+ added feature calculation for the labels
[qpalma.git] / python / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import pdb
6 from Plif import Plf,base_coord,linspace
7 import Configuration
8
9 def computeSpliceAlignWithQuality(dna, exons, quality):
10 """
11 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
12 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
13 (ausser dem letzten und ersten)
14 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
15 bsp:
16 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
17 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
18 """
19
20 numberOfExons = (exons.shape)[0] # how many rows ?
21 exonSizes = [-1]*numberOfExons
22
23 assert numberOfExons == 3
24
25 for idx in range(numberOfExons):
26 exonSizes[idx] = exons[idx,1] - exons[idx,0]
27
28 sizeMatchmatrix = 6 # -acgtn
29
30 # SpliceAlign vector describes alignment:
31 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
32 SpliceAlign = []
33
34 if exons[0,0] > 0:
35 SpliceAlign.extend([4]*(exons[0,0]))
36
37 for idx in range(numberOfExons):
38 exonLength = exonSizes[idx]
39 SpliceAlign.extend([0]*exonLength)
40
41 if idx < numberOfExons-1:
42 intronLength = exons[idx+1,0] - exons[idx,1]
43 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
44
45 if len(dna) > exons[2,1]:
46 SpliceAlign.extend([4]*(len(dna)-exons[2,1]))
47
48 assert len(SpliceAlign) == len(dna), pdb.set_trace()
49
50 # number of matches: just have to look at the underlying est
51 # est = dna(find(SpliceAlign==0)) # exon nucleotides
52 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
53 est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
54
55 #length_est = sum(exon_sizes) ;
56 totalESTLength = 0
57 for elem in exonSizes:
58 totalESTLength += elem
59
60 assert totalESTLength == len(est)
61
62 # counts the occurences of a,c,g,t,n in this order
63 numChar = [0]*sizeMatchmatrix
64
65 # counts the occurrnces of a,c,g,t,n with their quality scores
66 trueQualityPlifs = [Plf(Configuration.numQualSuppPoints)] * Configuration.numQualPlifs
67 for currentPlif in trueQualityPlifs:
68 currentPlif.limits = mat(linspace(0,40,10)).reshape(10,1)
69 currentPlif.penalties = zeros((10,1))
70
71 for pos, elem in enumerate(est):
72 if elem == 'a':
73 numChar[0] += 1
74 if elem == 'c':
75 numChar[1] += 1
76 if elem == 'g':
77 numChar[2] += 1
78 if elem == 't':
79 numChar[3] += 1
80 if elem == 'n':
81 numChar[4] += 1
82
83 currentPlif = trueQualityPlifs[base_coord(elem,elem)]
84 currentQualityScore = quality[pos]
85
86 if currentQualityScore < currentPlif.limits[0]:
87 currentPlif.penalties[0] += 1
88
89 elif currentQualityScore >= currentPlif.limits[-1]:
90 currentPlif.penalties[-1] += 1
91
92 else:
93 for idx in range(1,currentPlif.len):
94 if currentPlif.limits[idx-1] <= currentQualityScore < currentPlif.limits[idx]:
95 currentPlif.penalties[idx] += 1
96
97 totalNumChar = 0
98 for idx in range(sizeMatchmatrix):
99 totalNumChar += numChar[idx]
100
101 assert totalNumChar == len(est)
102
103 # writing in weight match matrix
104 # matrix is saved columnwise
105 trueWeightMatch = zeros((sizeMatchmatrix*sizeMatchmatrix,1)) # Scorematrix fuer Wahrheit
106 for idx in range(1,sizeMatchmatrix):
107 trueWeightMatch[(sizeMatchmatrix+1)*idx] = numChar[idx-1]
108
109 return SpliceAlign, trueWeightMatch, trueQualityPlifs
110