+ added some assertions
[qpalma.git] / python / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import pdb
6 from Plif import Plf,base_coord,linspace
7 import Configuration
8
9 def computeSpliceAlignWithQuality(dna, exons):
10 """
11 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
12 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
13 (ausser dem letzten und ersten)
14 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
15 bsp:
16 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
17 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
18 """
19
20 numberOfExons = (exons.shape)[0] # how many rows ?
21 exonSizes = [-1]*numberOfExons
22
23 assert numberOfExons == 3
24
25 for idx in range(numberOfExons):
26 exonSizes[idx] = exons[idx,1] - exons[idx,0]
27
28 # SpliceAlign vector describes alignment:
29 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
30 SpliceAlign = []
31
32 if exons[0,0] > 0:
33 SpliceAlign.extend([4]*(exons[0,0]))
34
35 for idx in range(numberOfExons):
36 exonLength = exonSizes[idx]
37 SpliceAlign.extend([0]*exonLength)
38
39 if idx < numberOfExons-1:
40 intronLength = exons[idx+1,0] - exons[idx,1]
41 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
42
43 if len(dna) > exons[2,1]:
44 SpliceAlign.extend([4]*(len(dna)-exons[2,1]))
45
46 assert len(SpliceAlign) == len(dna), pdb.set_trace()
47
48 # number of matches: just have to look at the underlying est
49 # est = dna(find(SpliceAlign==0)) # exon nucleotides
50 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
51 est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
52
53 #length_est = sum(exon_sizes) ;
54 totalESTLength = 0
55 for elem in exonSizes:
56 totalESTLength += elem
57
58 assert totalESTLength == len(est)
59
60 sizeMatchmatrix = Configuration.sizeMatchmatrix
61 # counts the occurences of a,c,g,t,n in this order
62 numChar = [0]*sizeMatchmatrix[0]
63
64 # counts the occurrences of a,c,g,t,n with their quality scores
65 trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
66
67 for elem in est:
68 if elem == 'a':
69 numChar[0] += 1
70 if elem == 'c':
71 numChar[1] += 1
72 if elem == 'g':
73 numChar[2] += 1
74 if elem == 't':
75 numChar[3] += 1
76 if elem == 'n':
77 numChar[4] += 1
78
79 if Configuration.mode == 'using_quality_scores':
80 trueWeightQuality[base_coord(elem,elem)*Configuration.numQualSuppPoints+Configuration.numQualSuppPoints-1] += 1.0
81
82 totalNumChar = 0
83 for idx in range(sizeMatchmatrix[0]):
84 totalNumChar += numChar[idx]
85
86 assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
87
88 # writing in weight match matrix
89 # matrix is saved columnwise
90 trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
91 for idx in range(1,sizeMatchmatrix[0]):
92 trueWeightMatch[idx,idx] = numChar[idx-1]
93
94 trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)
95
96 return SpliceAlign, trueWeightMatch, trueWeightQuality