+ renamed main dir in order to create python module hierarchy
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy
5 from numpy.matlib import mat,zeros,ones,inf
6 import pdb
7 from Plif import Plf,base_coord,linspace
8 import Configuration
9
10 def computeSpliceAlignWithQuality(dna, exons):
11 """
12 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
13 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
14 (ausser dem letzten und ersten)
15 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
16 bsp:
17 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
18 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
19 """
20
21 numberOfExons = (exons.shape)[0] # how many rows ?
22 exonSizes = [-1]*numberOfExons
23
24 #assert numberOfExons == 3
25
26 for idx in range(numberOfExons):
27 exonSizes[idx] = exons[idx,1] - exons[idx,0]
28
29 # SpliceAlign vector describes alignment:
30 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
31 SpliceAlign = []
32
33 if exons[0,0] > 0:
34 SpliceAlign.extend([4]*(exons[0,0]))
35
36 for idx in range(numberOfExons):
37 exonLength = exonSizes[idx]
38 SpliceAlign.extend([0]*exonLength)
39
40 if idx < numberOfExons-1:
41 intronLength = exons[idx+1,0] - exons[idx,1]
42 SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
43
44 if len(dna) > exons[-1,1]:
45 SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
46
47 assert len(SpliceAlign) == len(dna), pdb.set_trace()
48
49 # number of matches: just have to look at the underlying est
50 # est = dna(find(SpliceAlign==0)) # exon nucleotides
51 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
52 est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
53
54 #length_est = sum(exon_sizes) ;
55 totalESTLength = 0
56 for elem in exonSizes:
57 totalESTLength += elem
58
59 assert totalESTLength == len(est)
60
61 sizeMatchmatrix = Configuration.sizeMatchmatrix
62 # counts the occurences of a,c,g,t,n in this order
63 numChar = [0]*sizeMatchmatrix[0]
64
65 # counts the occurrences of a,c,g,t,n with their quality scores
66 #trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
67 trueWeightQuality = numpy.zeros(Configuration.numQualPlifs*Configuration.numQualSuppPoints)
68 trueWeightQuality = trueWeightQuality.reshape(Configuration.estPlifs,Configuration.dnaPlifs,Configuration.numQualSuppPoints)
69
70 for elem in est:
71 if elem == 'a':
72 numChar[0] += 1
73 if elem == 'c':
74 numChar[1] += 1
75 if elem == 'g':
76 numChar[2] += 1
77 if elem == 't':
78 numChar[3] += 1
79 if elem == 'n':
80 numChar[4] += 1
81
82 if Configuration.mode == 'using_quality_scores':
83 for pos,elem in enumerate(numChar):
84 if pos < Configuration.estPlifs:
85 trueWeightQuality[pos][pos+1][-1] = elem
86
87 trueWeightQuality = mat(trueWeightQuality.flatten())
88 trueWeightQuality = trueWeightQuality.T
89
90 totalNumChar = 0
91 for idx in range(sizeMatchmatrix[0]):
92 totalNumChar += numChar[idx]
93
94 assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
95
96 # writing in weight match matrix
97 # matrix is saved columnwise
98 if Configuration.mode == 'normal':
99 trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
100 for idx in range(1,sizeMatchmatrix[0]):
101 trueWeightMatch[idx,idx] = numChar[idx-1]
102
103 if Configuration.mode == 'using_quality_scores':
104 trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
105 #for idx in range(1,sizeMatchmatrix[0]):
106 # trueWeightMatch[idx] = numChar[idx-1]
107
108 trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)
109
110 return SpliceAlign, trueWeightMatch, trueWeightQuality