+ Extended C/C++ interface to return results easily
[qpalma.git] / python / computeSpliceAlign.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import zeros
5 import pdb
6
7 def computeSpliceAlign(dna, exons):
8 """
9 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
10 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
11 (ausser dem letzten und ersten)
12 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
13 bsp:
14 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
15 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
16 """
17
18 numberOfExons = (exons.shape)[0] # how many rows ?
19 exonSizes = [-1]*numberOfExons
20
21 for idx in range(numberOfExons):
22 exonSizes[idx] = exons[idx,1] - exons[idx,0]
23
24 sizeMatchmatrix = 6 # -acgtn
25
26 # SpliceAlign vector describes alignment:
27 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
28 SpliceAlign = []
29
30 if exons[0,0] > 1:
31 SpliceAlign.extend([4]*(exons[0,0]))
32
33 #for i=1:nr_exons
34 # exon_length = exon_sizes(i); %exons(i,2) is begin of intron
35 # SpliceAlign = [SpliceAlign, zeros(1,exon_length)] ;
36 # if i~= nr_exons,
37 # intron_length = exons(i+1,1) - exons(i,2) ;
38 # SpliceAlign = [SpliceAlign, 1, ones(1,intron_length-2)*3, 2] ;
39 # end
40 #end
41
42 for idx in range(numberOfExons):
43 exonLength = exonSizes[idx]
44 SpliceAlign.extend([0]*exonLength)
45
46 if idx < numberOfExons-1:
47 intronLength = exons[idx+1,0] - exons[idx,1]
48 SpliceAlign.extend([1])
49 SpliceAlign.extend([3]*((intronLength-2)))
50 SpliceAlign.extend([2])
51
52 if len(dna) > exons[2,1]:
53 SpliceAlign.extend([4]*(len(dna)+1-exons[2,1]))
54
55 assert len(SpliceAlign) == len(dna), pdb.set_trace()
56
57 # number of matches: just have to look at the underlying est
58 # est = dna(find(SpliceAlign==0)) # exon nucleotides
59 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
60 est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
61
62 #length_est = sum(exon_sizes) ;
63 totalESTLength = 0
64 for elem in exonSizes:
65 totalESTLength += elem
66
67 assert totalESTLength == len(est)
68
69 # counts the occurences of a,c,g,t,n in this order
70 numChar = [0]*sizeMatchmatrix
71
72 for elem in est:
73 if elem == 'a':
74 numChar[0] += 1
75 if elem == 'c':
76 numChar[1] += 1
77 if elem == 'g':
78 numChar[2] += 1
79 if elem == 't':
80 numChar[3] += 1
81 if elem == 'n':
82 numChar[4] += 1
83
84 totalNumChar = 0
85 for idx in range(sizeMatchmatrix):
86 totalNumChar += numChar[idx]
87
88 assert totalNumChar == len(est)
89
90 # writing in weight match matrix
91 # matrix is saved columnwise
92 trueWeightMatch = zeros((sizeMatchmatrix*sizeMatchmatrix,1)) # Scorematrix fuer Wahrheit
93 for idx in range(1,sizeMatchmatrix):
94 trueWeightMatch[(sizeMatchmatrix+1)*idx] = numChar[idx-1]
95
96 return SpliceAlign, trueWeightMatch