+ added first converted scripts
[qpalma.git] / python / computeSpliceAlign.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import zeros
5 import pdb
6
7
8 def computeSpliceAlign(dna, exons):
9 """
10 Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
11 Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
12 (ausser dem letzten und ersten)
13 und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
14 bsp:
15 cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
16 cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
17 """
18
19 numberOfExons = (exons.shape)[0] # how many rows ?
20 exonSizes = [-1]*numberOfExons
21
22
23 for idx in range(numberOfExons):
24 exonSizes[idx] = exons[idx,1] - exons[idx,0]
25
26 sizeMatchmatrix = 6 # -acgtn
27
28 # SpliceAlign vector describes alignment:
29 # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
30 SpliceAlign = []
31
32 if exons[0,0] > 1:
33 SpliceAlign.extend([4]*(exons[0,0]))
34
35 #for i=1:nr_exons
36 # exon_length = exon_sizes(i); %exons(i,2) is begin of intron
37 # SpliceAlign = [SpliceAlign, zeros(1,exon_length)] ;
38 # if i~= nr_exons,
39 # intron_length = exons(i+1,1) - exons(i,2) ;
40 # SpliceAlign = [SpliceAlign, 1, ones(1,intron_length-2)*3, 2] ;
41 # end
42 #end
43
44 for idx in range(numberOfExons):
45 exonLength = exonSizes[idx]
46 SpliceAlign.extend([0]*exonLength)
47
48 if idx < numberOfExons-1:
49 intronLength = exons[idx+1,0] - exons[idx,1]
50 SpliceAlign.extend([1])
51 SpliceAlign.extend([3]*((intronLength-2)))
52 SpliceAlign.extend([2])
53
54 if len(dna) > exons[2,1]:
55 SpliceAlign.extend([4]*(len(dna)+1-exons[2,1]))
56
57 assert len(SpliceAlign) == len(dna), pdb.set_trace()
58
59 # number of matches: just have to look at the underlying est
60 # est = dna(find(SpliceAlign==0)) # exon nucleotides
61 exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
62 est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
63
64 #length_est = sum(exon_sizes) ;
65 totalESTLength = 0
66 for elem in exonSizes:
67 totalESTLength += elem
68
69 assert totalESTLength == len(est)
70
71 # counts the occurences of a,c,g,t,n in this order
72 numChar = [0]*sizeMatchmatrix
73
74 for elem in est:
75 if elem == 'a':
76 numChar[0] += 1
77 if elem == 'c':
78 numChar[1] += 1
79 if elem == 'g':
80 numChar[2] += 1
81 if elem == 't':
82 numChar[3] += 1
83 if elem == 'n':
84 numChar[4] += 1
85
86 totalNumChar = 0
87 for idx in range(sizeMatchmatrix):
88 totalNumChar += numChar[idx]
89
90 assert totalNumChar == len(est)
91
92 # writing in weight match matrix
93 # matrix is saved columnwise
94 trueWeightMatch = zeros((sizeMatchmatrix*sizeMatchmatrix,1)) # Scorematrix fuer Wahrheit
95 for idx in range(1,sizeMatchmatrix):
96 trueWeightMatch[(sizeMatchmatrix+1)*idx] = numChar[idx-1]
97
98 return SpliceAlign, trueWeightMatch