+ extended pipeline code
[qpalma.git] / qpalma / computeSpliceWeights.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy import inf
5 from numpy.matlib import zeros
6 import pdb
7
8 # 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30
9 # piecewise linear function:
10 # take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
11 # these are our values x
12 #
13 # | y_1 if x <= x_1
14 # |
15 # | x_i+1 - x x - x_i
16 # f(x) = | y_i * ----------- + y_i+1 * ----------- if x_i <= x <= x_i+1
17 # | x_i+1 - x_i x_i+1 - x_i
18 # |
19 # | y_30 if x_n <= x
20 #
21 # y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
22
23 def calculateWeights(plf, scores):
24 currentWeight = zeros((len(plf.limits),1))
25
26 for k in range(len(scores)):
27 value = scores[k]
28 Lower = len([elem for elem in plf.limits if elem <= value])
29
30 if Lower == 0:
31 currentWeight[0] += 1
32 elif Lower == len(plf.limits):
33 currentWeight[-1] += 1
34 else:
35 # because we count from 0 in python
36 Lower -= 1
37 Upper = Lower+1 ; # x-werte bleiben fest
38 #print value,Lower,Upper
39 weightup = 1.0*(value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower])
40 weightlow = 1.0*(plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower])
41 currentWeight[Upper] = currentWeight[Upper] + weightup
42 currentWeight[Lower] = currentWeight[Lower] + weightlow
43
44 #print plf.limits[Lower],plf.limits[Upper]
45 #print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
46
47 return currentWeight
48
49 def calculatePlif(plf, scores):
50 plifed_scores=[] ;
51
52 for k in range(len(scores)):
53 value = scores[k]
54
55 Lower = 0
56 for elem in plf.limits:
57 if elem<=value:
58 Lower+=1
59
60 if Lower == 0:
61 plifed_scores.append(plf.penalties[0])
62 elif Lower == len(plf.limits):
63 plifed_scores.append(plf.penalties[-1])
64 else:
65 Lower -= 1
66 Upper = Lower+1 ;
67 weightup = 1.0*(value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower])
68 weightlow = 1.0*(plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower])
69
70 plifed_scores.append( plf.penalties[Upper]*weightup + plf.penalties[Lower]*weightlow )
71
72 return plifed_scores
73
74 def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
75 ####################################################################################
76 # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
77 ####################################################################################
78
79 # Picke die Positionen raus, an denen eine Donorstelle ist
80 try:
81 DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
82 assert not ( -inf in DonorScores )
83 except:
84 print 'Error'
85 pdb.set_trace()
86
87 #print 'donor'
88 weightDon = calculateWeights(d,DonorScores)
89
90 ####################################################################################
91 # 2. Acceptor: In acc_supp stehen Werte der SVM., in SpliceAlign die Einsen
92 ####################################################################################
93
94 #Den Vektor Acceptorstellen durchgehen und die Gewichtsvektoren belasten:
95 try:
96 AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos] == 2]
97 assert not ( -inf in AcceptorScores )
98 except:
99 print 'Error'
100 pdb.set_trace()
101
102 #print 'acceptor'
103 weightAcc = calculateWeights(a,AcceptorScores)
104
105 ####################################################################################
106 # 3. Intron length: SpliceAlign: Gaps zaehlen und auf Gapgewichte addieren
107 ####################################################################################
108
109 intron_starts = []
110 intron_ends = []
111 for pos,elem in enumerate(SpliceAlign):
112 if elem == 1:
113 intron_starts.append(pos)
114
115 if elem == 2:
116 intron_ends.append(pos)
117
118 assert len(intron_starts) == len(intron_ends)
119
120 for i in range(len(intron_starts)):
121 assert intron_starts[i] < intron_ends[i]
122
123 values = [0.0]*len(intron_starts)
124 for pos in range(len(intron_starts)):
125 values[pos] = intron_ends[pos] - intron_starts[pos] + 1
126
127 #print 'introns found: ' + str(values)
128 weightIntron = calculateWeights(h,values)
129
130 return weightDon, weightAcc, weightIntron