+ found bug in SWIG interface dna_len vs. dna_len-1 because of '\0' in matlab
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # This file containts the
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14
15 from numpy.matlib import mat,zeros,ones,inf
16
17 import QPalmaDP
18
19 from SIQP_CPX import SIQPSolver
20
21 from paths_load_data import *
22 from computeSpliceWeights import *
23 from set_param_palma import *
24 from computeSpliceAlign import *
25 from penalty_lookup_new import *
26 from compute_donacc import *
27 from TrainingParam import Param
28
29 import Configuration
30
31 class QPalma:
32 """
33 A training method for the QPalma project
34 """
35
36 def __init__(self):
37 self.ARGS = Param()
38
39 self.logfh = open('qpalma.log','w+')
40
41 gen_file= '%s/genome.config' % self.ARGS.basedir
42
43 cmd = ['']*4
44 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
45 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
46 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
47 cmd[3] = 'save genome_info.mat genome_info'
48 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
49
50 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
51 out,err = obj.communicate()
52 assert err == '', 'An error occured!\n%s'%err
53
54 ginfo = scipy.io.loadmat('genome_info.mat')
55 self.genome_info = ginfo['genome_info']
56
57 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
58
59 self.C=1.0
60
61 self.numDonSuppPoints = 30
62 self.numAccSuppPoints = 30
63 self.numLengthSuppPoints = 30
64 self.sizeMMatrix = 36
65 self.numQualSuppPoints = 16*0
66
67 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints + self.numLengthSuppPoints\
68 + self.sizeMMatrix + self.numQualSuppPoints
69
70 self.plog('Initializing problem...\n')
71
72
73 def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix):
74 currentW = zeros((self.numFeatures,1))
75
76 donS = self.numDonSuppPoints
77 accS = self.numAccSuppPoints
78 lenS = self.numLengthSuppPoints
79 matS = self.sizeMMatrix
80 qualS = self.numQualSuppPoints
81
82 currentW[0:donS,0] = trueWeightDon[:,0]
83 currentW[donS:donS+accS,0] = trueWeightAcc[:,0]
84 currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0]
85 currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0]
86 currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0]
87
88 return currentW
89
90
91 def plog(self,string):
92 self.logfh.write(string)
93
94
95 def run(self):
96
97 # Load the whole dataset
98 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
99
100 # number of training instances
101 N = len(Sequences)
102 self.numExamples = N
103 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
104 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
105
106 self.plog('Number of training examples: %d\n'% N)
107
108 iteration_steps = 200 ; #upper bound on iteration steps
109
110 remove_duplicate_scores = False
111 print_matrix = False
112 anzpath = 2
113
114 # Initialize parameter vector
115 # param = numpy.matlib.rand(126,1)
116 param = Configuration.fixedParam
117
118 # Set the parameters such as limits penalties for the Plifs
119 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
120
121 # delete splicesite-score-information
122 if not self.ARGS.train_with_splicesitescoreinformation:
123 for i in range(len(Acceptors)):
124 if Acceptors[i] > -20:
125 Acceptors[i] = 1
126 if Donors[i] >-20:
127 Donors[i] = 1
128
129 # Initialize solver
130 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
131
132 # stores the number of alignments done for each example (best path, second-best path etc.)
133 num_path = [anzpath]*N
134 # stores the gap for each example
135 gap = [0.0]*N
136
137 qualityMatrix = zeros((self.numQualSuppPoints,1))
138
139 #############################################################################################
140 # Training
141 #############################################################################################
142 self.plog('Starting training...\n')
143
144 iteration_nr = 1
145
146 while True:
147 print 'Iteration step: %d'% iteration_nr
148
149 for exampleIdx in range(self.numExamples):
150 if exampleIdx% 1000 == 0:
151 print 'Current example nr %d' % exampleIdx
152
153 dna = Sequences[exampleIdx]
154 est = Ests[exampleIdx]
155
156 exons = Exons[exampleIdx]
157 # NoiseMatrix = Noises[exampleIdx]
158 don_supp = Donors[exampleIdx]
159 acc_supp = Acceptors[exampleIdx]
160
161 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
162 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
163
164 # Calculate the weights
165 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
166
167 # Reshape currentW param
168 currentW = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix)
169
170 #currentPhi = createFeatureVector()
171
172 currentPhi = zeros((self.numFeatures,1))
173 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
174 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
175 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
176 currentPhi[90:126] = mmatrix[:]
177 currentPhi[126:] = qualityMatrix[:]
178
179 # Calculate w'phi(x,y) the total score of the alignment
180 trueAlignmentScore = currentW.T * currentPhi
181
182
183 ################## Calculate wrong alignment(s) ######################
184
185
186 # Compute donor, acceptor with penalty_lookup_new
187 # returns two double lists
188 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
189
190 #myalign wants the acceptor site on the g of the ag
191 acceptor = acceptor[1:]
192 acceptor.append(-inf)
193
194 dna = str(dna)
195 est = str(est)
196 dna_len = len(dna)
197 est_len = len(est)
198 ps = h.convert2SWIG()
199
200 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
201 mm_len = 36
202
203 d_len = len(donor)
204 donor = QPalmaDP.createDoubleArrayFromList(donor)
205 a_len = len(acceptor)
206 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
207
208 currentAlignment = QPalmaDP.Alignment()
209 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
210 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
211
212 print 'PYTHON: Calling myalign...'
213 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
214 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
215 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
216 acceptor, a_len, remove_duplicate_scores, print_matrix)
217 print 'PYTHON: After myalign call...'
218
219 newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
220 newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
221 newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
222 newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
223
224 currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
225
226 spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
227 weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
228
229 for i in range(dna_len*num_path[exampleIdx]):
230 spliceAlign[i] = newSpliceAlign[i]
231
232 for i in range(mm_len*num_path[exampleIdx]):
233 weightMatch[i] = newWeightMatch[i]
234
235 spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
236 weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
237
238 # Calculate weights of the respective alignments Note that we are
239 # calculating n-best alignments without any hamming loss, so we
240 # have to keep track which of the n-best alignments correspond to
241 # the true one in order not to incorporate a true alignment in the
242 # constraints. To keep track of the true and false alignments we
243 # define an array true_map with a boolean indicating the
244 # equivalence to the true alignment for each decoded alignment.
245 true_map = zeros((1,1+num_path[exampleIdx]))
246 true_map[0] = 1
247 path_loss = [0]*num_path[exampleIdx]
248
249 AlignmentScores = [0.0]*num_path[exampleIdx]
250 Weights = zeros((num_path[exampleIdx],self.numFeatures))
251
252 for pathNr in range(num_path[exampleIdx]):
253 #dna_numbers = dnaest{1,pathNr}
254 #est_numbers = dnaest{2,pathNr}
255
256 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
257
258 # sum up positionwise loss between alignments
259 for alignPosIdx in range(len(spliceAlign[pathNr,:])):
260 if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
261 path_loss[pathNr] += 1
262
263 # Gewichte in restliche Zeilen der Matrix speichern
264 # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
265 wp = self.createParameterVector(weightIntron, weightDon, weightAcc, weightMatch[pathNr,:], qualityMatrix)
266 Weights[pathNr,:] = wp.T
267
268 hpen = mat(h.penalties).reshape(len(h.penalties),1)
269 dpen = mat(d.penalties).reshape(len(d.penalties),1)
270 apen = mat(a.penalties).reshape(len(a.penalties),1)
271
272 features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
273
274 AlignmentScores[pathNr] = (Weights[pathNr,:] * features)[0,0]
275
276 # Check wether scalar product + loss equals viterbi score
277 # assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr]) < 1e-6, 'Scalar prod + loss is not equal Viterbi score'
278
279 # if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
280 # true_map(pathNr+1)=1
281 #
282 # # the true label sequence should not have a larger score than the
283 # # maximal one WHYYYYY?
284 # if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
285 # AlignmentScores
286 # warning('true score > max score\n') ;
287 # keyboard
288 # end ;
289
290 # if all(true_map==1)
291 # num_path[exampleIdx]=num_path[exampleIdx]+1 ; %next iteration step: one alignment more
292 # end ;
293
294 # # Choose true and first false alignment for extending A
295 # Weights = Weights([1 min(find(true_map==0))], :) ;
296
297 # # if there is a false alignment
298 # if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps,
299 # e=zeros(1,N) ;
300 # e[exampleIdx] = 1 ;
301 # A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
302 # b=[b; -1] ;
303 # gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ;
304 # else
305 # gap[exampleIdx] = 0 ;
306 # end ;
307
308 # LMM.py code:
309 # deltas = Weights(2,:)-Weights(1,:) zeros(1,126) -e
310 # const_added = solver.addConstraint(deltas, idx)
311 #
312 # objValue,w,self.slacks = solver.solve()
313
314 # sum_xis = 0
315 # for elem in self.slacks:
316 # sum_xis += elem
317
318 if exampleIdx== 10:
319 break
320
321 iteration_nr += 1
322 break
323
324 self.logfh.close()
325 print 'Training completed'
326
327 if __name__ == '__main__':
328 qpalma = QPalma()
329 qpalma.run()