+ fixed a bug in the C++ interface
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # This file contains the
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14
15 from numpy.matlib import mat,zeros,ones,inf
16 from numpy.linalg import norm
17
18 import QPalmaDP
19
20 from SIQP_CPX import SIQPSolver
21
22 from paths_load_data import *
23 from paths_load_data_pickle import *
24
25 from computeSpliceWeights import *
26 from set_param_palma import *
27 from computeSpliceAlign import *
28 from penalty_lookup_new import *
29 from compute_donacc import *
30 from TrainingParam import Param
31 from export_param import *
32
33 import Configuration
34
35 class QPalma:
36 """
37 A training method for the QPalma project
38 """
39
40 def __init__(self):
41 self.ARGS = Param()
42
43 self.logfh = open('qpalma.log','w+')
44
45 gen_file= '%s/genome.config' % self.ARGS.basedir
46
47 cmd = ['']*4
48 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
49 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
50 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
51 cmd[3] = 'save genome_info.mat genome_info'
52 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
53
54 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
55 out,err = obj.communicate()
56 assert err == '', 'An error occured!\n%s'%err
57
58 ginfo = scipy.io.loadmat('genome_info.mat')
59 self.genome_info = ginfo['genome_info']
60
61 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
62
63 self.C=1.0
64
65 self.numDonSuppPoints = 30
66 self.numAccSuppPoints = 30
67 self.numLengthSuppPoints = 30
68 self.sizeMMatrix = 36
69 self.numQualSuppPoints = 16*0
70
71 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints + self.numLengthSuppPoints\
72 + self.sizeMMatrix + self.numQualSuppPoints
73
74 self.plog('Initializing problem...\n')
75
76
77 def plog(self,string):
78 self.logfh.write(string)
79
80
81 def run(self):
82 # Load the whole dataset
83 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
84 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
85
86 # number of training instances
87 N = len(Sequences)
88 self.numExamples = N
89 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
90 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
91 self.plog('Number of training examples: %d\n'% N)
92
93 iteration_steps = 200 ; #upper bound on iteration steps
94
95 remove_duplicate_scores = False
96 print_matrix = False
97 anzpath = 2
98
99 # Initialize parameter vector
100 # param = numpy.matlib.rand(126,1)
101 param = Configuration.fixedParam
102
103 # Set the parameters such as limits penalties for the Plifs
104 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
105
106 # delete splicesite-score-information
107 if not self.ARGS.train_with_splicesitescoreinformation:
108 for i in range(len(Acceptors)):
109 if Acceptors[i] > -20:
110 Acceptors[i] = 1
111 if Donors[i] >-20:
112 Donors[i] = 1
113
114 # Initialize solver
115 if not __debug__:
116 solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
117
118 # stores the number of alignments done for each example (best path, second-best path etc.)
119 num_path = [anzpath]*N
120 # stores the gap for each example
121 gap = [0.0]*N
122
123 qualityMatrix = zeros((self.numQualSuppPoints,1))
124
125 #############################################################################################
126 # Training
127 #############################################################################################
128 self.plog('Starting training...\n')
129
130 iteration_nr = 1
131
132 while True:
133 if iteration_nr == iteration_steps:
134 break
135
136 for exampleIdx in range(self.numExamples):
137 if (exampleIdx%10) == 0:
138 print 'Current example nr %d' % exampleIdx
139
140 dna = Sequences[exampleIdx]
141 est = Ests[exampleIdx]
142
143 exons = Exons[exampleIdx]
144 # NoiseMatrix = Noises[exampleIdx]
145 don_supp = Donors[exampleIdx]
146 acc_supp = Acceptors[exampleIdx]
147
148 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
149 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
150
151 # Calculate the weights
152 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
153 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
154
155 currentPhi = zeros((self.numFeatures,1))
156 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
157 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
158 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
159 currentPhi[90:126] = mmatrix[:]
160 currentPhi[126:] = qualityMatrix[:]
161
162 # Calculate w'phi(x,y) the total score of the alignment
163 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
164
165 # The allWeights vector is supposed to store the weight parameter
166 # of the true alignment as well as the weight parameters of the
167 # num_path[exampleIdx] other alignments
168 allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
169 allWeights[:,0] = trueWeight[:,0]
170
171 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
172 AlignmentScores[0] = trueAlignmentScore
173
174 ################## Calculate wrong alignment(s) ######################
175
176 # Compute donor, acceptor with penalty_lookup_new
177 # returns two double lists
178 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
179
180 #myalign wants the acceptor site on the g of the ag
181 acceptor = acceptor[1:]
182 acceptor.append(-inf)
183
184 dna = str(dna)
185 est = str(est)
186 dna_len = len(dna)
187 est_len = len(est)
188 ps = h.convert2SWIG()
189
190 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
191 mm_len = 36
192
193 d_len = len(donor)
194 donor = QPalmaDP.createDoubleArrayFromList(donor)
195 a_len = len(acceptor)
196 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
197
198 currentAlignment = QPalmaDP.Alignment()
199 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
200 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
201
202 #print 'PYTHON: Calling myalign...'
203 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
204 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
205 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
206 acceptor, a_len, remove_duplicate_scores, print_matrix)
207 #print 'PYTHON: After myalign call...'
208
209 newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
210 newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
211 newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
212 newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
213
214 currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
215 del currentAlignment
216
217 spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
218 weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
219
220 #print 'spliceAlign'
221 #for i in range(dna_len*num_path[exampleIdx]):
222 # spliceAlign[i] = newSpliceAlign[i]
223 # print '%f' % (spliceAlign[i])
224
225 #print 'weightMatch'
226 #for i in range(mm_len*num_path[exampleIdx]):
227 # weightMatch[i] = newWeightMatch[i]
228 # print '%f' % (weightMatch[i])
229
230 for i in range(num_path[exampleIdx]):
231 AlignmentScores[i+1] = newAlignmentScores[i]
232
233 #print AlignmentScores
234
235 spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
236 weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
237 # Calculate weights of the respective alignments Note that we are
238 # calculating n-best alignments without any hamming loss, so we
239 # have to keep track which of the n-best alignments correspond to
240 # the true one in order not to incorporate a true alignment in the
241 # constraints. To keep track of the true and false alignments we
242 # define an array true_map with a boolean indicating the
243 # equivalence to the true alignment for each decoded alignment.
244 true_map = [0]*(num_path[exampleIdx]+1)
245 true_map[0] = 1
246 path_loss = [0]*(num_path[exampleIdx]+1)
247
248 for pathNr in range(num_path[exampleIdx]):
249 #dna_numbers = dnaest{1,pathNr}
250 #est_numbers = dnaest{2,pathNr}
251
252 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
253
254 # sum up positionwise loss between alignments
255 for alignPosIdx in range(len(spliceAlign[pathNr,:])):
256 if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
257 path_loss[pathNr+1] += 1
258
259 # Gewichte in restliche Zeilen der Matrix speichern
260 wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
261 allWeights[:,pathNr+1] = wp
262
263 hpen = mat(h.penalties).reshape(len(h.penalties),1)
264 dpen = mat(d.penalties).reshape(len(d.penalties),1)
265 apen = mat(a.penalties).reshape(len(a.penalties),1)
266
267 features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
268 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
269
270 # Check wether scalar product + loss equals viterbi score
271 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
272 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
273 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
274
275 # # if the pathNr-best alignment is very close to the true alignment consider it as true
276 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
277 true_map[pathNr+1] = 1
278
279 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
280
281 # this means that all n-best paths are to close to each other
282 # we have to extend the n-best search to a (n+1)-best
283 if len([elem for elem in true_map if elem == 1]) == len(true_map):
284 num_path[exampleIdx] = num_path[exampleIdx]+1
285
286 # Choose true and first false alignment for extending A
287 firstFalseIdx = -1
288 for map_idx,elem in enumerate(true_map):
289 if elem == 0:
290 firstFalseIdx = map_idx
291 break
292
293 # if there is at least one useful false alignment add the
294 # corresponding constraints to the optimization problem
295 if firstFalseIdx != -1:
296 trueWeights = allWeights[:,0]
297 firstFalseWeights = allWeights[:,firstFalseIdx]
298
299 # LMM.py code:
300 deltas = firstFalseWeights - trueWeights
301 if not __debug__:
302 const_added = solver.addConstraint(deltas, exampleIdx)
303 objValue,w,self.slacks = solver.solve()
304
305 sum_xis = 0
306 for elem in self.slacks:
307 sum_xis += elem
308
309 for i in range(len(param)):
310 param[i] = w[i]
311
312 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
313
314 #
315 # end of one example processing
316 #
317 if exampleIdx == 10:
318 break
319
320 break
321
322 #
323 # end of one iteration through all examples
324 #
325 iteration_nr += 1
326
327 #
328 # end of optimization
329 #
330 export_param('elegans.param',h,d,a,mmatrix)
331 self.logfh.close()
332 print 'Training completed'
333
334 if __name__ == '__main__':
335 qpalma = QPalma()
336 qpalma.run()