+ parameter transfer to python layer works
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20
21 from SIQP_CPX import SIQPSolver
22
23 from paths_load_data import *
24 from paths_load_data_pickle import *
25
26 from computeSpliceWeights import *
27 from set_param_palma import *
28 from computeSpliceAlign import *
29 from computeSpliceAlignWithQuality import *
30 from penalty_lookup_new import *
31 from compute_donacc import *
32 from TrainingParam import Param
33 from export_param import *
34
35 import Configuration
36
37 from Plif import Plf
38
39
40 def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
41
42 min_intron_len=20
43 max_intron_len=1000
44 min_svm_score=-5
45 max_svm_score=5
46
47 qualityPlifs = [None]*numPlifs
48
49 for idx in range(numPlifs):
50
51 curPlif = Plf()
52 curPlif.limits = linspace(min_svm_score,max_svm_score,numSuppPoints)
53 curPlif.penalties = [0]*numSuppPoints
54 curPlif.transform = ''
55 curPlif.name = ''
56 curPlif.max_len = 100
57 curPlif.min_len = -100
58 curPlif.id = 1
59 curPlif.use_svm = 0
60 curPlif.next_id = 0
61
62 if idx == 0:
63 curPlif.penalties[0] = 11
64 curPlif.penalties[1] = 22
65 curPlif.penalties[2] = 33
66
67 if idx == 1:
68 curPlif.penalties[0] = 99
69 curPlif.penalties[1] = 100
70 curPlif.penalties[2] = 101
71
72 curPlif = curPlif.convert2SWIG()
73 qualityPlifs[idx] = curPlif
74
75 return qualityPlifs
76
77 class QPalma:
78 """
79 A training method for the QPalma project
80 """
81
82 def __init__(self):
83 self.ARGS = Param()
84
85 self.logfh = open('qpalma.log','w+')
86 gen_file= '%s/genome.config' % self.ARGS.basedir
87
88 ginfo_filename = 'genome_info.pickle'
89 self.genome_info = fetch_genome_info(ginfo_filename)
90
91 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
92
93 self.C=1.0
94
95 # 'normal' means work like Palma
96 # 'using_quality_scores' means work like Palma plus using sequencing
97 # quality scores
98 self.mode = 'normal'
99 #self.mode = 'using_quality_scores'
100
101 # Here we specify the total number of parameters.
102 # When using quality scores our scoring function is defined as
103 #
104 # f: S x R x S -> R
105 #
106 # as opposed to a usage without quality scores when we only have
107 #
108 # f: S x S -> R
109 #
110 self.numDonSuppPoints = 30
111 self.numAccSuppPoints = 30
112 self.numLengthSuppPoints = 30
113 if self.mode == 'normal':
114 self.sizeMMatrix = 36
115 elif self.mode == 'using_quality_scores':
116 self.sizeMMatrix = 728
117 else:
118 assert False, 'Wrong operation mode specified'
119
120 # this number defines the number of support points for one tuple (a,b)
121 # where 'a' comes with a quality score
122 self.numQualSuppPoints = 10
123 self.numQualSuppPoints = 0
124
125 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints\
126 + self.numLengthSuppPoints + self.sizeMMatrix
127
128 self.plog('Initializing problem...\n')
129
130
131 def plog(self,string):
132 self.logfh.write(string)
133
134
135 def run(self):
136 # Load the whole dataset
137 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
138 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
139
140 # number of training instances
141 N = len(Sequences)
142 self.numExamples = N
143 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
144 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
145 self.plog('Number of training examples: %d\n'% N)
146
147 #iteration_steps = 200 ; #upper bound on iteration steps
148 iteration_steps = 2 ; #upper bound on iteration steps
149
150 remove_duplicate_scores = False
151 print_matrix = False
152 anzpath = 2
153
154 # Initialize parameter vector
155 # param = numpy.matlib.rand(126,1)
156 param = Configuration.fixedParam
157
158 # Set the parameters such as limits penalties for the Plifs
159 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
160
161 # delete splicesite-score-information
162 if not self.ARGS.train_with_splicesitescoreinformation:
163 for i in range(len(Acceptors)):
164 if Acceptors[i] > -20:
165 Acceptors[i] = 1
166 if Donors[i] >-20:
167 Donors[i] = 1
168
169 # Initialize solver
170 if not __debug__:
171 solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
172
173 # stores the number of alignments done for each example (best path, second-best path etc.)
174 num_path = [anzpath]*N
175 # stores the gap for each example
176 gap = [0.0]*N
177
178 qualityMatrix = zeros((self.numQualSuppPoints,1))
179
180 numPlifs = 24
181 numSuppPoints = 30
182
183 qualityPlifs = initializeQualityScoringFunctions(numPlifs,numSuppPoints)
184
185 #############################################################################################
186 # Training
187 #############################################################################################
188 self.plog('Starting training...\n')
189
190 iteration_nr = 0
191
192 while True:
193 if iteration_nr == iteration_steps:
194 break
195
196 for exampleIdx in range(self.numExamples):
197 if (exampleIdx%10) == 0:
198 print 'Current example nr %d' % exampleIdx
199
200 dna = Sequences[exampleIdx]
201 est = Ests[exampleIdx]
202
203 quality = [0.0]*len(est)
204
205 exons = Exons[exampleIdx]
206 # NoiseMatrix = Noises[exampleIdx]
207 don_supp = Donors[exampleIdx]
208 acc_supp = Acceptors[exampleIdx]
209
210 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
211 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
212 trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
213
214 # Calculate the weights
215 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
216 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
217
218 currentPhi = zeros((self.numFeatures,1))
219 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
220 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
221 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
222 currentPhi[90:126] = mmatrix[:]
223 currentPhi[126:] = qualityMatrix[:]
224
225 # Calculate w'phi(x,y) the total score of the alignment
226 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
227
228 # The allWeights vector is supposed to store the weight parameter
229 # of the true alignment as well as the weight parameters of the
230 # num_path[exampleIdx] other alignments
231 allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
232 allWeights[:,0] = trueWeight[:,0]
233
234 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
235 AlignmentScores[0] = trueAlignmentScore
236
237 ################## Calculate wrong alignment(s) ######################
238
239 # Compute donor, acceptor with penalty_lookup_new
240 # returns two double lists
241 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
242
243 #myalign wants the acceptor site on the g of the ag
244 acceptor = acceptor[1:]
245 acceptor.append(-inf)
246
247 dna = str(dna)
248 est = str(est)
249 dna_len = len(dna)
250 est_len = len(est)
251 ps = h.convert2SWIG()
252
253 prb = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
254 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
255
256 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
257 mm_len = 36
258
259 d_len = len(donor)
260 donor = QPalmaDP.createDoubleArrayFromList(donor)
261 a_len = len(acceptor)
262 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
263
264 currentAlignment = QPalmaDP.Alignment()
265 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
266 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
267
268 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(qualityPlifs)
269
270 #print 'PYTHON: Calling myalign...'
271 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
272 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
273 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
274 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores, print_matrix)
275 #print 'PYTHON: After myalign call...'
276
277 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
278 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
279 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
280 c_AlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
281
282 emptyPlif = Plf(30)
283 emptyPlif = emptyPlif.convert2SWIG()
284 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(24*num_path[exampleIdx]))
285
286 print "Trying to fetch results from c-code"
287
288 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
289 c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
290
291 print "Fetched results from c-code"
292
293 #print '%d %d %d' % (dna_len*num_path[exampleIdx],est_len*num_path[exampleIdx],mm_len*num_path[exampleIdx])
294
295 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
296 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
297 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
298 newQualityPlifs = [None]*num_path[exampleIdx]*24
299
300 print 'newSpliceAlign'
301 for i in range(dna_len*num_path[exampleIdx]):
302 newSpliceAlign[i] = c_SpliceAlign[i]
303 # print '%f' % (spliceAlign[i])
304
305 print 'newEstAlign'
306 for i in range(est_len*num_path[exampleIdx]):
307 newEstAlign[i] = c_EstAlign[i]
308 # print '%f' % (spliceAlign[i])
309
310 print 'weightMatch'
311 for i in range(mm_len*num_path[exampleIdx]):
312 newWeightMatch[i] = c_WeightMatch[i]
313 # print '%f' % (weightMatch[i])
314
315 print 'AlignmentScores'
316 for i in range(num_path[exampleIdx]):
317 AlignmentScores[i+1] = c_AlignmentScores[i]
318
319 print 'newQualityPlifs'
320 for i in range(num_path[exampleIdx]*24):
321 newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
322
323 print "Calling destructors"
324
325 del c_SpliceAlign
326 del c_EstAlign
327 del c_WeightMatch
328 del c_AlignmentScores
329 del c_qualityPlifs
330 del currentAlignment
331
332 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
333 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
334 # Calculate weights of the respective alignments Note that we are
335 # calculating n-best alignments without hamming loss, so we
336 # have to keep track which of the n-best alignments correspond to
337 # the true one in order not to incorporate a true alignment in the
338 # constraints. To keep track of the true and false alignments we
339 # define an array true_map with a boolean indicating the
340 # equivalence to the true alignment for each decoded alignment.
341 true_map = [0]*(num_path[exampleIdx]+1)
342 true_map[0] = 1
343 path_loss = [0]*(num_path[exampleIdx]+1)
344
345 for pathNr in range(num_path[exampleIdx]):
346 #dna_numbers = dnaest{1,pathNr}
347 #est_numbers = dnaest{2,pathNr}
348
349 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
350
351 testPlif = newQualityPlifs[24*pathNr]
352 print "testPlif.penalties"
353 print type(testPlif)
354 for tidx in range(testPlif.len):
355 #elem = testPlif.penalties[tidx]
356 elem = QPalmaDP.doubleFArray_getitem(testPlif.penalties, tidx)
357 print '%f ' % elem,
358 print
359
360 # sum up positionwise loss between alignments
361 for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
362 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
363 path_loss[pathNr+1] += 1
364
365 # Gewichte in restliche Zeilen der Matrix speichern
366 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, qualityMatrix ])
367 allWeights[:,pathNr+1] = wp
368
369 hpen = mat(h.penalties).reshape(len(h.penalties),1)
370 dpen = mat(d.penalties).reshape(len(d.penalties),1)
371 apen = mat(a.penalties).reshape(len(a.penalties),1)
372
373 features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
374 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
375
376 # Check wether scalar product + loss equals viterbi score
377 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
378 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
379 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
380
381 # # if the pathNr-best alignment is very close to the true alignment consider it as true
382 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
383 true_map[pathNr+1] = 1
384
385 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
386
387 # this means that all n-best paths are to close to each other
388 # we have to extend the n-best search to a (n+1)-best
389 if len([elem for elem in true_map if elem == 1]) == len(true_map):
390 num_path[exampleIdx] = num_path[exampleIdx]+1
391
392 # Choose true and first false alignment for extending A
393 firstFalseIdx = -1
394 for map_idx,elem in enumerate(true_map):
395 if elem == 0:
396 firstFalseIdx = map_idx
397 break
398
399 # if there is at least one useful false alignment add the
400 # corresponding constraints to the optimization problem
401 if firstFalseIdx != -1:
402 trueWeights = allWeights[:,0]
403 firstFalseWeights = allWeights[:,firstFalseIdx]
404
405 # LMM.py code:
406 deltas = firstFalseWeights - trueWeights
407 if not __debug__:
408 const_added = solver.addConstraint(deltas, exampleIdx)
409 objValue,w,self.slacks = solver.solve()
410
411 sum_xis = 0
412 for elem in self.slacks:
413 sum_xis += elem
414
415 for i in range(len(param)):
416 param[i] = w[i]
417
418 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
419
420 #
421 # end of one example processing
422 #
423 #if exampleIdx == 100:
424 # break
425
426 #break
427
428 #
429 # end of one iteration through all examples
430 #
431 iteration_nr += 1
432
433 #
434 # end of optimization
435 #
436 export_param('elegans.param',h,d,a,mmatrix)
437 self.logfh.close()
438 print 'Training completed'
439
440 def fetch_genome_info(ginfo_filename):
441 if not os.path.exists(ginfo_filename):
442 cmd = ['']*4
443 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
444 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
445 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
446 cmd[3] = 'save genome_info.mat genome_info'
447 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
448
449 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
450 out,err = obj.communicate()
451 assert err == '', 'An error occured!\n%s'%err
452
453 ginfo = scipy.io.loadmat('genome_info.mat')
454 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
455 return ginfo['genome_info']
456
457 else:
458 return cPickle.load(open(ginfo_filename))
459
460 if __name__ == '__main__':
461 qpalma = QPalma()
462 qpalma.run()