+ added feature calculation for the labels
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20
21 from SIQP_CPX import SIQPSolver
22
23 from paths_load_data import *
24 from paths_load_data_pickle import *
25
26 from computeSpliceWeights import *
27 from set_param_palma import *
28 from computeSpliceAlign import *
29 from computeSpliceAlignWithQuality import *
30 from penalty_lookup_new import *
31 from compute_donacc import *
32 from TrainingParam import Param
33 from export_param import *
34
35 import Configuration
36
37 from Plif import Plf
38
39 def getQualityFeatureCounts(qualityPlifs):
40 weightQuality = qualityPlifs[0].penalties
41 for currentPlif in qualityPlifs[1:]:
42 weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
43
44 return weightQuality
45
46
47 def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
48
49 min_intron_len=20
50 max_intron_len=1000
51 min_svm_score=-5
52 max_svm_score=5
53
54 qualityPlifs = [None]*numPlifs
55
56 for idx in range(numPlifs):
57
58 curPlif = Plf()
59 curPlif.limits = linspace(min_svm_score,max_svm_score,numSuppPoints)
60 curPlif.penalties = [0]*numSuppPoints
61 curPlif.transform = ''
62 curPlif.name = ''
63 curPlif.max_len = 100
64 curPlif.min_len = -100
65 curPlif.id = 1
66 curPlif.use_svm = 0
67 curPlif.next_id = 0
68
69 if idx == 0:
70 curPlif.penalties[0] = 11
71 curPlif.penalties[1] = 22
72 curPlif.penalties[2] = 33
73
74 if idx == 1:
75 curPlif.penalties[0] = 99
76 curPlif.penalties[1] = 100
77 curPlif.penalties[2] = 101
78
79 curPlif = curPlif.convert2SWIG()
80 qualityPlifs[idx] = curPlif
81
82 return qualityPlifs
83
84 class QPalma:
85 """
86 A training method for the QPalma project
87 """
88
89 def __init__(self):
90 self.ARGS = Param()
91
92 self.logfh = open('qpalma.log','w+')
93 gen_file= '%s/genome.config' % self.ARGS.basedir
94
95 ginfo_filename = 'genome_info.pickle'
96 self.genome_info = fetch_genome_info(ginfo_filename)
97
98 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
99
100
101 def plog(self,string):
102 self.logfh.write(string)
103
104
105 def run(self):
106 # Load the whole dataset
107 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
108 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
109
110 # number of training instances
111 N = len(Sequences)
112 self.numExamples = N
113 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
114 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
115 self.plog('Number of training examples: %d\n'% N)
116
117 #iteration_steps = 200 ; #upper bound on iteration steps
118 iteration_steps = 2 ; #upper bound on iteration steps
119
120 remove_duplicate_scores = False
121 print_matrix = False
122 anzpath = 2
123
124 # Initialize parameter vector
125 # param = numpy.matlib.rand(126,1)
126 param = Configuration.fixedParam
127
128 # Set the parameters such as limits penalties for the Plifs
129 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
130
131 # delete splicesite-score-information
132 if not self.ARGS.train_with_splicesitescoreinformation:
133 for i in range(len(Acceptors)):
134 if Acceptors[i] > -20:
135 Acceptors[i] = 1
136 if Donors[i] >-20:
137 Donors[i] = 1
138
139 # Initialize solver
140 if not __debug__:
141 self.plog('Initializing problem...\n')
142 solver = SIQPSolver(Configuration.numFeatures,Configuration.numExamples,Configuration.C,self.logfh)
143
144 # stores the number of alignments done for each example (best path, second-best path etc.)
145 num_path = [anzpath]*N
146 # stores the gap for each example
147 gap = [0.0]*N
148
149 #############################################################################################
150 # Training
151 #############################################################################################
152 self.plog('Starting training...\n')
153
154 donSP = Configuration.numDonSuppPoints
155 accSP = Configuration.numAccSuppPoints
156 lengthSP = Configuration.numLengthSuppPoints
157 mmatrixSP = Configuration.sizeMMatrix
158 totalQualSP = Configuration.totalQualSuppPoints
159
160 currentPhi = zeros((Configuration.numFeatures,1))
161 totalQualityPenalties = zeros((totalQualSP,1))
162
163 #qualityMatrix = zeros((Configuration.numPlifSuppPoints*Configuration.numQualPlifs,1))
164
165 iteration_nr = 0
166
167 while True:
168 if iteration_nr == iteration_steps:
169 break
170
171 for exampleIdx in range(self.numExamples):
172 if (exampleIdx%10) == 0:
173 print 'Current example nr %d' % exampleIdx
174
175 dna = Sequences[exampleIdx]
176 est = Ests[exampleIdx]
177
178 quality = [0.0]*len(est)
179
180 exons = Exons[exampleIdx]
181 # NoiseMatrix = Noises[exampleIdx]
182 don_supp = Donors[exampleIdx]
183 acc_supp = Acceptors[exampleIdx]
184
185 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
186 # trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
187 trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
188
189 # Calculate the weights
190 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
191
192 trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
193 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
194
195 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
196 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
197 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
198 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
199 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
200
201 # Calculate w'phi(x,y) the total score of the alignment
202 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
203
204 # The allWeights vector is supposed to store the weight parameter
205 # of the true alignment as well as the weight parameters of the
206 # num_path[exampleIdx] other alignments
207 allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
208 allWeights[:,0] = trueWeight[:,0]
209
210 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
211 AlignmentScores[0] = trueAlignmentScore
212
213 ################## Calculate wrong alignment(s) ######################
214
215 # Compute donor, acceptor with penalty_lookup_new
216 # returns two double lists
217 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
218
219 #myalign wants the acceptor site on the g of the ag
220 acceptor = acceptor[1:]
221 acceptor.append(-inf)
222
223 dna = str(dna)
224 est = str(est)
225 dna_len = len(dna)
226 est_len = len(est)
227 ps = h.convert2SWIG()
228
229 prb = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
230 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
231
232 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
233 mm_len = 36
234
235 d_len = len(donor)
236 donor = QPalmaDP.createDoubleArrayFromList(donor)
237 a_len = len(acceptor)
238 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
239
240 currentAlignment = QPalmaDP.Alignment()
241 #qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
242 #currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
243
244 qualityPlifs = initializeQualityScoringFunctions(Configuration.numQualPlifs,Configuration.numQualSuppPoints)
245
246 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(qualityPlifs)
247
248 #print 'PYTHON: Calling myalign...'
249 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
250 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
251 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
252 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores, print_matrix)
253 #print 'PYTHON: After myalign call...'
254
255 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
256 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
257 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
258 c_AlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
259
260 emptyPlif = Plf(30)
261 emptyPlif = emptyPlif.convert2SWIG()
262 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
263
264 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
265 c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
266
267 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
268 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
269 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
270 newQualityPlifs = [None]*num_path[exampleIdx]*Configuration.numQualPlifs
271
272 #print 'newSpliceAlign'
273 for i in range(dna_len*num_path[exampleIdx]):
274 newSpliceAlign[i] = c_SpliceAlign[i]
275 # print '%f' % (spliceAlign[i])
276
277 #print 'newEstAlign'
278 for i in range(est_len*num_path[exampleIdx]):
279 newEstAlign[i] = c_EstAlign[i]
280 # print '%f' % (spliceAlign[i])
281
282 #print 'weightMatch'
283 for i in range(mm_len*num_path[exampleIdx]):
284 newWeightMatch[i] = c_WeightMatch[i]
285 # print '%f' % (weightMatch[i])
286
287 #print 'AlignmentScores'
288 for i in range(num_path[exampleIdx]):
289 AlignmentScores[i+1] = c_AlignmentScores[i]
290
291 #print 'newQualityPlifs'
292 for i in range(num_path[exampleIdx]*Configuration.numQualPlifs):
293 newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
294
295 #print "Calling destructors"
296
297 del c_SpliceAlign
298 del c_EstAlign
299 del c_WeightMatch
300 del c_AlignmentScores
301 del c_qualityPlifs
302 del currentAlignment
303
304 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
305 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
306 # Calculate weights of the respective alignments Note that we are
307 # calculating n-best alignments without hamming loss, so we
308 # have to keep track which of the n-best alignments correspond to
309 # the true one in order not to incorporate a true alignment in the
310 # constraints. To keep track of the true and false alignments we
311 # define an array true_map with a boolean indicating the
312 # equivalence to the true alignment for each decoded alignment.
313 true_map = [0]*(num_path[exampleIdx]+1)
314 true_map[0] = 1
315 path_loss = [0]*(num_path[exampleIdx]+1)
316
317 for pathNr in range(num_path[exampleIdx]):
318
319 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
320
321 decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
322 qidx = 0
323
324 for currentPlif in newQualityPlifs[Configuration.numQualPlifs*pathNr:Configuration.numQualPlifs*(pathNr+1)]:
325 for tidx in range(currentPlif.len):
326 #elem = currentPlif.penalties[tidx]
327 elem = QPalmaDP.doubleFArray_getitem(currentPlif.penalties, tidx)
328 #print '%f ' % elem,
329 print qidx
330 decodedQualityFeatures[qidx] = elem
331 qidx += 1
332 #print
333
334 # sum up positionwise loss between alignments
335 for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
336 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
337 path_loss[pathNr+1] += 1
338
339 # Gewichte in restliche Zeilen der Matrix speichern
340 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
341 allWeights[:,pathNr+1] = wp
342
343 hpen = mat(h.penalties).reshape(len(h.penalties),1)
344 dpen = mat(d.penalties).reshape(len(d.penalties),1)
345 apen = mat(a.penalties).reshape(len(a.penalties),1)
346
347 features = numpy.vstack([hpen , dpen , apen , mmatrix[:], zeros((Configuration.totalQualSuppPoints,1))])
348 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
349
350 # Check wether scalar product + loss equals viterbi score
351 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
352 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
353 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
354
355 # # if the pathNr-best alignment is very close to the true alignment consider it as true
356 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
357 true_map[pathNr+1] = 1
358
359 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
360
361 # this means that all n-best paths are to close to each other
362 # we have to extend the n-best search to a (n+1)-best
363 if len([elem for elem in true_map if elem == 1]) == len(true_map):
364 num_path[exampleIdx] = num_path[exampleIdx]+1
365
366 # Choose true and first false alignment for extending A
367 firstFalseIdx = -1
368 for map_idx,elem in enumerate(true_map):
369 if elem == 0:
370 firstFalseIdx = map_idx
371 break
372
373 # if there is at least one useful false alignment add the
374 # corresponding constraints to the optimization problem
375 if firstFalseIdx != -1:
376 trueWeights = allWeights[:,0]
377 firstFalseWeights = allWeights[:,firstFalseIdx]
378
379 # LMM.py code:
380 deltas = firstFalseWeights - trueWeights
381 if not __debug__:
382 const_added = solver.addConstraint(deltas, exampleIdx)
383 objValue,w,self.slacks = solver.solve()
384
385 sum_xis = 0
386 for elem in self.slacks:
387 sum_xis += elem
388
389 for i in range(len(param)):
390 param[i] = w[i]
391
392 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
393
394 #
395 # end of one example processing
396 #
397 #if exampleIdx == 100:
398 # break
399
400 #break
401
402 #
403 # end of one iteration through all examples
404 #
405 iteration_nr += 1
406
407 #
408 # end of optimization
409 #
410 export_param('elegans.param',h,d,a,mmatrix)
411 self.logfh.close()
412 print 'Training completed'
413
414 def fetch_genome_info(ginfo_filename):
415 if not os.path.exists(ginfo_filename):
416 cmd = ['']*4
417 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
418 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
419 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
420 cmd[3] = 'save genome_info.mat genome_info'
421 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
422
423 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
424 out,err = obj.communicate()
425 assert err == '', 'An error occured!\n%s'%err
426
427 ginfo = scipy.io.loadmat('genome_info.mat')
428 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
429 return ginfo['genome_info']
430
431 else:
432 return cPickle.load(open(ginfo_filename))
433
434 if __name__ == '__main__':
435 qpalma = QPalma()
436 qpalma.run()