+ fixded index bug in feature count
[qpalma.git] / python / qpalma.py
1 #qualityquality!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20
21 from SIQP_CPX import SIQPSolver
22
23 from paths_load_data import *
24 from paths_load_data_pickle import *
25 from paths_load_data_solexa import *
26
27 from computeSpliceWeights import *
28 from set_param_palma import *
29 from computeSpliceAlignWithQuality import *
30 from penalty_lookup_new import *
31 from compute_donacc import *
32 from TrainingParam import Param
33 from export_param import *
34
35 import Configuration
36 from Plif import Plf
37
38 def getQualityFeatureCounts(qualityPlifs):
39 weightQuality = qualityPlifs[0].penalties
40 for currentPlif in qualityPlifs[1:]:
41 weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
42
43 return weightQuality
44
45 class QPalma:
46 """
47 A training method for the QPalma project
48 """
49
50 def __init__(self):
51 self.ARGS = Param()
52
53 self.logfh = open('qpalma.log','w+')
54 gen_file= '%s/genome.config' % self.ARGS.basedir
55
56 ginfo_filename = 'genome_info.pickle'
57 self.genome_info = fetch_genome_info(ginfo_filename)
58
59 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
60
61 #self.ARGS.train_with_splicesitescoreinformation = False
62
63 def plog(self,string):
64 self.logfh.write(string)
65 self.logfh.flush()
66
67 def run(self):
68 # Load the whole dataset
69 if Configuration.mode == 'normal':
70 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
71 use_quality_scores = False
72 elif Configuration.mode == 'using_quality_scores':
73 Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
74
75 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
76 #Qualities = []
77 #for i in range(len(Ests)):
78 # Qualities.append([40]*len(Ests[i]))
79
80 use_quality_scores = True
81 else:
82 assert(False)
83
84 # number of training instances
85 N = len(Sequences)
86 self.numExamples = N
87 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
88 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
89 self.plog('Number of training examples: %d\n'% N)
90 print 'Number of features: %d\n'% Configuration.numFeatures
91
92 iteration_steps = Configuration.iter_steps ; #upper bound on iteration steps
93 remove_duplicate_scores = Configuration.remove_duplicate_scores
94 print_matrix = Configuration.print_matrix
95 anzpath = Configuration.anzpath
96
97 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
98 param = Configuration.fixedParam
99 #param = cPickle.load(open('initial_param.pickle'))
100
101 # Set the parameters such as limits penalties for the Plifs
102 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
103
104 # delete splicesite-score-information
105 if not self.ARGS.train_with_splicesitescoreinformation:
106 for i in range(len(Acceptors)):
107 if Acceptors[i] > -20:
108 Acceptors[i] = 1
109 if Donors[i] >-20:
110 Donors[i] = 1
111
112 # Initialize solver
113 if not __debug__:
114 self.plog('Initializing problem...\n')
115 solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
116
117 # stores the number of alignments done for each example (best path, second-best path etc.)
118 num_path = [anzpath]*N
119 # stores the gap for each example
120 gap = [0.0]*N
121
122 #############################################################################################
123 # Training
124 #############################################################################################
125 self.plog('Starting training...\n')
126
127 donSP = Configuration.numDonSuppPoints
128 accSP = Configuration.numAccSuppPoints
129 lengthSP = Configuration.numLengthSuppPoints
130 mmatrixSP = Configuration.sizeMatchmatrix[0]\
131 *Configuration.sizeMatchmatrix[1]
132 numq = Configuration.numQualSuppPoints
133 totalQualSP = Configuration.totalQualSuppPoints
134
135 currentPhi = zeros((Configuration.numFeatures,1))
136 totalQualityPenalties = zeros((totalQualSP,1))
137
138 iteration_nr = 0
139 param_idx = 0
140 const_added_ctr = 0
141 while True:
142 if iteration_nr == iteration_steps:
143 break
144
145 for exampleIdx in range(self.numExamples):
146
147 if (exampleIdx%10) == 0:
148 print 'Current example nr %d' % exampleIdx
149
150 dna = Sequences[exampleIdx]
151 est = Ests[exampleIdx]
152
153 if Configuration.mode == 'normal':
154 quality = [40]*len(est)
155
156 if Configuration.mode == 'using_quality_scores':
157 quality = Qualities[exampleIdx]
158
159 #print 'Current quality'
160 #print 'max/min: %d,%d' % (max(quality),min(quality))
161
162 exons = Exons[exampleIdx]
163 # NoiseMatrix = Noises[exampleIdx]
164 don_supp = Donors[exampleIdx]
165 acc_supp = Acceptors[exampleIdx]
166
167 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
168 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
169
170 # Calculate the weights
171 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
172 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
173
174 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
175 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
176 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
177 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
178
179 if Configuration.mode == 'using_quality_scores':
180 totalQualityPenalties = param[-totalQualSP:]
181 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
182
183 # Calculate w'phi(x,y) the total score of the alignment
184 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
185
186 # The allWeights vector is supposed to store the weight parameter
187 # of the true alignment as well as the weight parameters of the
188 # num_path[exampleIdx] other alignments
189 allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
190 allWeights[:,0] = trueWeight[:,0]
191
192 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
193 AlignmentScores[0] = trueAlignmentScore
194
195 ################## Calculate wrong alignment(s) ######################
196 #print 'Generating MVs'
197
198 # Compute donor, acceptor with penalty_lookup_new
199 # returns two double lists
200 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
201
202 #myalign wants the acceptor site on the g of the ag
203 acceptor = acceptor[1:]
204 acceptor.append(-inf)
205
206 # for now we don't use donor/acceptor scores
207 donor = [-inf] * len(donor)
208 acceptor = [-inf] * len(donor)
209
210 dna = str(dna)
211 est = str(est)
212 dna_len = len(dna)
213 est_len = len(est)
214 ps = h.convert2SWIG()
215
216 prb = QPalmaDP.createDoubleArrayFromList(quality)
217 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
218
219 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
220 mm_len = 36
221
222 d_len = len(donor)
223 donor = QPalmaDP.createDoubleArrayFromList(donor)
224 a_len = len(acceptor)
225 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
226
227 #print 'Creating alignment object with %d plifs each having %d support points' % (Configuration.numQualPlifs,Configuration.numQualSuppPoints)
228 currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints)
229
230 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
231
232 #pdb.set_trace()
233
234 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
235 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
236 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
237 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
238 print_matrix, use_quality_scores)
239
240 #print 'Python: after myalign...'
241
242 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
243 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
244 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
245 c_AlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
246
247 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.numQualSuppPoints*Configuration.numQualPlifs*num_path[exampleIdx]))
248
249 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
250 c_WeightMatch, c_AlignmentScores, c_qualityPlifsFeatures)
251
252 #print 'Python: after getAlignmentResults...'
253
254 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
255 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
256 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
257 newQualityPlifsFeatures = zeros((Configuration.numQualSuppPoints*Configuration.numQualPlifs*num_path[exampleIdx],1))
258
259 #print 'newSpliceAlign'
260 for i in range(dna_len*num_path[exampleIdx]):
261 newSpliceAlign[i] = c_SpliceAlign[i]
262 # print '%f' % (spliceAlign[i])
263
264 #print 'newEstAlign'
265 for i in range(est_len*num_path[exampleIdx]):
266 newEstAlign[i] = c_EstAlign[i]
267 # print '%f' % (spliceAlign[i])
268
269 #print 'weightMatch'
270 for i in range(mm_len*num_path[exampleIdx]):
271 newWeightMatch[i] = c_WeightMatch[i]
272 # print '%f' % (weightMatch[i])
273
274 #print 'AlignmentScores'
275 for i in range(num_path[exampleIdx]):
276 AlignmentScores[i+1] = c_AlignmentScores[i]
277
278 for i in range(Configuration.numQualSuppPoints*Configuration.numQualPlifs*num_path[exampleIdx]):
279 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
280
281 # equals palma up to here
282
283 #pdb.set_trace()
284
285 #print "Calling destructors"
286 del c_SpliceAlign
287 del c_EstAlign
288 del c_WeightMatch
289 del c_AlignmentScores
290 del c_qualityPlifsFeatures
291 del currentAlignment
292
293 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
294 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
295 # Calculate weights of the respective alignments Note that we are
296 # calculating n-best alignments without hamming loss, so we
297 # have to keep track which of the n-best alignments correspond to
298 # the true one in order not to incorporate a true alignment in the
299 # constraints. To keep track of the true and false alignments we
300 # define an array true_map with a boolean indicating the
301 # equivalence to the true alignment for each decoded alignment.
302 true_map = [0]*(num_path[exampleIdx]+1)
303 true_map[0] = 1
304 path_loss = [0]*(num_path[exampleIdx]+1)
305
306 #print 'Calculating decoded features'
307
308 for pathNr in range(num_path[exampleIdx]):
309 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
310
311 decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
312
313 #print "Calculating pathNr %d" % pathNr
314
315 for qidx in range(Configuration.numQualPlifs*pathNr,Configuration.numQualPlifs*(pathNr+1)):
316 decodedQualityFeatures[qidx%Configuration.numQualPlifs] = newQualityPlifsFeatures[qidx]
317
318 # sum up positionwise loss between alignments
319 for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
320 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
321 path_loss[pathNr+1] += 1
322
323 # Gewichte in restliche Zeilen der Matrix speichern
324 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
325 allWeights[:,pathNr+1] = wp
326
327 hpen = mat(h.penalties).reshape(len(h.penalties),1)
328 dpen = mat(d.penalties).reshape(len(d.penalties),1)
329 apen = mat(a.penalties).reshape(len(a.penalties),1)
330
331 features = numpy.vstack([hpen , dpen , apen , mmatrix[:], zeros((Configuration.totalQualSuppPoints,1))])
332 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
333
334 # Check wether scalar product + loss equals viterbi score
335 #assert math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
336 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
337 #(AlignmentScores[pathNr],AlignmentScores[pathNr+1])
338
339 # # if the pathNr-best alignment is very close to the true alignment consider it as true
340 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
341 true_map[pathNr+1] = 1
342
343 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
344 # this means that all n-best paths are to close to each other
345 # we have to extend the n-best search to a (n+1)-best
346 if len([elem for elem in true_map if elem == 1]) == len(true_map):
347 num_path[exampleIdx] = num_path[exampleIdx]+1
348
349 # Choose true and first false alignment for extending A
350 firstFalseIdx = -1
351 for map_idx,elem in enumerate(true_map):
352 if elem == 0:
353 firstFalseIdx = map_idx
354 break
355
356 # if there is at least one useful false alignment add the
357 # corresponding constraints to the optimization problem
358 if firstFalseIdx != -1:
359 trueWeights = allWeights[:,0]
360 firstFalseWeights = allWeights[:,firstFalseIdx]
361 #differenceVector = firstFalseWeights - trueWeights
362 differenceVector = trueWeights - firstFalseWeights
363 #pdb.set_trace()
364
365 if not __debug__:
366 const_added = solver.addConstraint(differenceVector, exampleIdx)
367 const_added_ctr += 1
368 #
369 # end of one example processing
370 #
371
372 # call solver every nth example //added constraint
373 if exampleIdx != 0 and exampleIdx % 10 == 0 and not __debug__:
374 objValue,w,self.slacks = solver.solve()
375
376 print "objValue is %f" % objValue
377
378 sum_xis = 0
379 for elem in self.slacks:
380 sum_xis += elem
381
382 for i in range(len(param)):
383 param[i] = w[i]
384
385 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
386 param_idx += 1
387 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
388 # break
389 #break
390
391 #
392 # end of one iteration through all examples
393 #
394 iteration_nr += 1
395
396 #
397 # end of optimization
398 #
399 print 'Training completed'
400 #pdb.set_trace()
401 export_param('elegans.param',h,d,a,mmatrix)
402 self.logfh.close()
403
404 def fetch_genome_info(ginfo_filename):
405 if not os.path.exists(ginfo_filename):
406 cmd = ['']*4
407 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
408 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
409 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
410 cmd[3] = 'save genome_info.mat genome_info'
411 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
412
413 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
414 out,err = obj.communicate()
415 assert err == '', 'An error occured!\n%s'%err
416
417 ginfo = scipy.io.loadmat('genome_info.mat')
418 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
419 return ginfo['genome_info']
420
421 else:
422 return cPickle.load(open(ginfo_filename))
423
424 if __name__ == '__main__':
425 qpalma = QPalma()
426 qpalma.run()