+ renamed files
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41
42 from qpalma.Configuration import *
43
44 class para:
45 pass
46
47 class QPalma:
48 """
49 A training method for the QPalma project
50 """
51
52 def __init__(self):
53 self.ARGS = Param()
54 self.logfh = open('_qpalma.log','w+')
55
56 #gen_file= '%s/genome.config' % self.ARGS.basedir
57 #ginfo_filename = 'genome_info.pickle'
58 #self.genome_info = fetch_genome_info(ginfo_filename)
59 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
60
61 #self.ARGS.train_with_splicesitescoreinformation = False
62
63 # Load the whole dataset
64 if Conf.mode == 'normal':
65 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
66 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
67 self.use_quality_scores = False
68
69 elif Conf.mode == 'using_quality_scores':
70
71 filename = 'real_dset_%s'% 'recent'
72 if True:# not os.path.exists(filename):
73 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
74
75 end = 1000
76 self.Sequences = Sequences[:end]
77 self.Exons = Exons[:end]
78 self.Ests = Ests[:end]
79 self.Qualities = Qualities[:end]
80 self.Acceptors = Acceptors[:end]
81 self.Donors = Donors[:end]
82
83 self.Donors, self.Acceptors = getDonAccScores(Sequences)
84 #dset = Dataset()
85 else:
86 dset = cPickle.load(open(filename))
87 Sequences = dset.Sequences
88 Acceptors = dset.Acceptors
89 Donors = dset.Donors
90 Exons = dset.Exons
91 Ests = dset.Ests
92 Qualities = dset.Qualities
93
94 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
95 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
96 #Qualities = []
97 #for i in range(len(Ests)):
98 # Qualities.append([40]*len(Ests[i]))
99 self.use_quality_scores = True
100 else:
101 assert(False)
102
103 # number of training instances
104 N = len(Sequences)
105 self.numExamples = N
106 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
107 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
108 self.plog('Number of training examples: %d\n'% N)
109 print 'Number of features: %d\n'% Conf.numFeatures
110
111 def plog(self,string):
112 self.logfh.write(string)
113 self.logfh.flush()
114
115 def train(self):
116 beg = Conf.training_begin
117 end = Conf.training_end
118
119 Sequences = self.Sequences[beg:end]
120 Exons = self.Exons[beg:end]
121 Ests = self.Ests[beg:end]
122 Qualities = self.Qualities[beg:end]
123 Acceptors = self.Acceptors[beg:end]
124 Donors = self.Donors[beg:end]
125
126 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
127 remove_duplicate_scores = Conf.remove_duplicate_scores
128 print_matrix = Conf.print_matrix
129 anzpath = Conf.anzpath
130
131 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
132 param = Conf.fixedParam
133
134 # Set the parameters such as limits penalties for the Plifs
135 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
136
137 # delete splicesite-score-information
138 #if not self.ARGS.train_with_splicesitescoreinformation:
139 # for i in range(len(Acceptors)):
140 # if Acceptors[i] > -20:
141 # Acceptors[i] = 1
142 # if Donors[i] >-20:
143 # Donors[i] = 1
144
145 # Initialize solver
146 if Conf.USE_OPT:
147 self.plog('Initializing problem...\n')
148 solver = SIQPSolver(Conf.numFeatures,self.numExamples,Conf.C,self.logfh)
149
150 # stores the number of alignments done for each example (best path, second-best path etc.)
151 num_path = [anzpath]*self.numExamples
152 # stores the gap for each example
153 gap = [0.0]*self.numExamples
154
155 #############################################################################################
156 # Training
157 #############################################################################################
158 self.plog('Starting training...\n')
159
160 donSP = Conf.numDonSuppPoints
161 accSP = Conf.numAccSuppPoints
162 lengthSP = Conf.numLengthSuppPoints
163 mmatrixSP = Conf.sizeMatchmatrix[0]\
164 *Conf.sizeMatchmatrix[1]
165 numq = Conf.numQualSuppPoints
166 totalQualSP = Conf.totalQualSuppPoints
167
168 currentPhi = zeros((Conf.numFeatures,1))
169 totalQualityPenalties = zeros((totalQualSP,1))
170
171 iteration_nr = 0
172 param_idx = 0
173 const_added_ctr = 0
174 while True:
175 if iteration_nr == iteration_steps:
176 break
177
178 for exampleIdx in range(self.numExamples):
179 if (exampleIdx%100) == 0:
180 print 'Current example nr %d' % exampleIdx
181
182 dna = Sequences[exampleIdx]
183 est = Ests[exampleIdx]
184
185 if Conf.mode == 'normal':
186 quality = [40]*len(est)
187
188 if Conf.mode == 'using_quality_scores':
189 quality = Qualities[exampleIdx]
190
191 exons = Exons[exampleIdx]
192 # NoiseMatrix = Noises[exampleIdx]
193 don_supp = Donors[exampleIdx]
194 acc_supp = Acceptors[exampleIdx]
195
196 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
197 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
198
199 #print 'trueWeights'
200 # Calculate the weights
201 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
202 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
203
204 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
205 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
206 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
207 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
208
209 #pdb.set_trace()
210 if Conf.mode == 'using_quality_scores':
211 totalQualityPenalties = param[-totalQualSP:]
212 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
213
214 # Calculate w'phi(x,y) the total score of the alignment
215 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
216
217 # The allWeights vector is supposed to store the weight parameter
218 # of the true alignment as well as the weight parameters of the
219 # num_path[exampleIdx] other alignments
220 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
221 allWeights[:,0] = trueWeight[:,0]
222
223 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
224 AlignmentScores[0] = trueAlignmentScore
225
226 ################## Calculate wrong alignment(s) ######################
227
228 # Compute donor, acceptor with penalty_lookup_new
229 # returns two double lists
230 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
231
232 #myalign wants the acceptor site on the g of the ag
233 acceptor = acceptor[1:]
234 acceptor.append(-inf)
235
236 # for now we don't use donor/acceptor scores
237
238 #donor = [-inf] * len(donor)
239 #acceptor = [-inf] * len(donor)
240
241 dna = str(dna)
242 est = str(est)
243 dna_len = len(dna)
244 est_len = len(est)
245
246 ps = h.convert2SWIG()
247
248 prb = QPalmaDP.createDoubleArrayFromList(quality)
249 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
250
251 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
252 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
253
254 d_len = len(donor)
255 donor = QPalmaDP.createDoubleArrayFromList(donor)
256 a_len = len(acceptor)
257 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
258
259 # Create the alignment object representing the interface to the C/C++ code.
260 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
261
262 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
263 #print 'Calling myalign...'
264 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
265 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
266 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
267 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
268 print_matrix)
269
270 #print 'After calling myalign...'
271 #print 'Calling getAlignmentResults...'
272
273 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
274 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
275 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
276 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
277
278 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
279
280 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
281 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
282
283 #print 'After calling getAlignmentResults...'
284
285 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
286 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
287 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
288 newDPScores = zeros((num_path[exampleIdx],1))
289 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
290
291 #print 'newSpliceAlign'
292 for i in range(dna_len*num_path[exampleIdx]):
293 newSpliceAlign[i] = c_SpliceAlign[i]
294 # print '%f' % (spliceAlign[i])
295
296 #print 'newEstAlign'
297 for i in range(est_len*num_path[exampleIdx]):
298 newEstAlign[i] = c_EstAlign[i]
299 # print '%f' % (spliceAlign[i])
300
301 #print 'weightMatch'
302 for i in range(mm_len*num_path[exampleIdx]):
303 newWeightMatch[i] = c_WeightMatch[i]
304 # print '%f' % (weightMatch[i])
305
306 #print 'ViterbiScores'
307 for i in range(num_path[exampleIdx]):
308 newDPScores[i] = c_DPScores[i]
309
310
311 if self.use_quality_scores:
312 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
313 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
314
315 # equals palma up to here
316
317 #print "Calling destructors"
318 del c_SpliceAlign
319 del c_EstAlign
320 del c_WeightMatch
321 del c_DPScores
322 del c_qualityPlifsFeatures
323 del currentAlignment
324
325 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
326 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
327 # Calculate weights of the respective alignments Note that we are
328 # calculating n-best alignments without hamming loss, so we
329 # have to keep track which of the n-best alignments correspond to
330 # the true one in order not to incorporate a true alignment in the
331 # constraints. To keep track of the true and false alignments we
332 # define an array true_map with a boolean indicating the
333 # equivalence to the true alignment for each decoded alignment.
334 true_map = [0]*(num_path[exampleIdx]+1)
335 true_map[0] = 1
336 path_loss = [0]*(num_path[exampleIdx])
337
338 for pathNr in range(num_path[exampleIdx]):
339 #print 'decodedWeights'
340 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
341 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
342 acc_supp)
343
344 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
345 for qidx in range(Conf.totalQualSuppPoints):
346 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
347
348 #pdb.set_trace()
349
350 path_loss[pathNr] = 0
351 # sum up positionwise loss between alignments
352 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
353 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
354 path_loss[pathNr] += 1
355
356 #pdb.set_trace()
357
358 # Gewichte in restliche Zeilen der Matrix speichern
359 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
360 allWeights[:,pathNr+1] = wp
361
362 hpen = mat(h.penalties).reshape(len(h.penalties),1)
363 dpen = mat(d.penalties).reshape(len(d.penalties),1)
364 apen = mat(a.penalties).reshape(len(a.penalties),1)
365 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
366
367 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
368
369 # Check wether scalar product + loss equals viterbi score
370 #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
371
372 distinct_scores = False
373 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
374 distinct_scores = True
375
376 #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
377 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
378 pdb.set_trace()
379
380 # # if the pathNr-best alignment is very close to the true alignment consider it as true
381 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
382 true_map[pathNr+1] = 1
383
384 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
385
386 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
387 # this means that all n-best paths are to close to each other
388 # we have to extend the n-best search to a (n+1)-best
389 if len([elem for elem in true_map if elem == 1]) == len(true_map):
390 num_path[exampleIdx] = num_path[exampleIdx]+1
391
392 # Choose true and first false alignment for extending A
393 firstFalseIdx = -1
394 for map_idx,elem in enumerate(true_map):
395 if elem == 0:
396 firstFalseIdx = map_idx
397 break
398
399 # if there is at least one useful false alignment add the
400 # corresponding constraints to the optimization problem
401 if firstFalseIdx != -1:
402 trueWeights = allWeights[:,0]
403 firstFalseWeights = allWeights[:,firstFalseIdx]
404 differenceVector = trueWeights - firstFalseWeights
405 #pdb.set_trace()
406
407 if Conf.USE_OPT:
408 const_added = solver.addConstraint(differenceVector, exampleIdx)
409 const_added_ctr += 1
410 #
411 # end of one example processing
412 #
413
414 # call solver every nth example //added constraint
415 if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
416 objValue,w,self.slacks = solver.solve()
417
418 print "objValue is %f" % objValue
419
420 sum_xis = 0
421 for elem in self.slacks:
422 sum_xis += elem
423
424 for i in range(len(param)):
425 param[i] = w[i]
426
427 #pdb.set_trace()
428 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
429 param_idx += 1
430 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
431
432 #
433 # end of one iteration through all examples
434 #
435 iteration_nr += 1
436
437 #
438 # end of optimization
439 #
440 print 'Training completed'
441
442 pa = para()
443 pa.h = h
444 pa.d = d
445 pa.a = a
446 pa.mmatrix = mmatrix
447 pa.qualityPlifs = qualityPlifs
448
449 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
450 #cPickle.dump(pa,open('elegans.param','w+'))
451 self.logfh.close()
452
453
454
455 def predict(self):
456 beg = Conf.prediction_begin
457 end = Conf.prediction_end
458
459 Sequences = self.Sequences[beg:end]
460 Exons = self.Exons[beg:end]
461 Ests = self.Ests[beg:end]
462 Qualities = self.Qualities[beg:end]
463 Acceptors = self.Acceptors[beg:end]
464 Donors = self.Donors[beg:end]
465
466 remove_duplicate_scores = Conf.remove_duplicate_scores
467 print_matrix = Conf.print_matrix
468 anzpath = Conf.anzpath
469
470 param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
471 param = load_param(param_filename)
472
473 # Set the parameters such as limits penalties for the Plifs
474 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
475
476 #############################################################################################
477 # Prediction
478 #############################################################################################
479 self.plog('Starting prediction...\n')
480
481 donSP = Conf.numDonSuppPoints
482 accSP = Conf.numAccSuppPoints
483 lengthSP = Conf.numLengthSuppPoints
484 mmatrixSP = Conf.sizeMatchmatrix[0]\
485 *Conf.sizeMatchmatrix[1]
486 numq = Conf.numQualSuppPoints
487 totalQualSP = Conf.totalQualSuppPoints
488
489 currentPhi = zeros((Conf.numFeatures,1))
490 totalQualityPenalties = zeros((totalQualSP,1))
491
492 total_up_off = []
493 total_down_off = []
494
495 for exampleIdx in range(self.numExamples):
496 dna = Sequences[exampleIdx]
497 est = Ests[exampleIdx]
498 currentSplitPos = SplitPos[exampleIdx]
499
500 if Conf.mode == 'normal':
501 quality = [40]*len(est)
502
503 if Conf.mode == 'using_quality_scores':
504 quality = Qualities[exampleIdx]
505
506 exons = Exons[exampleIdx]
507 # NoiseMatrix = Noises[exampleIdx]
508 don_supp = Donors[exampleIdx]
509 acc_supp = Acceptors[exampleIdx]
510
511 if exons[-1,1] > len(dna):
512 continue
513
514 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
515 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
516
517 # Calculate the weights
518 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
519 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
520
521 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
522 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
523 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
524 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
525
526 if Conf.mode == 'using_quality_scores':
527 totalQualityPenalties = param[-totalQualSP:]
528 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
529
530 # Calculate w'phi(x,y) the total score of the alignment
531 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
532
533 # The allWeights vector is supposed to store the weight parameter
534 # of the true alignment as well as the weight parameters of the
535 # 1 other alignments
536 allWeights = zeros((Conf.numFeatures,1+1))
537 allWeights[:,0] = trueWeight[:,0]
538
539 AlignmentScores = [0.0]*(1+1)
540 AlignmentScores[0] = trueAlignmentScore
541
542 ################## Calculate wrong alignment(s) ######################
543
544 # Compute donor, acceptor with penalty_lookup_new
545 # returns two double lists
546 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
547
548 #myalign wants the acceptor site on the g of the ag
549 acceptor = acceptor[1:]
550 acceptor.append(-inf)
551
552 dna = str(dna)
553 est = str(est)
554 dna_len = len(dna)
555 est_len = len(est)
556
557 ps = h.convert2SWIG()
558
559 prb = QPalmaDP.createDoubleArrayFromList(quality)
560 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
561
562 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
563 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
564
565 d_len = len(donor)
566 donor = QPalmaDP.createDoubleArrayFromList(donor)
567 a_len = len(acceptor)
568 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
569
570 # Create the alignment object representing the interface to the C/C++ code.
571 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
572
573 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
574
575 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
576 currentAlignment.myalign(1, dna, dna_len,\
577 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
578 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
579 print_matrix)
580
581 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
582 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
583 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
584 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
585
586 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
587
588 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
589 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
590
591 newSpliceAlign = zeros((dna_len,1))
592 newEstAlign = zeros((est_len,1))
593 newWeightMatch = zeros((mm_len,1))
594 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
595
596 #print 'newSpliceAlign'
597 for i in range(dna_len):
598 newSpliceAlign[i] = c_SpliceAlign[i]
599 # print '%f' % (spliceAlign[i])
600
601 #print 'newEstAlign'
602 for i in range(est_len):
603 newEstAlign[i] = c_EstAlign[i]
604 # print '%f' % (spliceAlign[i])
605
606 #print 'weightMatch'
607 for i in range(mm_len):
608 newWeightMatch[i] = c_WeightMatch[i]
609 # print '%f' % (weightMatch[i])
610
611 newDPScores = c_DPScores[0]
612
613 for i in range(Conf.totalQualSuppPoints):
614 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
615
616 #print "Calling destructors"
617 del c_SpliceAlign
618 del c_EstAlign
619 del c_WeightMatch
620 del c_DPScores
621 del c_qualityPlifsFeatures
622 del currentAlignment
623
624 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
625 newWeightMatch = newWeightMatch.reshape(1,mm_len)
626 # Calculate weights of the respective alignments Note that we are
627 # calculating n-best alignments without hamming loss, so we
628 # have to keep track which of the n-best alignments correspond to
629 # the true one in order not to incorporate a true alignment in the
630 # constraints. To keep track of the true and false alignments we
631 # define an array true_map with a boolean indicating the
632 # equivalence to the true alignment for each decoded alignment.
633 true_map = [0]*2
634 true_map[0] = 1
635 pathNr = 0
636
637 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
638
639 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
640 for qidx in range(Conf.totalQualSuppPoints):
641 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
642
643 # Gewichte in restliche Zeilen der Matrix speichern
644 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
645 allWeights[:,pathNr+1] = wp
646
647 hpen = mat(h.penalties).reshape(len(h.penalties),1)
648 dpen = mat(d.penalties).reshape(len(d.penalties),1)
649 apen = mat(a.penalties).reshape(len(a.penalties),1)
650 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
651 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
652
653 # if the pathNr-best alignment is very close to the true alignment consider it as true
654 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
655 true_map[pathNr+1] = 1
656
657 up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
658 #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
659 #print up_off,down_off
660
661 if up_off > -1:
662 total_up_off.append(up_off)
663 total_down_off.append(down_off)
664
665 total_up = 0
666 total_down = 0
667 for idx in range(len(total_up_off)):
668 total_up += total_up_off[idx]
669 total_down += total_down_off[idx]
670
671 total_up /= len(total_up_off)
672 total_down /= len(total_down_off)
673
674 print 'Mean up_off is %f' % total_up
675 print 'Mean down_off is %f' % total_down
676 print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
677 print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
678
679 print 'Prediction completed'
680 self.logfh.close()
681
682 def fetch_genome_info(ginfo_filename):
683 if not os.path.exists(ginfo_filename):
684 cmd = ['']*4
685 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
686 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
687 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
688 cmd[3] = 'save genome_info.mat genome_info'
689 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
690
691 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
692 out,err = obj.communicate()
693 assert err == '', 'An error occured!\n%s'%err
694
695 ginfo = scipy.io.loadmat('genome_info.mat')
696 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
697 return ginfo['genome_info']
698
699 else:
700 return cPickle.load(open(ginfo_filename))
701
702 def plifs2param(h,d,a,mmatrix,qualityPlifs):
703 donSP = Conf.numDonSuppPoints
704 accSP = Conf.numAccSuppPoints
705 lengthSP = Conf.numLengthSuppPoints
706 mmatrixSP = Conf.sizeMatchmatrix[0]\
707 *Conf.sizeMatchmatrix[1]
708
709
710 param = zeros((Conf.numFeatures,1))
711 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
712 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
713 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
714 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
715
716 for idx in range(len(qualityPlifs)):
717 currentPlif = qualityPlifs[idx]
718 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
719 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
720 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
721
722 return param
723
724 def load_param(filename):
725 param = None
726 #try:
727 # para = cPickle.load(open(filename))
728 #except:
729 # print 'Error: Could not open parameter file!'
730 # sys.exit(1)
731 #
732 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
733
734 param = cPickle.load(open(filename))
735 return param
736
737 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
738 newExons = []
739 oldElem = -1
740 SpliceAlign = SpliceAlign.flatten().tolist()[0]
741 SpliceAlign.append(-1)
742 for pos,elem in enumerate(SpliceAlign):
743 if pos == 0:
744 oldElem = -1
745 else:
746 oldElem = SpliceAlign[pos-1]
747
748 if oldElem != 0 and elem == 0: # start of exon
749 newExons.append(pos-1)
750
751 if oldElem == 0 and elem != 0: # end of exon
752 newExons.append(pos)
753
754 up_off = -1
755 down_off = -1
756
757 if len(newExons) != 4:
758 acc = 0.0
759 else:
760 e1_begin,e1_end = newExons[0],newExons[1]
761 e2_begin,e2_end = newExons[2],newExons[3]
762
763 up_off = int(math.fabs(e1_end - exons[0,1]))
764 down_off = int(math.fabs(e2_begin - exons[1,0]))
765
766 return up_off,down_off
767
768
769 if __name__ == '__main__':
770
771 qpalma = QPalma()
772 if len(sys.argv) != 2:
773 print 'You have to choose between training or prediction mode:'
774 print 'python qpalma. py (train|predict)'
775
776 mode = sys.argv[1]
777 assert mode in ['train','predict']
778
779 if mode == 'train':
780 qpalma.train()
781
782 if mode == 'predict':
783 qpalma.predict()
784