+ minor changes to filterReads
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 from compile_dataset import compile_d
44
45 import palma.palma_utils as pu
46 from palma.output_formating import print_results
47
48 class para:
49 pass
50
51 class QPalma:
52 """
53 A training method for the QPalma project
54 """
55
56 def __init__(self):
57 self.ARGS = Param()
58
59 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
60 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
61
62 data_filename = Conf.dataset_fn
63 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
64
65 # Load the whole dataset
66 if Conf.mode == 'normal':
67 self.use_quality_scores = False
68
69 elif Conf.mode == 'using_quality_scores':
70 self.use_quality_scores = True
71 else:
72 assert(False)
73
74 self.Sequences = Sequences
75 self.Exons = Exons
76 self.Ests = Ests
77 self.Qualities = Qualities
78 self.SplitPos = SplitPos
79 self.Donors = Donors
80 self.Acceptors = Acceptors
81
82 calc_info(self.Acceptors,self.Donors,self.Exons)
83 print 'leaving constructor...'
84
85 def plog(self,string):
86 self.logfh.write(string)
87 self.logfh.flush()
88
89 def train(self):
90 self.logfh = open('_qpalma_train.log','w+')
91
92 beg = Conf.training_begin
93 end = Conf.training_end
94
95 Sequences = self.Sequences[beg:end]
96 Exons = self.Exons[beg:end]
97 Ests = self.Ests[beg:end]
98 Qualities = self.Qualities[beg:end]
99 Acceptors = self.Acceptors[beg:end]
100 Donors = self.Donors[beg:end]
101
102 # number of training instances
103 N = numExamples = len(Sequences)
104 assert len(Exons) == N and len(Ests) == N\
105 and len(Qualities) == N and len(Acceptors) == N\
106 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
107 self.plog('Number of training examples: %d\n'% numExamples)
108
109 self.noImprovementCtr = 0
110 self.oldObjValue = 1e8
111
112 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
113 remove_duplicate_scores = Conf.remove_duplicate_scores
114 print_matrix = Conf.print_matrix
115 anzpath = Conf.anzpath
116
117 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
118 param = Conf.fixedParam
119
120 lengthSP = Conf.numLengthSuppPoints
121 donSP = Conf.numDonSuppPoints
122 accSP = Conf.numAccSuppPoints
123 mmatrixSP = Conf.sizeMatchmatrix[0]\
124 *Conf.sizeMatchmatrix[1]
125 numq = Conf.numQualSuppPoints
126 totalQualSP = Conf.totalQualSuppPoints
127
128 i1 = range(0,lengthSP)
129 i2 = range(lengthSP,lengthSP+donSP)
130 i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
131 i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
132 i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
133 intervals = [] #i5,\ #i2,#i3,#i4,#i5]
134 zero_out(param,intervals)
135 pdb.set_trace()
136
137 # Set the parameters such as limits penalties for the Plifs
138 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
139
140 # Initialize solver
141 if Conf.USE_OPT:
142 self.plog('Initializing problem...\n')
143 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
144 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
145 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
146
147 # stores the number of alignments done for each example (best path, second-best path etc.)
148 num_path = [anzpath]*numExamples
149 # stores the gap for each example
150 gap = [0.0]*numExamples
151 #############################################################################################
152 # Training
153 #############################################################################################
154 self.plog('Starting training...\n')
155
156 currentPhi = zeros((Conf.numFeatures,1))
157 totalQualityPenalties = zeros((totalQualSP,1))
158
159 suboptimal_example = 0
160 iteration_nr = 0
161 param_idx = 0
162 const_added_ctr = 0
163
164 # the main training loop
165 while True:
166 if iteration_nr == iteration_steps:
167 break
168
169 for exampleIdx in range(numExamples):
170 if (exampleIdx%100) == 0:
171 print 'Current example nr %d' % exampleIdx
172
173 dna = Sequences[exampleIdx]
174 est = Ests[exampleIdx]
175 est = "".join(est)
176
177 if Conf.mode == 'normal':
178 quality = [40]*len(est)
179
180 if Conf.mode == 'using_quality_scores':
181 quality = Qualities[exampleIdx]
182
183 exons = Exons[exampleIdx]
184 don_supp = Donors[exampleIdx]
185 acc_supp = Acceptors[exampleIdx]
186
187 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
188 if Conf.mode == 'using_quality_scores':
189 trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
190 computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
191 else:
192 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
193
194 # Calculate the weights
195 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
196 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
197
198 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
199 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
200 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
201 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
202
203 if Conf.mode == 'using_quality_scores':
204 totalQualityPenalties = param[-totalQualSP:]
205 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
206
207 zero_out(currentPhi,intervals)
208 zero_out(trueWeight,intervals)
209
210 #pdb.set_trace()
211
212 # Calculate w'phi(x,y) the total score of the alignment
213 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
214
215 # The allWeights vector is supposed to store the weight parameter
216 # of the true alignment as well as the weight parameters of the
217 # num_path[exampleIdx] other alignments
218 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
219 allWeights[:,0] = trueWeight[:,0]
220
221 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
222 AlignmentScores[0] = trueAlignmentScore
223 ################## Calculate wrong alignment(s) ######################
224
225 # Compute donor, acceptor with penalty_lookup_new
226 # returns two double lists
227 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
228
229 #myalign wants the acceptor site on the g of the ag
230 acceptor = acceptor[1:]
231 acceptor.append(-inf)
232
233 dna = str(dna)
234 est = str(est)
235 dna_len = len(dna)
236 est_len = len(est)
237
238 # check that splice site scores are at dna positions as expected by
239 # the dynamic programming component
240 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
241 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
242 or dna[d_pos+1] == 't'), pdb.set_trace()
243
244 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
245 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
246
247 ps = h.convert2SWIG()
248
249 prb = QPalmaDP.createDoubleArrayFromList(quality)
250 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
251
252 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
253 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
254
255 d_len = len(donor)
256 donor = QPalmaDP.createDoubleArrayFromList(donor)
257 a_len = len(acceptor)
258 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
259
260 # Create the alignment object representing the interface to the C/C++ code.
261 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
262 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
263 #print 'Calling myalign...'
264 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
265 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
266 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
267 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
268 print_matrix)
269
270 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
271 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
272 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
273 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
274
275 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
276
277 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
278 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
279
280 #print 'After calling getAlignmentResults...'
281
282 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
283 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
284 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
285 newDPScores = zeros((num_path[exampleIdx],1))
286 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
287
288 #print 'newSpliceAlign'
289 for i in range(dna_len*num_path[exampleIdx]):
290 newSpliceAlign[i] = c_SpliceAlign[i]
291 # print '%f' % (spliceAlign[i])
292
293 #print 'newEstAlign'
294 for i in range(est_len*num_path[exampleIdx]):
295 newEstAlign[i] = c_EstAlign[i]
296 # print '%f' % (spliceAlign[i])
297
298 #print 'weightMatch'
299 for i in range(mm_len*num_path[exampleIdx]):
300 newWeightMatch[i] = c_WeightMatch[i]
301 # print '%f' % (weightMatch[i])
302
303 #print 'ViterbiScores'
304 for i in range(num_path[exampleIdx]):
305 newDPScores[i] = c_DPScores[i]
306
307 if self.use_quality_scores:
308 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
309 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
310
311 # equals palma up to here
312
313 #print "Calling destructors"
314 del c_SpliceAlign
315 del c_EstAlign
316 del c_WeightMatch
317 del c_DPScores
318 del c_qualityPlifsFeatures
319 #del currentAlignment
320
321 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
322 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
323
324 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],Conf.totalQualSuppPoints)
325 # Calculate weights of the respective alignments Note that we are
326 # calculating n-best alignments without hamming loss, so we
327 # have to keep track which of the n-best alignments correspond to
328 # the true one in order not to incorporate a true alignment in the
329 # constraints. To keep track of the true and false alignments we
330 # define an array true_map with a boolean indicating the
331 # equivalence to the true alignment for each decoded alignment.
332 true_map = [0]*(num_path[exampleIdx]+1)
333 true_map[0] = 1
334
335 for pathNr in range(num_path[exampleIdx]):
336 #print 'decodedWeights'
337 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
338 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
339 acc_supp)
340
341 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
342 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
343 # Gewichte in restliche Zeilen der Matrix speichern
344 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
345
346 hpen = mat(h.penalties).reshape(len(h.penalties),1)
347 dpen = mat(d.penalties).reshape(len(d.penalties),1)
348 apen = mat(a.penalties).reshape(len(a.penalties),1)
349 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
350
351 zero_out(features,intervals)
352 zero_out(allWeights[:,pathNr+1],intervals)
353
354 #pdb.set_trace()
355
356 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
357
358 distinct_scores = False
359 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
360 distinct_scores = True
361
362 # Check wether scalar product + loss equals viterbi score
363 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
364 pdb.set_trace()
365
366 if exampleIdx == 1:
367 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
368 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
369
370 pdb.set_trace()
371
372 # if the pathNr-best alignment is very close to the true alignment consider it as true
373 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
374 true_map[pathNr+1] = 1
375
376 #pdb.set_trace()
377
378 #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
379 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
380 suboptimal_example += 1
381 self.plog("suboptimal_example %d\n" %exampleIdx)
382
383
384 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
385 # this means that all n-best paths are to close to each other
386 # we have to extend the n-best search to a (n+1)-best
387 if len([elem for elem in true_map if elem == 1]) == len(true_map):
388 num_path[exampleIdx] = num_path[exampleIdx]+1
389
390 # Choose true and first false alignment for extending
391 firstFalseIdx = -1
392 for map_idx,elem in enumerate(true_map):
393 if elem == 0:
394 firstFalseIdx = map_idx
395 break
396
397 #if exampleIdx == 1:
398 if False:
399
400 self.plog("Is considered as: %d\n" % true_map[1])
401
402 result_len = currentAlignment.getResultLength()
403 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
404 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
405
406 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
407
408 dna_array = [0.0]*result_len
409 est_array = [0.0]*result_len
410
411 for r_idx in range(result_len):
412 dna_array[r_idx] = c_dna_array[r_idx]
413 est_array[r_idx] = c_est_array[r_idx]
414
415 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
416 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
417 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
418 self.plog(line1+'\n')
419 self.plog(line2+'\n')
420 self.plog(line3+'\n')
421
422 # if there is at least one useful false alignment add the
423 # corresponding constraints to the optimization problem
424 if firstFalseIdx != -1:
425 trueWeights = trueWeight
426 firstFalseWeights = allWeights[:,firstFalseIdx]
427 differenceVector = trueWeights - firstFalseWeights
428 #pdb.set_trace()
429
430 if Conf.USE_OPT:
431 const_added = solver.addConstraint(differenceVector, exampleIdx)
432 const_added_ctr += 1
433 #
434 # end of one example processing
435 #
436
437 # call solver every nth example //added constraint
438 if exampleIdx != 0 and exampleIdx % Conf.numConstraintsPerRound == 0 and Conf.USE_OPT:
439 objValue,w,self.slacks = solver.solve()
440
441 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
442 self.noImprovementCtr += 1
443
444 if self.noImprovementCtr == numExamples+1:
445 break
446
447 self.oldObjValue = objValue
448 print "objValue is %f" % objValue
449
450 sum_xis = 0
451 for elem in self.slacks:
452 sum_xis += elem
453
454 print 'sum of slacks is %f'% sum_xis
455
456 for i in range(len(param)):
457 param[i] = w[i]
458
459 #pdb.set_trace()
460 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
461 param_idx += 1
462 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
463
464 #
465 # end of one iteration through all examples
466 #
467
468 self.plog("suboptimal rounds %d\n" %suboptimal_example)
469
470 if self.noImprovementCtr == numExamples*2:
471 break
472
473 iteration_nr += 1
474
475 #
476 # end of optimization
477 #
478 print 'Training completed'
479
480 pa = para()
481 pa.h = h
482 pa.d = d
483 pa.a = a
484 pa.mmatrix = mmatrix
485 pa.qualityPlifs = qualityPlifs
486
487 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
488 #cPickle.dump(pa,open('elegans.param','w+'))
489 self.logfh.close()
490
491 def evaluate(self,param_filename):
492 beg = Conf.prediction_begin
493 end = Conf.prediction_end
494
495 # predict on training set
496 print '##### Prediction on the training set #####'
497 self.predict(param_filename,0,beg)
498
499 # predict on test set
500 print '##### Prediction on the test set #####'
501 self.predict(param_filename,beg,end)
502
503 #pdb.set_trace()
504
505 def predict(self,param_filename,beg,end):
506 self.logfh = open('_qpalma_predict.log','w+')
507
508 Sequences = self.Sequences[beg:end]
509 Exons = self.Exons[beg:end]
510 Ests = self.Ests[beg:end]
511 Qualities = self.Qualities[beg:end]
512 Acceptors = self.Acceptors[beg:end]
513 Donors = self.Donors[beg:end]
514 SplitPos = self.SplitPos[beg:end]
515
516 # number of training instances
517 N = numExamples = len(Sequences)
518 assert len(Exons) == N and len(Ests) == N\
519 and len(Qualities) == N and len(Acceptors) == N\
520 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
521 self.plog('Number of training examples: %d\n'% numExamples)
522
523 self.noImprovementCtr = 0
524 self.oldObjValue = 1e8
525
526 remove_duplicate_scores = Conf.remove_duplicate_scores
527 print_matrix = Conf.print_matrix
528 anzpath = Conf.anzpath
529
530 param = cPickle.load(open(param_filename))
531
532 # Set the parameters such as limits penalties for the Plifs
533 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
534
535 #############################################################################################
536 # Prediction
537 #############################################################################################
538 self.plog('Starting prediction...\n')
539
540 donSP = Conf.numDonSuppPoints
541 accSP = Conf.numAccSuppPoints
542 lengthSP = Conf.numLengthSuppPoints
543 mmatrixSP = Conf.sizeMatchmatrix[0]\
544 *Conf.sizeMatchmatrix[1]
545 numq = Conf.numQualSuppPoints
546 totalQualSP = Conf.totalQualSuppPoints
547
548 currentPhi = zeros((Conf.numFeatures,1))
549 totalQualityPenalties = zeros((totalQualSP,1))
550
551 exon1Begin = []
552 exon1End = []
553 exon2Begin = []
554 exon2End = []
555 allWrongExons = []
556
557 for exampleIdx in range(numExamples):
558 dna = Sequences[exampleIdx]
559 est = Ests[exampleIdx]
560 currentSplitPos = SplitPos[exampleIdx]
561
562 if Conf.mode == 'normal':
563 quality = [40]*len(est)
564
565 if Conf.mode == 'using_quality_scores':
566 quality = Qualities[exampleIdx]
567
568 exons = Exons[exampleIdx]
569 # NoiseMatrix = Noises[exampleIdx]
570 don_supp = Donors[exampleIdx]
571 acc_supp = Acceptors[exampleIdx]
572
573 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
574 if Conf.mode == 'using_quality_scores':
575 trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
576 computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
577 else:
578 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
579
580 # Calculate the weights
581 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
582 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
583
584 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
585 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
586 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
587 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
588
589 if Conf.mode == 'using_quality_scores':
590 totalQualityPenalties = param[-totalQualSP:]
591 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
592
593 # Calculate w'phi(x,y) the total score of the alignment
594 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
595
596 # The allWeights vector is supposed to store the weight parameter
597 # of the true alignment as well as the weight parameters of the
598 # 1 other alignments
599 allWeights = zeros((Conf.numFeatures,1+1))
600 allWeights[:,0] = trueWeight[:,0]
601
602 AlignmentScores = [0.0]*(1+1)
603 AlignmentScores[0] = trueAlignmentScore
604
605 ################## Calculate wrong alignment(s) ######################
606
607 # Compute donor, acceptor with penalty_lookup_new
608 # returns two double lists
609 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
610
611 #myalign wants the acceptor site on the g of the ag
612 acceptor = acceptor[1:]
613 acceptor.append(-inf)
614
615 dna = str(dna)
616 est = str(est)
617 dna_len = len(dna)
618 est_len = len(est)
619
620 ps = h.convert2SWIG()
621
622 prb = QPalmaDP.createDoubleArrayFromList(quality)
623 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
624
625 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
626 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
627
628 d_len = len(donor)
629 donor = QPalmaDP.createDoubleArrayFromList(donor)
630 a_len = len(acceptor)
631 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
632
633 # Create the alignment object representing the interface to the C/C++ code.
634 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
635
636 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
637
638 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
639 currentAlignment.myalign(1, dna, dna_len,\
640 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
641 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
642 print_matrix)
643
644 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
645 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
646 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
647 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
648
649 result_len = currentAlignment.getResultLength()
650 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
651 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
652
653 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
654
655 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
656
657 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
658 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
659
660 newSpliceAlign = zeros((dna_len,1))
661 newEstAlign = zeros((est_len,1))
662 newWeightMatch = zeros((mm_len,1))
663 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
664
665 for i in range(dna_len):
666 newSpliceAlign[i] = c_SpliceAlign[i]
667
668 for i in range(est_len):
669 newEstAlign[i] = c_EstAlign[i]
670
671 for i in range(mm_len):
672 newWeightMatch[i] = c_WeightMatch[i]
673
674 newDPScores = c_DPScores[0]
675
676 for i in range(Conf.totalQualSuppPoints):
677 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
678
679 del c_SpliceAlign
680 del c_EstAlign
681 del c_WeightMatch
682 del c_DPScores
683 del c_qualityPlifsFeatures
684 del currentAlignment
685
686 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
687 newWeightMatch = newWeightMatch.reshape(1,mm_len)
688 true_map = [0]*2
689 true_map[0] = 1
690 pathNr = 0
691
692 dna_array = [0.0]*result_len
693 est_array = [0.0]*result_len
694
695 for r_idx in range(result_len):
696 dna_array[r_idx] = c_dna_array[r_idx]
697 est_array[r_idx] = c_est_array[r_idx]
698
699 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
700 _newEstAlign = newEstAlign.flatten().tolist()[0]
701
702 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
703 self.plog(line1+'\n')
704 self.plog(line2+'\n')
705 self.plog(line3+'\n')
706
707 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
708
709 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
710 for qidx in range(Conf.totalQualSuppPoints):
711 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
712
713 # Gewichte in restliche Zeilen der Matrix speichern
714 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
715 allWeights[:,pathNr+1] = wp
716
717 hpen = mat(h.penalties).reshape(len(h.penalties),1)
718 dpen = mat(d.penalties).reshape(len(d.penalties),1)
719 apen = mat(a.penalties).reshape(len(a.penalties),1)
720 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
721 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
722
723 # if the pathNr-best alignment is very close to the true alignment consider it as true
724 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
725 true_map[pathNr+1] = 1
726
727 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
728
729 if e1_b_off != None:
730 exon1Begin.append(e1_b_off)
731 exon1End.append(e1_e_off)
732 exon2Begin.append(e2_b_off)
733 exon2End.append(e2_e_off)
734 else:
735 allWrongExons.append((newExons,exons))
736
737
738 logfile = self.logfh
739
740 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
741 print >> logfile, 'example %d correct' % exampleIdx
742 else:
743 print >> logfile, 'example %d wrong' % exampleIdx
744
745 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
746 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
747
748 all_pos_correct = 0
749 for idx in range(len(exon1Begin)):
750 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
751 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
752 all_pos_correct += 1
753
754 logfile = self.logfh
755 print >> logfile, 'Total num. of examples: %d' % numExamples
756 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
757 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
758 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
759 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
760
761 print 'Total num. of examples: %d' % numExamples
762 print 'Number of total correct examples: %d' % all_pos_correct
763 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
764 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
765 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
766
767 print 'Prediction completed'
768 self.logfh.close()
769
770 def evaluatePositions(self,eBegin,eEnd):
771
772 eBegin_pos = [elem for elem in eBegin if elem == 0]
773 eBegin_neg = [elem for elem in eBegin if elem != 0]
774 eEnd_pos = [elem for elem in eEnd if elem == 0]
775 eEnd_neg = [elem for elem in eEnd if elem != 0]
776
777 mean_eBegin_neg = 0
778 for idx in range(len(eBegin_neg)):
779 mean_eBegin_neg += eBegin_neg[idx]
780
781 try:
782 mean_eBegin_neg /= 1.0*len(eBegin_neg)
783 except:
784 mean_eBegin_neg = -1
785
786 mean_eEnd_neg = 0
787 for idx in range(len(eEnd_neg)):
788 mean_eEnd_neg += eEnd_neg[idx]
789
790 try:
791 mean_eEnd_neg /= 1.0*len(eEnd_neg)
792 except:
793 mean_eEnd_neg = -1
794
795 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
796
797 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
798 newExons = []
799 oldElem = -1
800 SpliceAlign = SpliceAlign.flatten().tolist()[0]
801 SpliceAlign.append(-1)
802 for pos,elem in enumerate(SpliceAlign):
803 if pos == 0:
804 oldElem = -1
805 else:
806 oldElem = SpliceAlign[pos-1]
807
808 if oldElem != 0 and elem == 0: # start of exon
809 newExons.append(pos)
810
811 if oldElem == 0 and elem != 0: # end of exon
812 newExons.append(pos)
813
814 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
815
816 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
817 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
818 self.plog(("read: " + str(est)+ "\n"))
819
820 if len(newExons) == 4:
821 e1_begin,e1_end = newExons[0],newExons[1]
822 e2_begin,e2_end = newExons[2],newExons[3]
823 else:
824 return None,None,None,None,newExons
825
826 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
827 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
828
829 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
830 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
831
832 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
833
834 def fetch_genome_info(ginfo_filename):
835 if not os.path.exists(ginfo_filename):
836 cmd = ['']*4
837 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
838 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
839 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
840 cmd[3] = 'save genome_info.mat genome_info'
841 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
842
843 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
844 out,err = obj.communicate()
845 assert err == '', 'An error occured!\n%s'%err
846
847 ginfo = scipy.io.loadmat('genome_info.mat')
848 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
849 return ginfo['genome_info']
850
851 else:
852 return cPickle.load(open(ginfo_filename))
853
854
855
856 def calcStat(Acceptor,Donor,Exons):
857 maxAcc = -100
858 minAcc = 100
859 maxDon = -100
860 minDon = 100
861
862 acc_vec_pos = []
863 acc_vec_neg = []
864 don_vec_pos = []
865 don_vec_neg = []
866
867 for jdx in range(len(Acceptors)):
868 currentExons = Exons[jdx]
869 currentAcceptor = Acceptors[jdx]
870 currentAcceptor = currentAcceptor[1:]
871 currentAcceptor.append(-inf)
872 currentDonor = Donors[jdx]
873
874 for idx,elem in enumerate(currentAcceptor):
875 if idx == (int(currentExons[1,0])-1): # acceptor site
876 acc_vec_pos.append(elem)
877 else:
878 acc_vec_neg.append(elem)
879
880 for idx,elem in enumerate(currentDonor):
881 if idx == (int(currentExons[0,1])): # donor site
882 don_vec_pos.append(elem)
883 else:
884 don_vec_neg.append(elem)
885
886 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
887 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
888 don_pos = [elem for elem in don_vec_pos if elem != -inf]
889 don_neg = [elem for elem in don_vec_neg if elem != -inf]
890
891 pdb.set_trace()
892
893 for idx in range(len(Sequences)):
894 acc = [elem for elem in Acceptors[idx] if elem != -inf]
895 maxAcc = max(max(acc),maxAcc)
896 minAcc = min(min(acc),minAcc)
897
898 don = [elem for elem in Donors[idx] if elem != -inf]
899 maxDon = max(max(don),maxDon)
900 minDon = min(min(don),minDon)
901
902 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
903 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
904 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
905
906 t_strand = '+'
907 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
908 #(gap_char, 5 nucleotides, intron_char)
909 comparison_char = '| ' #first: match_char, second: intron_char
910
911 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
912 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
913
914 first_identity = None
915 last_identity = None
916 for i in range(len(comparison)):
917 if comparison[i] == '|' and first_identity is None:
918 first_identity = i
919 if comparison[-i] == '|' and last_identity is None:
920 last_identity = len(comparison) - i - 1
921
922 try:
923 for idx in range(len(dna_array)):
924 dna_array[idx] = translation[int(dna_array[idx])]
925 est_array[idx] = translation[int(est_array[idx])]
926 except:
927 pdb.set_trace()
928
929 line1 = "".join(dna_array)
930 line2 = comparison
931 line3 = "".join(est_array)
932
933 return line1,line2,line3
934
935 def calc_info(acc,don,exons):
936 min_score = 100
937 max_score = -100
938
939 for elem in acc:
940 for score in elem:
941 if score != -inf:
942 min_score = min(min_score,score)
943 max_score = max(max_score,score)
944
945 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
946
947 min_score = 100
948 max_score = -100
949
950 for elem in don:
951 for score in elem:
952 if score != -inf:
953 min_score = min(min_score,score)
954 max_score = max(max_score,score)
955
956 print 'Donor max/min: %f / %f' % (max_score,min_score)
957
958 min_score = 10000
959 max_score = 0
960 mean_intron_len = 0
961
962 for elem in exons:
963 intron_len = int(elem[1,0] - elem[0,1])
964 mean_intron_len += intron_len
965 min_score = min(min_score,intron_len)
966 max_score = max(max_score,intron_len)
967
968 mean_intron_len /= 1.0*len(exons)
969 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
970
971 def zero_out(vec,intervals):
972 for interval in intervals:
973 for pos in interval:
974 vec[pos] = 0.0
975
976 if __name__ == '__main__':
977 qpalma = QPalma()
978
979 if len(sys.argv) == 2:
980 mode = sys.argv[1]
981 assert mode == 'train'
982 qpalma.train()
983
984 elif len(sys.argv) == 3:
985 mode = sys.argv[1]
986 param_filename = sys.argv[2]
987 assert mode == 'predict'
988 assert os.path.exists(param_filename)
989 qpalma.evaluate(param_filename)
990 else:
991 print 'You have to choose between training or prediction mode:'
992 print 'python qpalma. py (train|predict) <param_file>'