1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
 7 # to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import cPickle
19 import subprocess
20 import scipy.io
21 import pdb
22 import os.path
23
24 import numpy
25 from numpy.matlib import mat,zeros,ones,inf
26 from numpy.linalg import norm
27
28 import QPalmaDP
29 import qpalma
30 from qpalma.SIQP_CPX import SIQPSolver
31 from qpalma.DataProc import *
32 from qpalma.computeSpliceWeights import *
33 from qpalma.set_param_palma import *
34 from qpalma.computeSpliceAlignWithQuality import *
35 from qpalma.penalty_lookup_new import *
36 from qpalma.compute_donacc import *
37 #from qpalma.export_param import *
38 from qpalma.TrainingParam import Param
39 from qpalma.Plif import Plf
40
41 from qpalma.tools.splicesites import getDonAccScores
42 from qpalma.Configuration import *
43
44 import palma.palma_utils as pu
45 from palma.output_formating import print_results
46
47 class para:
48 pass
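# para serves as a plain container: at the end of train() the learned Plif
# parameters (h, d, a, mmatrix, qualityPlifs) are attached to an instance of it.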
49
50 class QPalma:
51 """
 52    The main class of the QPalma project: implements the training loop and the prediction code.
53 """
54
55 def __init__(self,run):
56 self.ARGS = Param()
57 self.run = run
58
59 if self.run['mode'] == 'normal':
60 self.use_quality_scores = False
61
62 elif self.run['mode'] == 'using_quality_scores':
63 self.use_quality_scores = True
64 else:
65 assert(False)
66
67 full_working_path = os.path.join(run['experiment_path'],run['name'])
68 #assert not os.path.exists(full_working_path)
69 #os.mkdir(full_working_path)
70 assert os.path.exists(full_working_path)
71
72 # ATTENTION: Changing working directory
73 os.chdir(full_working_path)
74
75 cPickle.dump(run,open('run_object.pickle','w+'))
76
77 def plog(self,string):
78 self.logfh.write(string)
79 self.logfh.flush()
80
81 def train(self):
82 run = self.run
83
84 self.logfh = open('_qpalma_train.log','w+')
85
86 self.plog("Settings are:\n")
87 self.plog("%s\n"%str(run))
88
89 data_filename = self.run['dataset_filename']
90 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
91 AlternativeSequences =\
92 paths_load_data(data_filename,'training',None,self.ARGS)
93
 94       # set quality-score usage according to the run mode (same check as in __init__)
95 if self.run['mode'] == 'normal':
96 self.use_quality_scores = False
97
98 elif self.run['mode'] == 'using_quality_scores':
99 self.use_quality_scores = True
100 else:
101 assert(False)
102
103 self.Sequences = Sequences
104 self.Exons = Exons
105 self.Ests = Ests
106 self.OriginalEsts= OriginalEsts
107 self.Qualities = Qualities
108 self.Donors = Donors
109 self.Acceptors = Acceptors
110
111 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
112
113 beg = run['training_begin']
114 end = run['training_end']
115
116 Sequences = Sequences[beg:end]
117 Exons = Exons[beg:end]
118 Ests = Ests[beg:end]
119 OriginalEsts= OriginalEsts[beg:end]
120 Qualities = Qualities[beg:end]
121 Acceptors = Acceptors[beg:end]
122 Donors = Donors[beg:end]
123
124 # number of training instances
125 N = numExamples = len(Sequences)
126 assert len(Exons) == N and len(Ests) == N\
127 and len(Qualities) == N and len(Acceptors) == N\
128 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
129 self.plog('Number of training examples: %d\n'% numExamples)
130
131 self.noImprovementCtr = 0
132 self.oldObjValue = 1e8
133
134 iteration_steps = run['iter_steps']
135 remove_duplicate_scores = run['remove_duplicate_scores']
136 print_matrix = run['print_matrix']
137 anzpath = run['anzpath']
138
 139      # Initialize the parameter vector (a random initialization would be: param = numpy.matlib.rand(126,1))
140 param = Conf.fixedParam[:run['numFeatures']]
141
142 lengthSP = run['numLengthSuppPoints']
143 donSP = run['numDonSuppPoints']
144 accSP = run['numAccSuppPoints']
145 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
146 numq = run['numQualSuppPoints']
147 totalQualSP = run['totalQualSuppPoints']
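      # Layout of the parameter/feature vector as used when filling currentPhi
      # below: [intron length Plif | donor Plif | acceptor Plif | match matrix |
      # quality Plifs]; numFeatures is therefore expected to equal
      # lengthSP + donSP + accSP + mmatrixSP + totalQualSP.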
148
149 # no intron length model
150 if not run['enable_intron_length']:
151 param[:lengthSP] *= 0.0
152
 153      # Set the parameters (limits, penalties) of the Plifs
154 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
155
156 # Initialize solver
157 self.plog('Initializing problem...\n')
158 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
159
160 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
161 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
162
163 # stores the number of alignments done for each example (best path, second-best path etc.)
164 num_path = [anzpath]*numExamples
165 # stores the gap for each example
166 gap = [0.0]*numExamples
167 #############################################################################################
168 # Training
169 #############################################################################################
170 self.plog('Starting training...\n')
171
172 currentPhi = zeros((run['numFeatures'],1))
173 totalQualityPenalties = zeros((totalQualSP,1))
174
175 numConstPerRound = run['numConstraintsPerRound']
176 solver_call_ctr = 0
177
178 suboptimal_example = 0
179 iteration_nr = 0
180 param_idx = 0
181 const_added_ctr = 0
182
183 # the main training loop
184 while True:
185 if iteration_nr == iteration_steps:
186 break
187
188 for exampleIdx in range(numExamples):
189 if (exampleIdx%100) == 0:
190 print 'Current example nr %d' % exampleIdx
191
192 dna = Sequences[exampleIdx]
193 est = Ests[exampleIdx]
194 est = "".join(est)
195 est = est.lower()
196 est = est.replace('-','')
197 original_est = OriginalEsts[exampleIdx]
198 original_est = "".join(original_est)
199 original_est = original_est.lower()
200
201 dna_len = len(dna)
202 est_len = len(est)
203
204 assert len(est) == run['read_size'], pdb.set_trace()
205
206 if run['mode'] == 'normal':
207 quality = [40]*len(est)
208
209 if run['mode'] == 'using_quality_scores':
210 quality = Qualities[exampleIdx]
211
212 if not run['enable_quality_scores']:
213 quality = [40]*len(est)
214
215 exons = Exons[exampleIdx]
216 don_supp = Donors[exampleIdx]
217 acc_supp = Acceptors[exampleIdx]
218
219 if not run['enable_splice_signals']:
220 for idx,elem in enumerate(don_supp):
221 if elem != -inf:
222 don_supp[idx] = 0.0
223
224 for idx,elem in enumerate(acc_supp):
225 if elem != -inf:
226 acc_supp[idx] = 0.0
227
 228            # compute the parameters of the true alignment (but with untrained d,a,h ...)
229 if run['mode'] == 'using_quality_scores':
230 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
231 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
232 quality, qualityPlifs,run)
233 else:
 234               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
                   dna_calc = dna   # fallback: dna_calc is only returned in the quality-score branch above
235
236 dna_calc = dna_calc.replace('-','')
237
238 # Calculate the weights
239 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
240 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
241
242 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
243 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
244 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
245 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
246
247 if run['mode'] == 'using_quality_scores':
248 totalQualityPenalties = param[-totalQualSP:]
249 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
250
251 # Calculate w'phi(x,y) the total score of the alignment
252 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
253
254 # The allWeights vector is supposed to store the weight parameter
255 # of the true alignment as well as the weight parameters of the
256 # num_path[exampleIdx] other alignments
257 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
258 allWeights[:,0] = trueWeight[:,0]
259
260 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
261 AlignmentScores[0] = trueAlignmentScore
262 ################## Calculate wrong alignment(s) ######################
263 # Compute donor, acceptor with penalty_lookup_new
264 # returns two double lists
265 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
266
267 #myalign wants the acceptor site on the g of the ag
268 acceptor = acceptor[1:]
269 acceptor.append(-inf)
270
271 # check that splice site scores are at dna positions as expected by
272 # the dynamic programming component
273
274 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
275 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
276 # or dna[d_pos+1] == 't'), pdb.set_trace()
277 #
278 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
279 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
280
281 ps = h.convert2SWIG()
282
283 prb = QPalmaDP.createDoubleArrayFromList(quality)
284 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
285
286 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
287 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
288
289 d_len = len(donor)
290 donor = QPalmaDP.createDoubleArrayFromList(donor)
291 a_len = len(acceptor)
292 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
293
294 # Create the alignment object representing the interface to the C/C++ code.
295 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
296 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
297 #print 'Calling myalign...'
 298            # calculates SpliceAlign, EstAlign, weightMatch, overall scores, dnaest
299 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
300 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
301 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
302 print_matrix)
303
304 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
305 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
306 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
307 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
308
309 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*num_path[exampleIdx]))
310
311 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
312 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
313
314 #print 'After calling getAlignmentResults...'
315
316 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
317 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
318 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
319 newDPScores = zeros((num_path[exampleIdx],1))
320 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*num_path[exampleIdx],1))
321
322 #print 'newSpliceAlign'
323 for i in range(dna_len*num_path[exampleIdx]):
324 newSpliceAlign[i] = c_SpliceAlign[i]
325 # print '%f' % (spliceAlign[i])
326
327 #print 'newEstAlign'
328 for i in range(est_len*num_path[exampleIdx]):
329 newEstAlign[i] = c_EstAlign[i]
330 # print '%f' % (spliceAlign[i])
331
332 #print 'weightMatch'
333 for i in range(mm_len*num_path[exampleIdx]):
334 newWeightMatch[i] = c_WeightMatch[i]
335 # print '%f' % (weightMatch[i])
336
337 #print 'ViterbiScores'
338 for i in range(num_path[exampleIdx]):
339 newDPScores[i] = c_DPScores[i]
340
341 if self.use_quality_scores:
342 for i in range(run['totalQualSuppPoints']*num_path[exampleIdx]):
343 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
344
345 # equals palma up to here
346
347 #print "Calling destructors"
348 del c_SpliceAlign
349 del c_EstAlign
350 del c_WeightMatch
351 del c_DPScores
352 del c_qualityPlifsFeatures
353 #del currentAlignment
354
355 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
356 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
357
358 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
 359            # Calculate the weights of the respective alignments. Note that we are
 360            # calculating n-best alignments without Hamming loss, so we
 361            # have to keep track of which of the n-best alignments correspond to
 362            # the true one in order not to incorporate a true alignment into the
 363            # constraints. To keep track of the true and false alignments we
 364            # define an array true_map with a boolean indicating the
 365            # equivalence to the true alignment for each decoded alignment.
366 true_map = [0]*(num_path[exampleIdx]+1)
367 true_map[0] = 1
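            # e.g. true_map == [1, 0, 1] would mean that the second decoded path is
            # (numerically) identical to the true alignment while the first one is not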
368
369 for pathNr in range(num_path[exampleIdx]):
370 #print 'decodedWeights'
371 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
372 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
373 acc_supp)
374
375 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
376 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
 377               # store the weights of this path in the remaining columns of the matrix
378 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
379
380 hpen = mat(h.penalties).reshape(len(h.penalties),1)
381 dpen = mat(d.penalties).reshape(len(d.penalties),1)
382 apen = mat(a.penalties).reshape(len(a.penalties),1)
383 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
384
385 #pdb.set_trace()
386
387 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
388
389 distinct_scores = False
390 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
391 distinct_scores = True
392
393 #pdb.set_trace()
394
 395               # Check whether the scalar product + loss equals the Viterbi score
396 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
 397                  self.plog("Scalar prod. + loss does not equal the Viterbi output!\n")
398 pdb.set_trace()
399
400 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
401 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
402
403 #pdb.set_trace()
404
405 # if the pathNr-best alignment is very close to the true alignment consider it as true
406 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
407 true_map[pathNr+1] = 1
408
409
410 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
411 print "suboptimal_example %d\n" %exampleIdx
412 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
413 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
414
415 #pdb.set_trace()
416 suboptimal_example += 1
417 self.plog("suboptimal_example %d\n" %exampleIdx)
418
419
 420            # The true label sequence should not score higher than the best decoded one.
 421            # If all n-best paths are equivalent to the true alignment they are too close to it,
 422            # so we have to extend the n-best search to an (n+1)-best search.
423 if len([elem for elem in true_map if elem == 1]) == len(true_map):
424 num_path[exampleIdx] = num_path[exampleIdx]+1
425
426 # Choose true and first false alignment for extending
427 firstFalseIdx = -1
428 for map_idx,elem in enumerate(true_map):
429 if elem == 0:
430 firstFalseIdx = map_idx
431 break
432
433 if False:
434 self.plog("Is considered as: %d\n" % true_map[1])
435
436 result_len = currentAlignment.getResultLength()
437 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
438 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
439
440 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
441
442 dna_array = [0.0]*result_len
443 est_array = [0.0]*result_len
444
445 for r_idx in range(result_len):
446 dna_array[r_idx] = c_dna_array[r_idx]
447 est_array[r_idx] = c_est_array[r_idx]
448
449 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
450 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
451 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
452 self.plog(line1+'\n')
453 self.plog(line2+'\n')
454 self.plog(line3+'\n')
455
456 # if there is at least one useful false alignment add the
457 # corresponding constraints to the optimization problem
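            # (intuitively, each such constraint asks the solver for a w with
            # w * (phi(true) - phi(firstFalse)) >= margin - slack; the exact margin
            # and loss terms are presumably handled inside SIQPSolver)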
458 if firstFalseIdx != -1:
459 firstFalseWeights = allWeights[:,firstFalseIdx]
460 differenceVector = trueWeight - firstFalseWeights
461 #pdb.set_trace()
462
463 const_added = solver.addConstraint(differenceVector, exampleIdx)
464 const_added_ctr += 1
465 #
466 # end of one example processing
467 #
468
 469         # call the solver after every numConstPerRound examples / added constraints
470 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
471 objValue,w,self.slacks = solver.solve()
472 solver_call_ctr += 1
473
474 if solver_call_ctr == 5:
475 numConstPerRound = 200
476 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
477
478 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
479 self.noImprovementCtr += 1
480
481 if self.noImprovementCtr == numExamples+1:
482 break
483
484 self.oldObjValue = objValue
485 print "objValue is %f" % objValue
486
487 sum_xis = 0
488 for elem in self.slacks:
489 sum_xis += elem
490
491 print 'sum of slacks is %f'% sum_xis
492 self.plog('sum of slacks is %f\n'% sum_xis)
493
494 for i in range(len(param)):
495 param[i] = w[i]
496
497 #pdb.set_trace()
498
499 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
500 param_idx += 1
501 [h,d,a,mmatrix,qualityPlifs] =\
502 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
503
504 #
505 # end of one iteration through all examples
506 #
507
508 self.plog("suboptimal rounds %d\n" %suboptimal_example)
509
510 if self.noImprovementCtr == numExamples*2:
511 break
512
513 iteration_nr += 1
514
515 #
516 # end of optimization
517 #
518 print 'Training completed'
519
520 pa = para()
521 pa.h = h
522 pa.d = d
523 pa.a = a
524 pa.mmatrix = mmatrix
525 pa.qualityPlifs = qualityPlifs
526
527 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
528 #cPickle.dump(pa,open('elegans.param','w+'))
529 self.logfh.close()
530
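   # -----------------------------------------------------------------------
   # Illustrative sketch (a hypothetical helper, not part of the original
   # QPalma code): how a single margin constraint of the kind handed to
   # solver.addConstraint() in train() can be checked for violation.  The
   # fixed margin of 1.0 and the absence of a loss term are simplifying
   # assumptions, not QPalma's exact formulation.
   def _margin_violated(self, w, phi_true, phi_wrong, slack=0.0, margin=1.0):
      import numpy
      # scores are plain dot products w'phi(x,y), cf. trueAlignmentScore above
      score_true  = float(numpy.dot(numpy.asarray(w).ravel(), numpy.asarray(phi_true).ravel()))
      score_wrong = float(numpy.dot(numpy.asarray(w).ravel(), numpy.asarray(phi_wrong).ravel()))
      # the constraint w*(phi_true - phi_wrong) >= margin - slack is violated if:
      return score_true - score_wrong < margin - slack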
531 ###############################################################################
532 #
533 # End of the code needed for training
534 #
535 #
536 #
537 #
538 #
539 # Begin of code for prediction
540 #
541 #
542 ###############################################################################
543
544
545 def evaluate(self,param_filename):
546 run = self.run
547 beg = run['prediction_begin']
548 end = run['prediction_end']
549
550 data_filename = self.run['dataset_filename']
551 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
552 AlternativeSequences=\
553 paths_load_data(data_filename,'training',None,self.ARGS)
554
555 self.Sequences = Sequences
556 self.Exons = Exons
557 self.Ests = Ests
558 self.OriginalEsts= OriginalEsts
559 self.Qualities = Qualities
560 self.Donors = Donors
561 self.Acceptors = Acceptors
562
563 self.AlternativeSequences = AlternativeSequences
564
565 #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
566 #print 'leaving constructor...'
567
568 self.logfh = open('_qpalma_predict.log','w+')
569
570 # predict on training set
571 print '##### Prediction on the training set #####'
572 self.predict(param_filename,0,beg)
573
574 # predict on test set
575 print '##### Prediction on the test set #####'
576 self.predict(param_filename,beg,end)
577
578 self.logfh.close()
579 #pdb.set_trace()
580
581 def predict(self,param_filename,beg,end):
582 """
 583      Predict alignments for the examples in the range [beg,end) using the
 584      parameters stored in param_filename.
585 """
586
587 run = self.run
588
589 if self.run['mode'] == 'normal':
590 self.use_quality_scores = False
591
592 elif self.run['mode'] == 'using_quality_scores':
593 self.use_quality_scores = True
594 else:
595 assert(False)
596
597 Sequences = self.Sequences[beg:end]
598 Exons = self.Exons[beg:end]
599 Ests = self.Ests[beg:end]
600 OriginalEsts = self.OriginalEsts[beg:end]
601 Qualities = self.Qualities[beg:end]
602 Acceptors = self.Acceptors[beg:end]
603 Donors = self.Donors[beg:end]
604 #SplitPos = self.SplitPositions[beg:end]
605 #FReads = self.FReads[beg:end]
606
607 AlternativeSequences = self.AlternativeSequences[beg:end]
608
609 print 'STARTING PREDICTION'
610
 611      # number of examples to predict on
612 N = numExamples = len(Sequences)
613 assert len(Exons) == N and len(Ests) == N\
614 and len(Qualities) == N and len(Acceptors) == N\
615 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
 616      self.plog('Number of examples to predict: %d\n'% numExamples)
617
618 self.noImprovementCtr = 0
619 self.oldObjValue = 1e8
620
621 remove_duplicate_scores = self.run['remove_duplicate_scores']
622 print_matrix = self.run['print_matrix']
623 anzpath = self.run['anzpath']
624
625 param = cPickle.load(open(param_filename))
626
 627      # Set the parameters (limits, penalties) of the Plifs
628 [h,d,a,mmatrix,qualityPlifs] =\
629 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
630
631 #############################################################################################
632 # Prediction
633 #############################################################################################
634 self.plog('Starting prediction...\n')
635
636 donSP = self.run['numDonSuppPoints']
637 accSP = self.run['numAccSuppPoints']
638 lengthSP = self.run['numLengthSuppPoints']
639 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
640 numq = self.run['numQualSuppPoints']
641 totalQualSP = self.run['totalQualSuppPoints']
642
643 currentPhi = zeros((self.run['numFeatures'],1))
644 totalQualityPenalties = zeros((totalQualSP,1))
645
646 # some datastructures storing the predictions
647 exon1Begin = []
648 exon1End = []
649 exon2Begin = []
650 exon2End = []
651 allWrongExons = []
652
653 allRealCorrectSplitPos = []
654 allRealWrongSplitPos = []
655
656 allPredictions = []
657
658 # beginning of the prediction loop
659 #
660 for exampleIdx in range(numExamples):
661 dna = Sequences[exampleIdx]
662 est = Ests[exampleIdx]
663
664 new_est = ''
665 e = 0
666 while True:
667 if not e < len(est):
668 break
669
 670            if est[e] == '[':   # a 4-character token '[xy]' is collapsed to its second character
671 new_est += est[e+2]
672 e += 4
673 else:
674 new_est += est[e]
675 e += 1
676
677 est = new_est
678
679 print exampleIdx
680
681 est = "".join(est)
682 est = est.lower()
683 est = est.replace('-','')
684
685 original_est = OriginalEsts[exampleIdx]
686 original_est = "".join(original_est)
687 original_est = original_est.lower()
688
689 #currentSplitPos = SplitPos[exampleIdx]
690
691 #import re
692 #num_hits = len(re.findall('\[',original_est[:currentSplitPos]))
693
694 #if num_hits == 0:
695 # currentRealSplitPos = currentSplitPos
696 #else:
697 # currentRealSplitPos = currentSplitPos - (num_hits*3)
698 currentRealSplitPos = 0
699
700 #assert 0 < currentRealSplitPos < 36, pdb.set_trace()
701 currentRealRestPos = 36 - currentRealSplitPos
702 #assert 0 < currentRealRestPos < 36, pdb.set_trace()
703
704 if self.run['mode'] == 'normal':
705 quality = [40]*len(est)
706
707 if self.run['mode'] == 'using_quality_scores':
708 quality = Qualities[exampleIdx]
709
710 if not run['enable_quality_scores']:
711 quality = [40]*len(est)
712
713 #quality = fRead['prb']
714
            # true exon boundaries taken from the per-read record fRead, made relative
            # to the read start p_start (note: the loading of FReads is commented out
            # above, so fRead is undefined at this point)
 715            exons = zeros((2,2))
716
717 exons[0,0] = fRead['p_start']-fRead['p_start']
718 exons[0,1] = fRead['exon_stop']-fRead['p_start']
719 exons[1,0] = fRead['exon_start']-fRead['p_start']
720 exons[1,1] = fRead['p_stop']-fRead['p_start']
721
722 don_supp = Donors[exampleIdx]
723 acc_supp = Acceptors[exampleIdx]
724
725 if not run['enable_splice_signals']:
726
727 for idx,elem in enumerate(don_supp):
728 if elem != -inf:
729 don_supp[idx] = 0.0
730
731 for idx,elem in enumerate(acc_supp):
732 if elem != -inf:
733 acc_supp[idx] = 0.0
734
735 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
736 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
737 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
738 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
739
740 ################## Calculate alignment(s) ######################
741
742 # Compute donor, acceptor with penalty_lookup_new
743 # returns two double lists
744 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
745
746 #myalign wants the acceptor site on the g of the ag
747 acceptor = acceptor[1:]
748 acceptor.append(-inf)
749
750 dna = str(dna)
751 est = str(est)
752 dna_len = len(dna)
753 est_len = len(est)
754
755 ps = h.convert2SWIG()
756
757 prb = QPalmaDP.createDoubleArrayFromList(quality)
758 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
759
760 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
761 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
762
763 d_len = len(donor)
764 donor = QPalmaDP.createDoubleArrayFromList(donor)
765 a_len = len(acceptor)
766 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
767
768 # Create the alignment object representing the interface to the C/C++ code.
769 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
770
771 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
772
 773            # calculates SpliceAlign, EstAlign, weightMatch, overall scores, dnaest
774 currentAlignment.myalign(1, dna, dna_len,\
775 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
776 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
777 print_matrix)
778
779 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
780 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
781 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
782 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
783
784 result_len = currentAlignment.getResultLength()
785 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
786 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
787
788 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
789
790 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(self.run['totalQualSuppPoints']))
791
792 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
793 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
794
795 newSpliceAlign = zeros((dna_len,1))
796 newEstAlign = zeros((est_len,1))
797 newWeightMatch = zeros((mm_len,1))
798 newQualityPlifsFeatures = zeros((self.run['totalQualSuppPoints'],1))
799
800 for i in range(dna_len):
801 newSpliceAlign[i] = c_SpliceAlign[i]
802
803 for i in range(est_len):
804 newEstAlign[i] = c_EstAlign[i]
805
806 for i in range(mm_len):
807 newWeightMatch[i] = c_WeightMatch[i]
808
809 newDPScores = c_DPScores[0]
810
811 for i in range(run['totalQualSuppPoints']):
812 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
813
814 del c_SpliceAlign
815 del c_EstAlign
816 del c_WeightMatch
817 del c_DPScores
818 del c_qualityPlifsFeatures
819 del currentAlignment
820
821 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
822 newWeightMatch = newWeightMatch.reshape(1,mm_len)
823 true_map = [0]*2
824 true_map[0] = 1
825 pathNr = 0
826
827 dna_array = [0.0]*result_len
828 est_array = [0.0]*result_len
829
830 for r_idx in range(result_len):
831 dna_array[r_idx] = c_dna_array[r_idx]
832 est_array[r_idx] = c_est_array[r_idx]
833
834 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
835 _newEstAlign = newEstAlign.flatten().tolist()[0]
836
837 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
838 self.plog(line1+'\n')
839 self.plog(line2+'\n')
840 self.plog(line3+'\n')
841
842 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,exampleIdx)
843
844 one_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,
845 'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
846 'predExons':newExons, 'trueExons':exons,\
847 'dna':dna, 'est':est, 'exampleIdx':exampleIdx,\
848 'realRestPos':currentRealRestPos,\
849 'DPScores':newDPScores,
850 'fRead':fRead
851 }
852
853 allPredictions.append(one_prediction)
854
855 if e1_b_off != None:
856 exon1Begin.append(e1_b_off)
857 exon1End.append(e1_e_off)
858 exon2Begin.append(e2_b_off)
859 exon2End.append(e2_e_off)
860 allRealCorrectSplitPos.append(currentRealRestPos)
861 else:
862 allWrongExons.append((newExons,exons))
863 allRealWrongSplitPos.append(currentRealRestPos)
864
865 logfile = self.logfh
866
867 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
868 print >> logfile, 'example %d correct' % exampleIdx
869 else:
870 print >> logfile, 'example %d wrong' % exampleIdx
871
872 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
873 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
874
875 correct_rest2error = {}
876 off_rest2error = {}
877 wrong_rest2error = {}
878
879 for e_idx in range(len(exon1Begin)):
880 e1b = exon1Begin[e_idx]
881 e1e = exon1End[e_idx]
882
883 e2b = exon2Begin[e_idx]
884 e2e = exon2End[e_idx]
885
886 current_elem = allRealCorrectSplitPos[e_idx]
887
888 if e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0:
889 try:
890 correct_rest2error[current_elem] += 1
891 except:
892 correct_rest2error[current_elem] = 1
893
894 if not (e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0):
895 try:
896 off_rest2error[current_elem] += 1
897 except:
898 off_rest2error[current_elem] = 1
899
900
901 for e_idx in range(len(allWrongExons)):
902
903 current_elem = allRealWrongSplitPos[e_idx]
904
905 try:
906 wrong_rest2error[current_elem] += 1
907 except:
908 wrong_rest2error[current_elem] = 1
909
910 all_pos_correct = 0
911 for idx in range(len(exon1Begin)):
912 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
913 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
914 all_pos_correct += 1
915
916 logfile = self.logfh
917 print >> logfile, 'Total num. of examples: %d' % numExamples
918 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
919 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
920 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
921 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
922
923 print 'Total num. of examples: %d' % numExamples
924 print 'Number of total correct examples: %d' % all_pos_correct
925 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
926 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
927 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
928
929 cPickle.dump(allPredictions,open('%s_allPredictions'%run['name'],'w+'))
930
931 print 'Prediction completed'
932
933 def evaluatePositions(self,eBegin,eEnd):
934
935 eBegin_pos = [elem for elem in eBegin if elem == 0]
936 eBegin_neg = [elem for elem in eBegin if elem != 0]
937 eEnd_pos = [elem for elem in eEnd if elem == 0]
938 eEnd_neg = [elem for elem in eEnd if elem != 0]
939
940 mean_eBegin_neg = 0
941 for idx in range(len(eBegin_neg)):
942 mean_eBegin_neg += eBegin_neg[idx]
943
944 try:
945 mean_eBegin_neg /= 1.0*len(eBegin_neg)
946 except:
947 mean_eBegin_neg = -1
948
949 mean_eEnd_neg = 0
950 for idx in range(len(eEnd_neg)):
951 mean_eEnd_neg += eEnd_neg[idx]
952
953 try:
954 mean_eEnd_neg /= 1.0*len(eEnd_neg)
955 except:
956 mean_eEnd_neg = -1
957
958 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
959
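   # Illustrative sketch (hypothetical helper) of the convention used by
   # evaluatePositions(): an offset of 0 counts as a correct boundary, any other
   # value as incorrect, and the mean is taken over the incorrect offsets only
   # (-1 if there are none).
   def _offset_stats(self, offsets):
      correct = [o for o in offsets if o == 0]
      wrong   = [o for o in offsets if o != 0]
      mean_wrong = sum(wrong) / (1.0*len(wrong)) if wrong else -1
      # e.g. _offset_stats([0, 0, 3, 0, 7]) -> (3, 2, 5.0)
      return len(correct), len(wrong), mean_wrong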
960 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,exampleIdx):
961 newExons = []
962 oldElem = -1
963 SpliceAlign = SpliceAlign.flatten().tolist()[0]
964 SpliceAlign.append(-1)
965 for pos,elem in enumerate(SpliceAlign):
966 if pos == 0:
967 oldElem = -1
968 else:
969 oldElem = SpliceAlign[pos-1]
970
971 if oldElem != 0 and elem == 0: # start of exon
972 newExons.append(pos)
973
974 if oldElem == 0 and elem != 0: # end of exon
975 newExons.append(pos)
976
977 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
978
979 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
980 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
981 self.plog(("read: " + str(est)+ "\n"))
982
983 if len(newExons) == 4:
984 e1_begin,e1_end = newExons[0],newExons[1]
985 e2_begin,e2_end = newExons[2],newExons[3]
986 else:
987 return None,None,None,None,newExons
988
989 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
990 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
991
992 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
993 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
994
995 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
996
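# Illustrative sketch (hypothetical standalone version of the boundary extraction
# in evaluateExample() above): positions where the label sequence enters state 0
# mark exon starts, positions where it leaves state 0 mark (exclusive) exon ends.
def extract_exon_boundaries(labels):
   labels = list(labels) + [-1]             # sentinel so a trailing exon is closed
   boundaries = []
   for pos in range(len(labels)):
      oldElem = labels[pos-1] if pos > 0 else -1
      if oldElem != 0 and labels[pos] == 0:  # start of an exon
         boundaries.append(pos)
      if oldElem == 0 and labels[pos] != 0:  # end of an exon
         boundaries.append(pos)
   return boundaries

# extract_exon_boundaries([1,1,0,0,0,2,2,0,0]) == [2, 5, 7, 9]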
 997 def calcStat(Sequences,Acceptors,Donors,Exons):
998 maxAcc = -100
999 minAcc = 100
1000 maxDon = -100
1001 minDon = 100
1002
1003 acc_vec_pos = []
1004 acc_vec_neg = []
1005 don_vec_pos = []
1006 don_vec_neg = []
1007
1008 for jdx in range(len(Acceptors)):
1009 currentExons = Exons[jdx]
1010 currentAcceptor = Acceptors[jdx]
1011 currentAcceptor = currentAcceptor[1:]
1012 currentAcceptor.append(-inf)
1013 currentDonor = Donors[jdx]
1014
1015 for idx,elem in enumerate(currentAcceptor):
1016 if idx == (int(currentExons[1,0])-1): # acceptor site
1017 acc_vec_pos.append(elem)
1018 else:
1019 acc_vec_neg.append(elem)
1020
1021 for idx,elem in enumerate(currentDonor):
1022 if idx == (int(currentExons[0,1])): # donor site
1023 don_vec_pos.append(elem)
1024 else:
1025 don_vec_neg.append(elem)
1026
1027 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
1028 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
1029 don_pos = [elem for elem in don_vec_pos if elem != -inf]
1030 don_neg = [elem for elem in don_vec_neg if elem != -inf]
1031
1032 for idx in range(len(Sequences)):
1033 acc = [elem for elem in Acceptors[idx] if elem != -inf]
1034 maxAcc = max(max(acc),maxAcc)
1035 minAcc = min(min(acc),minAcc)
1036
1037 don = [elem for elem in Donors[idx] if elem != -inf]
1038 maxDon = max(max(don),maxDon)
1039 minDon = min(min(don),minDon)
1040
1041 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
1042 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
1043 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
1044
1045 t_strand = '+'
1046 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
1047 #(gap_char, 5 nucleotides, intron_char)
1048 comparison_char = '| ' #first: match_char, second: intron_char
1049
1050 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
1051 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
1052
1053 first_identity = None
1054 last_identity = None
1055 for i in range(len(comparison)):
1056 if comparison[i] == '|' and first_identity is None:
1057 first_identity = i
1058 if comparison[-i] == '|' and last_identity is None:
1059 last_identity = len(comparison) - i - 1
1060
1061 try:
1062 for idx in range(len(dna_array)):
1063 dna_array[idx] = translation[int(dna_array[idx])]
1064 est_array[idx] = translation[int(est_array[idx])]
1065 except:
1066 pdb.set_trace()
1067
1068 line1 = "".join(dna_array)
1069 line2 = comparison
1070 line3 = "".join(est_array)
1071
1072 return line1,line2,line3
1073
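# Illustrative sketch (hypothetical helper distilled from the decoding loop in
# pprint_alignment()): index 0 of the translation string is the gap character,
# indices 1-5 are the nucleotides A,C,G,T,N and index 6 marks intron positions.
def _decode_alignment_row(encoded, translation='-ACGTN_'):
   # e.g. _decode_alignment_row([1, 2, 3, 0, 4, 6, 6, 2]) == 'ACG-T__C'
   return "".join([translation[int(i)] for i in encoded])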
1074 def calc_info(acc,don,exons,qualities):
1075 min_score = 100
1076 max_score = -100
1077
1078 for elem in acc:
1079 for score in elem:
1080 if score != -inf:
1081 min_score = min(min_score,score)
1082 max_score = max(max_score,score)
1083
1084 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
1085
1086 min_score = 100
1087 max_score = -100
1088
1089 for elem in don:
1090 for score in elem:
1091 if score != -inf:
1092 min_score = min(min_score,score)
1093 max_score = max(max_score,score)
1094
1095 print 'Donor max/min: %f / %f' % (max_score,min_score)
1096
1097 min_score = 10000
1098 max_score = 0
1099 mean_intron_len = 0
1100
1101 for elem in exons:
1102 intron_len = int(elem[1,0] - elem[0,1])
1103 mean_intron_len += intron_len
1104 min_score = min(min_score,intron_len)
1105 max_score = max(max_score,intron_len)
1106
1107 mean_intron_len /= 1.0*len(exons)
1108 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
1109
1110 min_score = 10000
1111 max_score = 0
1112 mean_quality = 0
1113
1114 for qVec in qualities:
1115 for elem in qVec:
1116 min_score = min(min_score,elem)
1117 max_score = max(max_score,elem)
1118
 1119    print 'Quality values max/min: %d / %d' % (max_score,min_score)
1120
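# Illustrative sketch (hypothetical helper): the -inf-aware min/max scans used in
# calc_info() and calcStat(), reduced to a single function ('inf' comes from the
# numpy.matlib import at the top of this file).
def _finite_min_max(score_lists):
   scores = [s for lst in score_lists for s in lst if s != -inf]
   return min(scores), max(scores)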
1121
1122 ###########################
1123 # A simple command line
1124 # interface
1125 ###########################
1126
1127 if __name__ == '__main__':
1128 mode = sys.argv[1]
1129 run_obj_fn = sys.argv[2]
1130
1131 run_obj = cPickle.load(open(run_obj_fn))
1132
1133 qpalma = QPalma(run_obj)
1134
1135
1136 if len(sys.argv) == 3 and mode == 'train':
1137 qpalma.train()
1138
1139 elif len(sys.argv) == 4 and mode == 'predict':
1140 param_filename = sys.argv[3]
1141 assert os.path.exists(param_filename)
1142 qpalma.evaluate(param_filename)
1143 else:
 1144       print 'You have to choose between training and prediction mode:'
 1145       print 'usage: python qpalma_main.py (train|predict) <run_object.pickle> [<param_file>]'
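# ---------------------------------------------------------------------------
# Usage sketch.  The run-object keys shown are only the ones referenced in this
# file; the full set of keys is defined via qpalma.Configuration.  Note that the
# directory <experiment_path>/<name> has to exist before starting a run.
#
#   run = {'mode': 'using_quality_scores', 'experiment_path': '/tmp/experiments',
#          'name': 'run_01', 'dataset_filename': 'dataset.pickle', ...}
#   cPickle.dump(run, open('run_01.pickle', 'w+'))
#
#   python qpalma_main.py train   run_01.pickle
#   python qpalma_main.py predict run_01.pickle param_0.pickle
# ---------------------------------------------------------------------------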