+ minor changes
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import cPickle
19 import subprocess
20 import scipy.io
21 import pdb
22 import re
23 import os.path
24 import pydb
25
26 from compile_dataset import getSpliceScores
27
28 import numpy
29 from numpy.matlib import mat,zeros,ones,inf
30 from numpy.linalg import norm
31
32 import QPalmaDP
33 import qpalma
34 from qpalma.SIQP_CPX import SIQPSolver
35 from qpalma.DataProc import *
36 from qpalma.computeSpliceWeights import *
37 from qpalma.set_param_palma import *
38 from qpalma.computeSpliceAlignWithQuality import *
39 from qpalma.penalty_lookup_new import *
40 from qpalma.compute_donacc import *
41 from qpalma.TrainingParam import Param
42 from qpalma.Plif import Plf
43
44 from qpalma.tools.splicesites import getDonAccScores
45 from qpalma.Configuration import *
46
47 # this two imports are needed for the load genomic resp. interval query
48 # functions
49 from Genefinding import *
50 from genome_utils import load_genomic
51
52 from Utils import calc_stat, calc_info, pprint_alignment
53
54 class QPalma:
55 """
56 A training method for the QPalma project
57 """
58
59 def __init__(self,run):
60 self.ARGS = Param()
61 self.run = run
62
63 if self.run['mode'] == 'normal':
64 self.use_quality_scores = False
65
66 elif self.run['mode'] == 'using_quality_scores':
67 self.use_quality_scores = True
68 else:
69 assert(False)
70
71 full_working_path = os.path.join(run['experiment_path'],run['name'])
72 #assert not os.path.exists(full_working_path)
73 #os.mkdir(full_working_path)
74 assert os.path.exists(full_working_path)
75
76 # ATTENTION: Changing working directory
77 os.chdir(full_working_path)
78
79 cPickle.dump(run,open('run_object.pickle','w+'))
80
81 def plog(self,string):
82 self.logfh.write(string)
83 self.logfh.flush()
84
85
86 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
87 """
88 Given the needed input this method calls the QPalma C module which
89 calculates a dynamic programming in order to obtain an alignment
90 """
91 run = self.run
92
93 dna_len = len(dna)
94 est_len = len(est)
95
96 prb = QPalmaDP.createDoubleArrayFromList(quality)
97 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
98
99 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
100 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
101
102 d_len = len(donor)
103 donor = QPalmaDP.createDoubleArrayFromList(donor)
104 a_len = len(acceptor)
105 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
106
107 # Create the alignment object representing the interface to the C/C++ code.
108 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
109 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
110 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
111 currentAlignment.myalign( current_num_path, dna, dna_len,\
112 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
113 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
114 print_matrix)
115
116 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
117 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
118 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
119 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
120
121 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
122
123 if prediction_mode:
124 # part that is only needed for prediction
125 result_len = currentAlignment.getResultLength()
126 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
127 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
128
129 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
130
131 dna_array = [0.0]*result_len
132 est_array = [0.0]*result_len
133
134 for r_idx in range(result_len):
135 dna_array[r_idx] = c_dna_array[r_idx]
136 est_array[r_idx] = c_est_array[r_idx]
137
138 else:
139 dna_array = None
140 est_array = None
141
142 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
143 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
144
145 #print 'After calling getAlignmentResults...'
146
147 newSpliceAlign = zeros((current_num_path*dna_len,1))
148 newEstAlign = zeros((est_len*current_num_path,1))
149 newWeightMatch = zeros((current_num_path*mm_len,1))
150 newDPScores = zeros((current_num_path,1))
151 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
152
153 for i in range(dna_len*current_num_path):
154 newSpliceAlign[i] = c_SpliceAlign[i]
155
156 for i in range(est_len*current_num_path):
157 newEstAlign[i] = c_EstAlign[i]
158
159 for i in range(mm_len*current_num_path):
160 newWeightMatch[i] = c_WeightMatch[i]
161
162 for i in range(current_num_path):
163 newDPScores[i] = c_DPScores[i]
164
165 if self.use_quality_scores:
166 for i in range(run['totalQualSuppPoints']*current_num_path):
167 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
168
169 del c_SpliceAlign
170 del c_EstAlign
171 del c_WeightMatch
172 del c_DPScores
173 del c_qualityPlifsFeatures
174 del currentAlignment
175
176 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
177 newQualityPlifsFeatures, dna_array, est_array
178
179
180 def train(self):
181 run = self.run
182
183 self.logfh = open('_qpalma_train.log','w+')
184
185 self.plog("Settings are:\n")
186 self.plog("%s\n"%str(run))
187
188 data_filename = self.run['dataset_filename']
189 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
190 UpCut, AlternativeSequences =\
191 paths_load_data(data_filename,'training',None,self.ARGS)
192
193 # Load the whole dataset
194 if self.run['mode'] == 'normal':
195 self.use_quality_scores = False
196
197 elif self.run['mode'] == 'using_quality_scores':
198 self.use_quality_scores = True
199 else:
200 assert(False)
201
202 self.Sequences = Sequences
203 self.Exons = Exons
204 self.Ests = Ests
205 self.OriginalEsts= OriginalEsts
206 self.Qualities = Qualities
207 self.Donors = Donors
208 self.Acceptors = Acceptors
209
210 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
211
212 beg = run['training_begin']
213 end = run['training_end']
214
215 Sequences = Sequences[beg:end]
216 Exons = Exons[beg:end]
217 Ests = Ests[beg:end]
218 OriginalEsts= OriginalEsts[beg:end]
219 Qualities = Qualities[beg:end]
220 Acceptors = Acceptors[beg:end]
221 Donors = Donors[beg:end]
222
223 # number of training instances
224 N = numExamples = len(Sequences)
225 assert len(Exons) == N and len(Ests) == N\
226 and len(Qualities) == N and len(Acceptors) == N\
227 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
228 self.plog('Number of training examples: %d\n'% numExamples)
229
230 self.noImprovementCtr = 0
231 self.oldObjValue = 1e8
232
233 iteration_steps = run['iter_steps']
234 remove_duplicate_scores = run['remove_duplicate_scores']
235 print_matrix = run['print_matrix']
236 anzpath = run['anzpath']
237
238 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
239 param = Conf.fixedParam[:run['numFeatures']]
240
241 lengthSP = run['numLengthSuppPoints']
242 donSP = run['numDonSuppPoints']
243 accSP = run['numAccSuppPoints']
244 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
245 numq = run['numQualSuppPoints']
246 totalQualSP = run['totalQualSuppPoints']
247
248 # no intron length model
249 if not run['enable_intron_length']:
250 param[:lengthSP] *= 0.0
251
252 # Set the parameters such as limits penalties for the Plifs
253 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
254
255 # Initialize solver
256 self.plog('Initializing problem...\n')
257 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
258
259 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
260 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
261
262 # stores the number of alignments done for each example (best path, second-best path etc.)
263 num_path = [anzpath]*numExamples
264 # stores the gap for each example
265 gap = [0.0]*numExamples
266 #############################################################################################
267 # Training
268 #############################################################################################
269 self.plog('Starting training...\n')
270
271 currentPhi = zeros((run['numFeatures'],1))
272 totalQualityPenalties = zeros((totalQualSP,1))
273
274 numConstPerRound = run['numConstraintsPerRound']
275 solver_call_ctr = 0
276
277 suboptimal_example = 0
278 iteration_nr = 0
279 param_idx = 0
280 const_added_ctr = 0
281
282 # the main training loop
283 while True:
284 if iteration_nr == iteration_steps:
285 break
286
287 for exampleIdx in range(numExamples):
288 if (exampleIdx%100) == 0:
289 print 'Current example nr %d' % exampleIdx
290
291 dna = Sequences[exampleIdx]
292 est = Ests[exampleIdx]
293 est = "".join(est)
294 est = est.lower()
295 est = est.replace('-','')
296 original_est = OriginalEsts[exampleIdx]
297 original_est = "".join(original_est)
298 original_est = original_est.lower()
299
300 dna_len = len(dna)
301 est_len = len(est)
302
303 assert len(est) == run['read_size'], pdb.set_trace()
304
305 if run['mode'] == 'normal':
306 quality = [40]*len(est)
307
308 if run['mode'] == 'using_quality_scores':
309 quality = Qualities[exampleIdx]
310
311 if not run['enable_quality_scores']:
312 quality = [40]*len(est)
313
314 exons = Exons[exampleIdx]
315 don_supp = Donors[exampleIdx]
316 acc_supp = Acceptors[exampleIdx]
317
318 if not run['enable_splice_signals']:
319 for idx,elem in enumerate(don_supp):
320 if elem != -inf:
321 don_supp[idx] = 0.0
322
323 for idx,elem in enumerate(acc_supp):
324 if elem != -inf:
325 acc_supp[idx] = 0.0
326
327 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
328 if run['mode'] == 'using_quality_scores':
329 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
330 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
331 quality, qualityPlifs,run)
332 else:
333 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
334
335 dna_calc = dna_calc.replace('-','')
336
337 # Calculate the weights
338 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
339 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
340
341 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
342 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
343 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
344 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
345
346 if run['mode'] == 'using_quality_scores':
347 totalQualityPenalties = param[-totalQualSP:]
348 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
349
350 # Calculate w'phi(x,y) the total score of the alignment
351 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
352
353 # The allWeights vector is supposed to store the weight parameter
354 # of the true alignment as well as the weight parameters of the
355 # num_path[exampleIdx] other alignments
356 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
357 allWeights[:,0] = trueWeight[:,0]
358
359 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
360 AlignmentScores[0] = trueAlignmentScore
361
362 ################## Calculate wrong alignment(s) ######################
363 # Compute donor, acceptor with penalty_lookup_new
364 # returns two double lists
365 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
366
367 #myalign wants the acceptor site on the g of the ag
368 acceptor = acceptor[1:]
369 acceptor.append(-inf)
370
371 # check that splice site scores are at dna positions as expected by
372 # the dynamic programming component
373
374 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
375 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
376 # or dna[d_pos+1] == 't'), pdb.set_trace()
377 #
378 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
379 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
380
381 ps = h.convert2SWIG()
382
383 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
384 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
385 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
386
387 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
388
389 # old code removed
390
391 newSpliceAlign = _newSpliceAlign
392 newEstAlign = _newEstAlign
393 newWeightMatch = _newWeightMatch
394 newDPScores = _newDPScores
395 newQualityPlifsFeatures = _newQualityPlifsFeatures
396
397 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
398 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
399
400 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
401
402 # Calculate weights of the respective alignments. Note that we are
403 # calculating n-best alignments without hamming loss, so we
404 # have to keep track which of the n-best alignments correspond to
405 # the true one in order not to incorporate a true alignment in the
406 # constraints. To keep track of the true and false alignments we
407 # define an array true_map with a boolean indicating the
408 # equivalence to the true alignment for each decoded alignment.
409 true_map = [0]*(num_path[exampleIdx]+1)
410 true_map[0] = 1
411
412 for pathNr in range(num_path[exampleIdx]):
413 #print 'decodedWeights'
414 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
415 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
416 acc_supp)
417
418 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
419 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
420 # Gewichte in restliche Zeilen der Matrix speichern
421 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
422
423 hpen = mat(h.penalties).reshape(len(h.penalties),1)
424 dpen = mat(d.penalties).reshape(len(d.penalties),1)
425 apen = mat(a.penalties).reshape(len(a.penalties),1)
426 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
427
428 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
429
430 distinct_scores = False
431 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
432 distinct_scores = True
433
434 # Check wether scalar product + loss equals viterbi score
435 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
436 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
437 pdb.set_trace()
438
439 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
440 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
441
442 # if the pathNr-best alignment is very close to the true alignment consider it as true
443 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
444 true_map[pathNr+1] = 1
445
446 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
447 print "suboptimal_example %d\n" %exampleIdx
448 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
449 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
450
451 #pdb.set_trace()
452 suboptimal_example += 1
453 self.plog("suboptimal_example %d\n" %exampleIdx)
454
455 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
456 # this means that all n-best paths are to close to each other
457 # we have to extend the n-best search to a (n+1)-best
458 if len([elem for elem in true_map if elem == 1]) == len(true_map):
459 num_path[exampleIdx] = num_path[exampleIdx]+1
460
461 # Choose true and first false alignment for extending
462 firstFalseIdx = -1
463 for map_idx,elem in enumerate(true_map):
464 if elem == 0:
465 firstFalseIdx = map_idx
466 break
467
468 if False:
469 self.plog("Is considered as: %d\n" % true_map[1])
470
471 result_len = currentAlignment.getResultLength()
472 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
473 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
474
475 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
476
477 dna_array = [0.0]*result_len
478 est_array = [0.0]*result_len
479
480 for r_idx in range(result_len):
481 dna_array[r_idx] = c_dna_array[r_idx]
482 est_array[r_idx] = c_est_array[r_idx]
483
484 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
485 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
486 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
487 self.plog(line1+'\n')
488 self.plog(line2+'\n')
489 self.plog(line3+'\n')
490
491 # if there is at least one useful false alignment add the
492 # corresponding constraints to the optimization problem
493 if firstFalseIdx != -1:
494 firstFalseWeights = allWeights[:,firstFalseIdx]
495 differenceVector = trueWeight - firstFalseWeights
496 #pdb.set_trace()
497
498 const_added = solver.addConstraint(differenceVector, exampleIdx)
499 const_added_ctr += 1
500 #
501 # end of one example processing
502 #
503
504 # call solver every nth example //added constraint
505 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
506 objValue,w,self.slacks = solver.solve()
507 solver_call_ctr += 1
508
509 if solver_call_ctr == 5:
510 numConstPerRound = 200
511 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
512
513 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
514 self.noImprovementCtr += 1
515
516 if self.noImprovementCtr == numExamples+1:
517 break
518
519 self.oldObjValue = objValue
520 print "objValue is %f" % objValue
521
522 sum_xis = 0
523 for elem in self.slacks:
524 sum_xis += elem
525
526 print 'sum of slacks is %f'% sum_xis
527 self.plog('sum of slacks is %f\n'% sum_xis)
528
529 for i in range(len(param)):
530 param[i] = w[i]
531
532 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
533 param_idx += 1
534 [h,d,a,mmatrix,qualityPlifs] =\
535 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
536
537 #
538 # end of one iteration through all examples
539 #
540
541 self.plog("suboptimal rounds %d\n" %suboptimal_example)
542
543 if self.noImprovementCtr == numExamples*2:
544 break
545
546 iteration_nr += 1
547
548 #
549 # end of optimization
550 #
551 print 'Training completed'
552
553 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
554 self.logfh.close()
555
556 ###############################################################################
557 #
558 # End of the code needed for training
559 #
560 #
561 # Begin of code for prediction
562 #
563 ###############################################################################
564
565 def evaluate(self,param_filename):
566 run = self.run
567 beg = run['prediction_begin']
568 end = run['prediction_end']
569
570 data_filename = self.run['dataset_filename']
571 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
572 UpCut, AlternativeSequences=\
573 paths_load_data(data_filename,'training',None,self.ARGS)
574
575 self.Sequences = Sequences
576 self.Exons = Exons
577 self.Ests = Ests
578 self.OriginalEsts= OriginalEsts
579 self.Qualities = Qualities
580 self.Donors = Donors
581 self.Acceptors = Acceptors
582 self.UpCut = UpCut
583
584 self.AlternativeSequences = AlternativeSequences
585
586 #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
587 #print 'leaving constructor...'
588
589 self.logfh = open('_qpalma_predict.log','w+')
590
591 # predict on training set
592 print '##### Prediction on the training set #####'
593 self.predict(param_filename,0,beg,'TRAIN')
594
595 # predict on test set
596 print '##### Prediction on the test set #####'
597 self.predict(param_filename,beg,end,'TEST')
598
599 self.logfh.close()
600 #pdb.set_trace()
601
602 def predict(self,param_filename,beg,end,set_flag):
603 """
604 Performing a prediction takes...
605
606 """
607
608 run = self.run
609
610 if self.run['mode'] == 'normal':
611 self.use_quality_scores = False
612
613 elif self.run['mode'] == 'using_quality_scores':
614 self.use_quality_scores = True
615 else:
616 assert(False)
617
618 Sequences = self.Sequences[beg:end]
619 Exons = self.Exons[beg:end]
620 Ests = self.Ests[beg:end]
621 OriginalEsts = self.OriginalEsts[beg:end]
622 Qualities = self.Qualities[beg:end]
623 Acceptors = self.Acceptors[beg:end]
624 Donors = self.Donors[beg:end]
625 UpCut = self.UpCut[beg:end]
626 #SplitPos = self.SplitPositions[beg:end]
627
628 AlternativeSequences = self.AlternativeSequences[beg:end]
629
630 print 'STARTING PREDICTION'
631
632 # number of training instances
633 N = numExamples = len(Sequences)
634 assert len(Exons) == N and len(Ests) == N\
635 and len(Qualities) == N and len(Acceptors) == N\
636 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
637 self.plog('Number of training examples: %d\n'% numExamples)
638
639 self.noImprovementCtr = 0
640 self.oldObjValue = 1e8
641
642 remove_duplicate_scores = self.run['remove_duplicate_scores']
643 print_matrix = self.run['print_matrix']
644 anzpath = self.run['anzpath']
645
646 param = cPickle.load(open(param_filename))
647
648 # Set the parameters such as limits penalties for the Plifs
649 [h,d,a,mmatrix,qualityPlifs] =\
650 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
651
652 #############################################################################################
653 # Prediction
654 #############################################################################################
655 self.plog('Starting prediction...\n')
656
657 donSP = self.run['numDonSuppPoints']
658 accSP = self.run['numAccSuppPoints']
659 lengthSP = self.run['numLengthSuppPoints']
660 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
661 numq = self.run['numQualSuppPoints']
662 totalQualSP = self.run['totalQualSuppPoints']
663
664 totalQualityPenalties = zeros((totalQualSP,1))
665
666 # where we store the predictions
667 allPredictions = []
668
669 # beginning of the prediction loop
670 for exampleIdx in range(numExamples):
671 dna = Sequences[exampleIdx]
672 est = Ests[exampleIdx]
673
674 new_est = ''
675 e = 0
676 while True:
677 if not e < len(est):
678 break
679
680 if est[e] == '[':
681 new_est += est[e+2]
682 e += 4
683 else:
684 new_est += est[e]
685 e += 1
686
687 est = new_est
688 est = "".join(est)
689 est = est.lower()
690
691 exons = Exons[exampleIdx]
692
693 current_up_cut = UpCut[exampleIdx]
694
695 currentAlternatives = AlternativeSequences[exampleIdx]
696
697 #est = est.replace('-','')
698 #original_est = OriginalEsts[exampleIdx]
699 #original_est = "".join(original_est)
700 #original_est = original_est.lower()
701 #currentSplitPos = SplitPos[exampleIdx]
702
703 if self.run['mode'] == 'normal':
704 quality = [40]*len(est)
705
706 if self.run['mode'] == 'using_quality_scores':
707 quality = Qualities[exampleIdx]
708
709 if not run['enable_quality_scores']:
710 quality = [40]*len(est)
711
712 don_supp = Donors[exampleIdx]
713 acc_supp = Acceptors[exampleIdx]
714
715 if not run['enable_splice_signals']:
716
717 for idx,elem in enumerate(don_supp):
718 if elem != -inf:
719 don_supp[idx] = 0.0
720
721 for idx,elem in enumerate(acc_supp):
722 if elem != -inf:
723 acc_supp[idx] = 0.0
724
725 current_example_predictions = []
726
727 # first make a prediction on the dna fragment which comes from the ground truth
728 current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
729 current_prediction['exampleIdx'] = exampleIdx
730
731 current_example_predictions.append(current_prediction)
732
733 # then make predictions for all dna fragments that where occurring in
734 # the vmatch results
735 for alternative_alignment in currentAlternatives:
736 chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
737 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
738 current_exons = exons - current_up_cut
739
740 current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
741 current_prediction['exampleIdx'] = exampleIdx
742
743 current_example_predictions.append(current_prediction)
744
745 allPredictions.append(current_example_predictions)
746
747 # end of the prediction loop we save all predictions in a pickle file and exit
748 cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
749 print 'Prediction completed'
750
751
752 def calc_alignment(self, dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
753 """
754 Given two sequences and the parameters we calculate on alignment
755 """
756
757 run = self.run
758
759 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
760
761 #myalign wants the acceptor site on the g of the ag
762 acceptor = acceptor[1:]
763 acceptor.append(-inf)
764
765 dna = str(dna)
766 est = str(est)
767 dna_len = len(dna)
768 est_len = len(est)
769
770 ps = h.convert2SWIG()
771
772 __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores,\
773 __newQualityPlifsFeatures, __dna_array, __est_array =\
774 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
775
776 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
777
778 # old code removed
779
780 newSpliceAlign = __newSpliceAlign
781 newEstAlign = __newEstAlign
782 newWeightMatch = __newWeightMatch
783 newDPScores = __newDPScores
784 newQualityPlifsFeatures = __newQualityPlifsFeatures
785 dna_array = __dna_array
786 est_array = __est_array
787
788 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
789 newWeightMatch = newWeightMatch.reshape(1,mm_len)
790 true_map = [0]*2
791 true_map[0] = 1
792 pathNr = 0
793
794 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
795 _newEstAlign = newEstAlign.flatten().tolist()[0]
796
797 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
798 self.plog(line1+'\n')
799 self.plog(line2+'\n')
800 self.plog(line3+'\n')
801
802 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
803
804 current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,\
805 'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
806 'predExons':newExons, 'trueExons':exons,\
807 'dna':dna, 'est':est, 'DPScores':newDPScores }
808
809 return current_prediction
810
811
812 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign):
813 newExons = []
814 oldElem = -1
815 SpliceAlign = SpliceAlign.flatten().tolist()[0]
816 SpliceAlign.append(-1)
817 for pos,elem in enumerate(SpliceAlign):
818 if pos == 0:
819 oldElem = -1
820 else:
821 oldElem = SpliceAlign[pos-1]
822
823 if oldElem != 0 and elem == 0: # start of exon
824 newExons.append(pos)
825
826 if oldElem == 0 and elem != 0: # end of exon
827 newExons.append(pos)
828
829 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
830
831 #self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
832 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
833 self.plog(("read: " + str(est)+ "\n"))
834
835 if len(newExons) == 4:
836 e1_begin,e1_end = newExons[0],newExons[1]
837 e2_begin,e2_end = newExons[2],newExons[3]
838 else:
839 return None,None,None,None,newExons
840
841 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
842 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
843
844 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
845 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
846
847 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
848
849
850 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
851 """
852 This function expects an interval, chromosome and strand information and
853 returns then the genomic sequence of this interval and the associated scores.
854 """
855
856 chrom = 'chr%d' % chr
857 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
858 genomicSeq = genomicSeq.lower()
859
860 # check the obtained dna sequence
861 assert genomicSeq != '', 'load_genomic returned empty sequence!'
862 #for elem in genomicSeq:
863 # if not elem in alphabet:
864
865 no_base = re.compile('[^acgt]')
866 genomicSeq = no_base.sub('n',genomicSeq)
867
868 intervalBegin = genomicSeq_start-100
869 intervalEnd = genomicSeq_stop+100
870 currentDNASeq = genomicSeq
871 seq_pos_offset = genomicSeq_start
872
873 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
874
875 return currentDNASeq, currentAcc, currentDon
876
877
878 ###########################
879 # A simple command line
880 # interface
881 ###########################
882
883 if __name__ == '__main__':
884 mode = sys.argv[1]
885 run_obj_fn = sys.argv[2]
886
887 run_obj = cPickle.load(open(run_obj_fn))
888
889 qpalma = QPalma(run_obj)
890
891
892 if len(sys.argv) == 3 and mode == 'train':
893 qpalma.train()
894
895 elif len(sys.argv) == 4 and mode == 'predict':
896 param_filename = sys.argv[3]
897 assert os.path.exists(param_filename)
898 qpalma.evaluate(param_filename)
899 else:
900 print 'You have to choose between training or prediction mode:'
901 print 'python qpalma. py (train|predict) <param_file>'