+ extended scripts
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import cPickle
19 import subprocess
20 import scipy.io
21 import pdb
22 import re
23 import os.path
24 import pydb
25
26 from compile_dataset import getSpliceScores
27
28 import numpy
29 from numpy.matlib import mat,zeros,ones,inf
30 from numpy.linalg import norm
31
32 import QPalmaDP
33 import qpalma
34 from qpalma.SIQP_CPX import SIQPSolver
35 from qpalma.DataProc import *
36 from qpalma.computeSpliceWeights import *
37 from qpalma.set_param_palma import *
38 from qpalma.computeSpliceAlignWithQuality import *
39 from qpalma.penalty_lookup_new import *
40 from qpalma.compute_donacc import *
41 from qpalma.TrainingParam import Param
42 from qpalma.Plif import Plf
43
44 from qpalma.tools.splicesites import getDonAccScores
45 from qpalma.Configuration import *
46
47 # this two imports are needed for the load genomic resp. interval query
48 # functions
49 from Genefinding import *
50 from genome_utils import load_genomic
51
52 from Utils import calc_stat, calc_info, pprint_alignment
53
54 class QPalma:
55 """
56 A training method for the QPalma project
57 """
58
59 def __init__(self,run):
60 self.ARGS = Param()
61 self.run = run
62
63 if self.run['mode'] == 'normal':
64 self.use_quality_scores = False
65
66 elif self.run['mode'] == 'using_quality_scores':
67 self.use_quality_scores = True
68 else:
69 assert(False)
70
71 full_working_path = os.path.join(run['experiment_path'],run['name'])
72 #assert not os.path.exists(full_working_path)
73 #os.mkdir(full_working_path)
74 assert os.path.exists(full_working_path)
75
76 # ATTENTION: Changing working directory
77 os.chdir(full_working_path)
78
79 cPickle.dump(run,open('run_object.pickle','w+'))
80
81 def plog(self,string):
82 self.logfh.write(string)
83 self.logfh.flush()
84
85
86 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
87 """
88 Given the needed input this method calls the QPalma C module which
89 calculates a dynamic programming in order to obtain an alignment
90 """
91 run = self.run
92
93 dna_len = len(dna)
94 est_len = len(est)
95
96 prb = QPalmaDP.createDoubleArrayFromList(quality)
97 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
98
99 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
100 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
101
102 d_len = len(donor)
103 donor = QPalmaDP.createDoubleArrayFromList(donor)
104 a_len = len(acceptor)
105 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
106
107 # Create the alignment object representing the interface to the C/C++ code.
108 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
109 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
110 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
111 currentAlignment.myalign( current_num_path, dna, dna_len,\
112 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
113 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
114 print_matrix)
115
116 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
117 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
118 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
119 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
120
121 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
122
123 if prediction_mode:
124 # part that is only needed for prediction
125 result_len = currentAlignment.getResultLength()
126 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
127 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
128
129 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
130
131 dna_array = [0.0]*result_len
132 est_array = [0.0]*result_len
133
134 for r_idx in range(result_len):
135 dna_array[r_idx] = c_dna_array[r_idx]
136 est_array[r_idx] = c_est_array[r_idx]
137
138 else:
139 dna_array = None
140 est_array = None
141
142 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
143 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
144
145 #print 'After calling getAlignmentResults...'
146
147 newSpliceAlign = zeros((current_num_path*dna_len,1))
148 newEstAlign = zeros((est_len*current_num_path,1))
149 newWeightMatch = zeros((current_num_path*mm_len,1))
150 newDPScores = zeros((current_num_path,1))
151 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
152
153 for i in range(dna_len*current_num_path):
154 newSpliceAlign[i] = c_SpliceAlign[i]
155
156 for i in range(est_len*current_num_path):
157 newEstAlign[i] = c_EstAlign[i]
158
159 for i in range(mm_len*current_num_path):
160 newWeightMatch[i] = c_WeightMatch[i]
161
162 for i in range(current_num_path):
163 newDPScores[i] = c_DPScores[i]
164
165 if self.use_quality_scores:
166 for i in range(run['totalQualSuppPoints']*current_num_path):
167 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
168
169 del c_SpliceAlign
170 del c_EstAlign
171 del c_WeightMatch
172 del c_DPScores
173 del c_qualityPlifsFeatures
174 del currentAlignment
175
176 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
177 newQualityPlifsFeatures, dna_array, est_array
178
179
180 def train(self):
181 run = self.run
182
183 self.logfh = open('_qpalma_train.log','w+')
184
185 self.plog("Settings are:\n")
186 self.plog("%s\n"%str(run))
187
188 data_filename = self.run['dataset_filename']
189 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
190 UpCut, StartPos, AlternativeSequences =\
191 paths_load_data(data_filename,'training',None,self.ARGS)
192
193 # Load the whole dataset
194 if self.run['mode'] == 'normal':
195 self.use_quality_scores = False
196
197 elif self.run['mode'] == 'using_quality_scores':
198 self.use_quality_scores = True
199 else:
200 assert(False)
201
202 self.Sequences = Sequences
203 self.Exons = Exons
204 self.Ests = Ests
205 self.OriginalEsts= OriginalEsts
206 self.Qualities = Qualities
207 self.Donors = Donors
208 self.Acceptors = Acceptors
209
210 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
211
212 beg = run['training_begin']
213 end = run['training_end']
214
215 Sequences = Sequences[beg:end]
216 Exons = Exons[beg:end]
217 Ests = Ests[beg:end]
218 OriginalEsts= OriginalEsts[beg:end]
219 Qualities = Qualities[beg:end]
220 Acceptors = Acceptors[beg:end]
221 Donors = Donors[beg:end]
222
223 # number of training instances
224 N = numExamples = len(Sequences)
225 assert len(Exons) == N and len(Ests) == N\
226 and len(Qualities) == N and len(Acceptors) == N\
227 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
228 self.plog('Number of training examples: %d\n'% numExamples)
229
230 self.noImprovementCtr = 0
231 self.oldObjValue = 1e8
232
233 iteration_steps = run['iter_steps']
234 remove_duplicate_scores = run['remove_duplicate_scores']
235 print_matrix = run['print_matrix']
236 anzpath = run['anzpath']
237
238 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
239 param = Conf.fixedParam[:run['numFeatures']]
240
241 lengthSP = run['numLengthSuppPoints']
242 donSP = run['numDonSuppPoints']
243 accSP = run['numAccSuppPoints']
244 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
245 numq = run['numQualSuppPoints']
246 totalQualSP = run['totalQualSuppPoints']
247
248 # no intron length model
249 if not run['enable_intron_length']:
250 param[:lengthSP] *= 0.0
251
252 # Set the parameters such as limits penalties for the Plifs
253 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
254
255 # Initialize solver
256 self.plog('Initializing problem...\n')
257 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
258
259 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
260 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
261
262 # stores the number of alignments done for each example (best path, second-best path etc.)
263 num_path = [anzpath]*numExamples
264 # stores the gap for each example
265 gap = [0.0]*numExamples
266 #############################################################################################
267 # Training
268 #############################################################################################
269 self.plog('Starting training...\n')
270
271 currentPhi = zeros((run['numFeatures'],1))
272 totalQualityPenalties = zeros((totalQualSP,1))
273
274 numConstPerRound = run['numConstraintsPerRound']
275 solver_call_ctr = 0
276
277 suboptimal_example = 0
278 iteration_nr = 0
279 param_idx = 0
280 const_added_ctr = 0
281
282 # the main training loop
283 while True:
284 if iteration_nr == iteration_steps:
285 break
286
287 for exampleIdx in range(numExamples):
288 if (exampleIdx%100) == 0:
289 print 'Current example nr %d' % exampleIdx
290
291 dna = Sequences[exampleIdx]
292 est = Ests[exampleIdx]
293 est = "".join(est)
294 est = est.lower()
295 est = est.replace('-','')
296 original_est = OriginalEsts[exampleIdx]
297 original_est = "".join(original_est)
298 original_est = original_est.lower()
299
300 dna_len = len(dna)
301 est_len = len(est)
302
303 assert len(est) == run['read_size'], pdb.set_trace()
304
305 if run['mode'] == 'normal':
306 quality = [40]*len(est)
307
308 if run['mode'] == 'using_quality_scores':
309 quality = Qualities[exampleIdx]
310
311 if not run['enable_quality_scores']:
312 quality = [40]*len(est)
313
314 exons = Exons[exampleIdx]
315 don_supp = Donors[exampleIdx]
316 acc_supp = Acceptors[exampleIdx]
317
318 if not run['enable_splice_signals']:
319 for idx,elem in enumerate(don_supp):
320 if elem != -inf:
321 don_supp[idx] = 0.0
322
323 for idx,elem in enumerate(acc_supp):
324 if elem != -inf:
325 acc_supp[idx] = 0.0
326
327 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
328 if run['mode'] == 'using_quality_scores':
329 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
330 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
331 quality, qualityPlifs,run)
332 else:
333 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
334
335 dna_calc = dna_calc.replace('-','')
336
337 # Calculate the weights
338 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
339 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
340
341 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
342 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
343 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
344 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
345
346 if run['mode'] == 'using_quality_scores':
347 totalQualityPenalties = param[-totalQualSP:]
348 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
349
350 # Calculate w'phi(x,y) the total score of the alignment
351 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
352
353 # The allWeights vector is supposed to store the weight parameter
354 # of the true alignment as well as the weight parameters of the
355 # num_path[exampleIdx] other alignments
356 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
357 allWeights[:,0] = trueWeight[:,0]
358
359 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
360 AlignmentScores[0] = trueAlignmentScore
361
362 ################## Calculate wrong alignment(s) ######################
363 # Compute donor, acceptor with penalty_lookup_new
364 # returns two double lists
365 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
366
367 #myalign wants the acceptor site on the g of the ag
368 acceptor = acceptor[1:]
369 acceptor.append(-inf)
370
371 # check that splice site scores are at dna positions as expected by
372 # the dynamic programming component
373
374 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
375 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
376 # or dna[d_pos+1] == 't'), pdb.set_trace()
377 #
378 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
379 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
380
381 ps = h.convert2SWIG()
382
383 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
384 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
385 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
386
387 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
388
389 # old code removed
390
391 newSpliceAlign = _newSpliceAlign
392 newEstAlign = _newEstAlign
393 newWeightMatch = _newWeightMatch
394 newDPScores = _newDPScores
395 newQualityPlifsFeatures = _newQualityPlifsFeatures
396
397 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
398 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
399
400 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
401
402 # Calculate weights of the respective alignments. Note that we are
403 # calculating n-best alignments without hamming loss, so we
404 # have to keep track which of the n-best alignments correspond to
405 # the true one in order not to incorporate a true alignment in the
406 # constraints. To keep track of the true and false alignments we
407 # define an array true_map with a boolean indicating the
408 # equivalence to the true alignment for each decoded alignment.
409 true_map = [0]*(num_path[exampleIdx]+1)
410 true_map[0] = 1
411
412 for pathNr in range(num_path[exampleIdx]):
413 #print 'decodedWeights'
414 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
415 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
416 acc_supp)
417
418 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
419 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
420 # Gewichte in restliche Zeilen der Matrix speichern
421 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
422
423 hpen = mat(h.penalties).reshape(len(h.penalties),1)
424 dpen = mat(d.penalties).reshape(len(d.penalties),1)
425 apen = mat(a.penalties).reshape(len(a.penalties),1)
426 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
427
428 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
429
430 distinct_scores = False
431 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
432 distinct_scores = True
433
434 # Check wether scalar product + loss equals viterbi score
435 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
436 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
437 pdb.set_trace()
438
439 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
440 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
441
442 # if the pathNr-best alignment is very close to the true alignment consider it as true
443 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
444 true_map[pathNr+1] = 1
445
446 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
447 print "suboptimal_example %d\n" %exampleIdx
448 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
449 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
450
451 #pdb.set_trace()
452 suboptimal_example += 1
453 self.plog("suboptimal_example %d\n" %exampleIdx)
454
455 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
456 # this means that all n-best paths are to close to each other
457 # we have to extend the n-best search to a (n+1)-best
458 if len([elem for elem in true_map if elem == 1]) == len(true_map):
459 num_path[exampleIdx] = num_path[exampleIdx]+1
460
461 # Choose true and first false alignment for extending
462 firstFalseIdx = -1
463 for map_idx,elem in enumerate(true_map):
464 if elem == 0:
465 firstFalseIdx = map_idx
466 break
467
468 if False:
469 self.plog("Is considered as: %d\n" % true_map[1])
470
471 result_len = currentAlignment.getResultLength()
472 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
473 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
474
475 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
476
477 dna_array = [0.0]*result_len
478 est_array = [0.0]*result_len
479
480 for r_idx in range(result_len):
481 dna_array[r_idx] = c_dna_array[r_idx]
482 est_array[r_idx] = c_est_array[r_idx]
483
484 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
485 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
486 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
487 self.plog(line1+'\n')
488 self.plog(line2+'\n')
489 self.plog(line3+'\n')
490
491 # if there is at least one useful false alignment add the
492 # corresponding constraints to the optimization problem
493 if firstFalseIdx != -1:
494 firstFalseWeights = allWeights[:,firstFalseIdx]
495 differenceVector = trueWeight - firstFalseWeights
496 #pdb.set_trace()
497
498 const_added = solver.addConstraint(differenceVector, exampleIdx)
499 const_added_ctr += 1
500 #
501 # end of one example processing
502 #
503
504 # call solver every nth example //added constraint
505 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
506 objValue,w,self.slacks = solver.solve()
507 solver_call_ctr += 1
508
509 if solver_call_ctr == 5:
510 numConstPerRound = 200
511 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
512
513 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
514 self.noImprovementCtr += 1
515
516 if self.noImprovementCtr == numExamples+1:
517 break
518
519 self.oldObjValue = objValue
520 print "objValue is %f" % objValue
521
522 sum_xis = 0
523 for elem in self.slacks:
524 sum_xis += elem
525
526 print 'sum of slacks is %f'% sum_xis
527 self.plog('sum of slacks is %f\n'% sum_xis)
528
529 for i in range(len(param)):
530 param[i] = w[i]
531
532 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
533 param_idx += 1
534 [h,d,a,mmatrix,qualityPlifs] =\
535 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
536
537 #
538 # end of one iteration through all examples
539 #
540
541 self.plog("suboptimal rounds %d\n" %suboptimal_example)
542
543 if self.noImprovementCtr == numExamples*2:
544 break
545
546 iteration_nr += 1
547
548 #
549 # end of optimization
550 #
551 print 'Training completed'
552
553 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
554 self.logfh.close()
555
556 ###############################################################################
557 #
558 # End of the code needed for training
559 #
560 #
561 # Begin of code for prediction
562 #
563 ###############################################################################
564
565 def evaluate(self,param_filename):
566 run = self.run
567 beg = run['prediction_begin']
568 end = run['prediction_end']
569
570 data_filename = self.run['dataset_filename']
571 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
572 UpCut, StartPos, AlternativeSequences=\
573 paths_load_data(data_filename,'training',None,self.ARGS)
574
575 self.Sequences = Sequences
576 self.Exons = Exons
577 self.Ests = Ests
578 self.OriginalEsts= OriginalEsts
579 self.Qualities = Qualities
580 self.Donors = Donors
581 self.Acceptors = Acceptors
582 self.UpCut = UpCut
583 self.StartPos = StartPos
584
585 self.AlternativeSequences = AlternativeSequences
586
587 #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
588 #print 'leaving constructor...'
589
590 self.logfh = open('_qpalma_predict.log','w+')
591
592 # predict on training set
593 print '##### Prediction on the training set #####'
594 self.predict(param_filename,0,beg,'TRAIN')
595
596 # predict on test set
597 print '##### Prediction on the test set #####'
598 self.predict(param_filename,beg,end,'TEST')
599
600 self.logfh.close()
601 #pdb.set_trace()
602
603 def predict(self,param_filename,beg,end,set_flag):
604 """
605 Performing a prediction takes...
606
607 """
608
609 run = self.run
610
611 if self.run['mode'] == 'normal':
612 self.use_quality_scores = False
613
614 elif self.run['mode'] == 'using_quality_scores':
615 self.use_quality_scores = True
616 else:
617 assert(False)
618
619 Sequences = self.Sequences[beg:end]
620 Exons = self.Exons[beg:end]
621 Ests = self.Ests[beg:end]
622 OriginalEsts = self.OriginalEsts[beg:end]
623 Qualities = self.Qualities[beg:end]
624 Acceptors = self.Acceptors[beg:end]
625 Donors = self.Donors[beg:end]
626 UpCut = self.UpCut[beg:end]
627 StartPos = self.StartPos[beg:end]
628 #SplitPos = self.SplitPositions[beg:end]
629
630 AlternativeSequences = self.AlternativeSequences[beg:end]
631
632 print 'STARTING PREDICTION'
633
634 # number of training instances
635 N = numExamples = len(Sequences)
636 assert len(Exons) == N and len(Ests) == N\
637 and len(Qualities) == N and len(Acceptors) == N\
638 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
639 self.plog('Number of training examples: %d\n'% numExamples)
640
641 self.noImprovementCtr = 0
642 self.oldObjValue = 1e8
643
644 remove_duplicate_scores = self.run['remove_duplicate_scores']
645 print_matrix = self.run['print_matrix']
646 anzpath = self.run['anzpath']
647
648 param = cPickle.load(open(param_filename))
649
650 # Set the parameters such as limits penalties for the Plifs
651 [h,d,a,mmatrix,qualityPlifs] =\
652 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
653
654 #############################################################################################
655 # Prediction
656 #############################################################################################
657 self.plog('Starting prediction...\n')
658
659 donSP = self.run['numDonSuppPoints']
660 accSP = self.run['numAccSuppPoints']
661 lengthSP = self.run['numLengthSuppPoints']
662 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
663 numq = self.run['numQualSuppPoints']
664 totalQualSP = self.run['totalQualSuppPoints']
665
666 totalQualityPenalties = zeros((totalQualSP,1))
667
668 # where we store the predictions
669 allPredictions = []
670
671 # beginning of the prediction loop
672 for exampleIdx in range(numExamples):
673 dna = Sequences[exampleIdx]
674 est = Ests[exampleIdx]
675
676 new_est = ''
677 e = 0
678 while True:
679 if not e < len(est):
680 break
681
682 if est[e] == '[':
683 new_est += est[e+2]
684 e += 4
685 else:
686 new_est += est[e]
687 e += 1
688
689 est = new_est
690 est = "".join(est)
691 est = est.lower()
692
693 exons = Exons[exampleIdx]
694
695 current_up_cut = UpCut[exampleIdx]
696
697 current_start_pos = StartPos[exampleIdx]
698
699 currentAlternatives = AlternativeSequences[exampleIdx]
700
701 #est = est.replace('-','')
702 #original_est = OriginalEsts[exampleIdx]
703 #original_est = "".join(original_est)
704 #original_est = original_est.lower()
705 #currentSplitPos = SplitPos[exampleIdx]
706
707 if self.run['mode'] == 'normal':
708 quality = [40]*len(est)
709
710 if self.run['mode'] == 'using_quality_scores':
711 quality = Qualities[exampleIdx]
712
713 if not run['enable_quality_scores']:
714 quality = [40]*len(est)
715
716 don_supp = Donors[exampleIdx]
717 acc_supp = Acceptors[exampleIdx]
718
719 if not run['enable_splice_signals']:
720
721 for idx,elem in enumerate(don_supp):
722 if elem != -inf:
723 don_supp[idx] = 0.0
724
725 for idx,elem in enumerate(acc_supp):
726 if elem != -inf:
727 acc_supp[idx] = 0.0
728
729 current_example_predictions = []
730
731 # first make a prediction on the dna fragment which comes from the ground truth
732 current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
733 current_prediction['exampleIdx'] = exampleIdx
734 current_prediction['start_pos'] = current_start_pos
735
736 current_example_predictions.append(current_prediction)
737
738 # then make predictions for all dna fragments that where occurring in
739 # the vmatch results
740 for alternative_alignment in currentAlternatives:
741 chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
742 currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
743
744 current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
745 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
746 current_prediction['exampleIdx'] = exampleIdx
747 current_prediction['start_pos'] = current_start_pos
748 current_prediction['alternative_start_pos'] = genomicSeq_start
749 current_prediction['label'] = currentLabel
750
751 current_example_predictions.append(current_prediction)
752
753 allPredictions.append(current_example_predictions)
754
755 # end of the prediction loop we save all predictions in a pickle file and exit
756 cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
757 print 'Prediction completed'
758
759
760 def calc_alignment(self, dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
761 """
762 Given two sequences and the parameters we calculate on alignment
763 """
764
765 run = self.run
766
767 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
768
769 #myalign wants the acceptor site on the g of the ag
770 acceptor = acceptor[1:]
771 acceptor.append(-inf)
772
773 dna = str(dna)
774 est = str(est)
775 dna_len = len(dna)
776 est_len = len(est)
777
778 ps = h.convert2SWIG()
779
780 __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores,\
781 __newQualityPlifsFeatures, __dna_array, __est_array =\
782 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
783
784 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
785
786 # old code removed
787
788 newSpliceAlign = __newSpliceAlign
789 newEstAlign = __newEstAlign
790 newWeightMatch = __newWeightMatch
791 newDPScores = __newDPScores
792 newQualityPlifsFeatures = __newQualityPlifsFeatures
793 dna_array = __dna_array
794 est_array = __est_array
795
796 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
797 newWeightMatch = newWeightMatch.reshape(1,mm_len)
798 true_map = [0]*2
799 true_map[0] = 1
800 pathNr = 0
801
802 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
803 _newEstAlign = newEstAlign.flatten().tolist()[0]
804
805 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
806 self.plog(line1+'\n')
807 self.plog(line2+'\n')
808 self.plog(line3+'\n')
809
810 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
811
812 current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,\
813 'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
814 'predExons':newExons, 'trueExons':exons,\
815 'dna':dna, 'est':est, 'DPScores':newDPScores }
816
817 return current_prediction
818
819
820 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign):
821 newExons = []
822 oldElem = -1
823 SpliceAlign = SpliceAlign.flatten().tolist()[0]
824 SpliceAlign.append(-1)
825 for pos,elem in enumerate(SpliceAlign):
826 if pos == 0:
827 oldElem = -1
828 else:
829 oldElem = SpliceAlign[pos-1]
830
831 if oldElem != 0 and elem == 0: # start of exon
832 newExons.append(pos)
833
834 if oldElem == 0 and elem != 0: # end of exon
835 newExons.append(pos)
836
837 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
838
839 #self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
840 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
841 self.plog(("read: " + str(est)+ "\n"))
842
843 if len(newExons) == 4:
844 e1_begin,e1_end = newExons[0],newExons[1]
845 e2_begin,e2_end = newExons[2],newExons[3]
846 else:
847 return None,None,None,None,newExons
848
849 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
850 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
851
852 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
853 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
854
855 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
856
857
858 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
859 """
860 This function expects an interval, chromosome and strand information and
861 returns then the genomic sequence of this interval and the associated scores.
862 """
863
864 chrom = 'chr%d' % chr
865 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
866 genomicSeq = genomicSeq.lower()
867
868 # check the obtained dna sequence
869 assert genomicSeq != '', 'load_genomic returned empty sequence!'
870 #for elem in genomicSeq:
871 # if not elem in alphabet:
872
873 no_base = re.compile('[^acgt]')
874 genomicSeq = no_base.sub('n',genomicSeq)
875
876 intervalBegin = genomicSeq_start-100
877 intervalEnd = genomicSeq_stop+100
878 currentDNASeq = genomicSeq
879 seq_pos_offset = genomicSeq_start
880
881 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
882
883 return currentDNASeq, currentAcc, currentDon
884
885
886 ###########################
887 # A simple command line
888 # interface
889 ###########################
890
891 if __name__ == '__main__':
892 mode = sys.argv[1]
893 run_obj_fn = sys.argv[2]
894
895 run_obj = cPickle.load(open(run_obj_fn))
896
897 qpalma = QPalma(run_obj)
898
899
900 if len(sys.argv) == 3 and mode == 'train':
901 qpalma.train()
902
903 elif len(sys.argv) == 4 and mode == 'predict':
904 param_filename = sys.argv[3]
905 assert os.path.exists(param_filename)
906 qpalma.evaluate(param_filename)
907 else:
908 print 'You have to choose between training or prediction mode:'
909 print 'python qpalma. py (train|predict) <param_file>'