164c1e5a3d08e7ca7d0cbe6466e1523f9f616fc4
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import array
18 import cPickle
19 import os.path
20 import pdb
21 import sys
22
23 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
24
25 import numpy
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy.linalg import norm
28
29 import QPalmaDP
30 import qpalma
31
32 #from qpalma.SIQP_CPX import SIQPSolver
33 #from qpalma.SIQP_CVXOPT import SIQPSolver
34
35 from qpalma.computeSpliceWeights import *
36 from qpalma.set_param_palma import *
37 from qpalma.computeSpliceAlignWithQuality import *
38 from qpalma.TrainingParam import Param
39 from qpalma.Plif import Plf,compute_donacc
40
41 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
42
43 class SpliceSiteException:
44 pass
45
46
47 def getData(training_set,exampleKey,run):
48 currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
49 id,chr,strand,up_cut,down_cut = currentSeqInfo
50
51 est = original_est
52 est = "".join(est)
53 est = est.lower()
54 est = unbracket_est(est)
55 est = est.replace('-','')
56
57 assert len(est) == run['read_size'], pdb.set_trace()
58 est_len = len(est)
59
60 #original_est = OriginalEsts[exampleIdx]
61 original_est = "".join(original_est)
62 original_est = original_est.lower()
63
64 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
65 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
66
67 original_exons = currentExons
68 exons = original_exons - (up_cut-1)
69 exons[0,0] -= 1
70 exons[1,0] -= 1
71
72 if exons.shape == (2,2):
73 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
74
75 donor_elem = dna[exons[0,1]:exons[0,1]+2]
76 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
77
78 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
79 print 'invalid donor in example %d'% exampleKey
80 raise SpliceSiteException
81
82 if not ( acceptor_elem == 'ag' ):
83 print 'invalid acceptor in example %d'% exampleKey
84 raise SpliceSiteException
85
86 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
87
88 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
89
90
91
92 class QPalma:
93 """
94 This class wraps the training and prediction functions for
95 the alignment.
96 """
97
98 def __init__(self,dmode=False):
99 self.ARGS = Param()
100 self.qpalma_debug_mode = dmode
101
102
103 def plog(self,string):
104 self.logfh.write(string)
105 self.logfh.flush()
106
107
108 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
109 """
110 Given the needed input this method calls the QPalma C module which
111 calculates a dynamic programming in order to obtain an alignment
112 """
113 run = self.run
114
115 dna_len = len(dna)
116 est_len = len(est)
117
118 prb = QPalmaDP.createDoubleArrayFromList(quality)
119 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
120
121 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
122 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
123
124 d_len = len(donor)
125 donor = QPalmaDP.createDoubleArrayFromList(donor)
126 a_len = len(acceptor)
127 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
128
129 # Create the alignment object representing the interface to the C/C++ code.
130 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
131 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
132 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
133 currentAlignment.myalign( current_num_path, dna, dna_len,\
134 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
135 acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
136 run['print_matrix'] )
137
138 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
139 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
140 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
141 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
142
143 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
144
145 if prediction_mode:
146 # part that is only needed for prediction
147 result_len = currentAlignment.getResultLength()
148 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
149 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
150
151 currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
152 __dna_array,__est_array = currentAlignment.getAlignmentArrays()
153
154 dna_array = [0.0]*result_len
155 est_array = [0.0]*result_len
156
157 for r_idx in range(result_len):
158 dna_array[r_idx] = c_dna_array[r_idx]
159 est_array[r_idx] = c_est_array[r_idx]
160
161 else:
162 dna_array = None
163 est_array = None
164
165 assert dna_array == __dna_array
166 assert est_array == __est_array
167
168 currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
169 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
170
171 __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
172 currentAlignment.getAlignmentResults()
173
174 newSpliceAlign = zeros((current_num_path*dna_len,1))
175 newEstAlign = zeros((est_len*current_num_path,1))
176 newWeightMatch = zeros((current_num_path*mm_len,1))
177 newDPScores = zeros((current_num_path,1))
178 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
179
180 for i in range(dna_len*current_num_path):
181 newSpliceAlign[i] = c_SpliceAlign[i]
182
183 for i in range(est_len*current_num_path):
184 newEstAlign[i] = c_EstAlign[i]
185
186 for i in range(mm_len*current_num_path):
187 newWeightMatch[i] = c_WeightMatch[i]
188
189 for i in range(current_num_path):
190 newDPScores[i] = c_DPScores[i]
191
192 if self.use_quality_scores:
193 for i in range(run['totalQualSuppPoints']*current_num_path):
194 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
195
196 del c_SpliceAlign
197 del c_EstAlign
198 del c_WeightMatch
199 del c_DPScores
200 del c_qualityPlifsFeatures
201 del currentAlignment
202
203 assert newSpliceAlign == __newSpliceAlign
204 assert newEstAlign == __newEstAlign
205 assert newWeightMatch == __newWeightMatch
206 assert newDPScores == __newDPScores
207 assert newQualityPlifsFeatures == __newQualityPlifsFeatures
208
209 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
210 newQualityPlifsFeatures, dna_array, est_array
211
212
213 def train(self,run,training_set):
214 self.run = run
215
216 full_working_path = os.path.join(run['alignment_dir'],run['name'])
217
218 #assert not os.path.exists(full_working_path)
219 if not os.path.exists(full_working_path):
220 os.mkdir(full_working_path)
221
222 assert os.path.exists(full_working_path)
223
224 # ATTENTION: Changing working directory
225 os.chdir(full_working_path)
226
227 self.logfh = open('_qpalma_train.log','w+')
228 cPickle.dump(run,open('run_obj.pickle','w+'))
229
230 self.plog("Settings are:\n")
231 self.plog("%s\n"%str(run))
232
233 if self.run['mode'] == 'normal':
234 self.use_quality_scores = False
235
236 elif self.run['mode'] == 'using_quality_scores':
237 self.use_quality_scores = True
238 else:
239 assert(False)
240
241 numExamples = len(training_set)
242 self.plog('Number of training examples: %d\n'% numExamples)
243
244 self.noImprovementCtr = 0
245 self.oldObjValue = 1e8
246
247 iteration_steps = run['iter_steps']
248 remove_duplicate_scores = run['remove_duplicate_scores']
249 print_matrix = run['print_matrix']
250 anzpath = run['anzpath']
251
252 # Initialize parameter vector /
253 #param = Conf.fixedParam[:run['numFeatures']]
254 param = numpy.matlib.rand(run['numFeatures'],1)
255
256 lengthSP = run['numLengthSuppPoints']
257 donSP = run['numDonSuppPoints']
258 accSP = run['numAccSuppPoints']
259 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
260 numq = run['numQualSuppPoints']
261 totalQualSP = run['totalQualSuppPoints']
262
263 # no intron length model
264 if not run['enable_intron_length']:
265 param[:lengthSP] *= 0.0
266
267 # Set the parameters such as limits penalties for the Plifs
268 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
269
270 # Initialize solver
271 self.plog('Initializing problem...\n')
272
273 #try:
274 # solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
275 #except:
276 # self.plog('Got no license. Telling queue to reschedule job...\n')
277 # sys.exit(99)
278
279 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
280 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
281
282 # stores the number of alignments done for each example (best path, second-best path etc.)
283 num_path = [anzpath]*numExamples
284 # stores the gap for each example
285 gap = [0.0]*numExamples
286 #############################################################################################
287 # Training
288 #############################################################################################
289 self.plog('Starting training...\n')
290
291 currentPhi = zeros((run['numFeatures'],1))
292 totalQualityPenalties = zeros((totalQualSP,1))
293
294 numConstPerRound = run['numConstraintsPerRound']
295 solver_call_ctr = 0
296
297 suboptimal_example = 0
298 iteration_nr = 0
299 param_idx = 0
300 const_added_ctr = 0
301
302 featureVectors = zeros((run['numFeatures'],numExamples))
303
304 # the main training loop
305 while True:
306 if iteration_nr == iteration_steps:
307 break
308
309 for exampleIdx,example_key in enumerate(training_set.keys()):
310 print 'Current example %d' % example_key
311 try:
312 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
313 getData(training_set,example_key,run)
314 except SpliceSiteException:
315 continue
316
317 dna_len = len(dna)
318
319 if run['mode'] == 'normal':
320 quality = [40]*len(est)
321
322 if run['mode'] == 'using_quality_scores':
323 quality = currentQualities[0]
324
325 if not run['enable_quality_scores']:
326 quality = [40]*len(est)
327
328 if not run['enable_splice_signals']:
329 for idx,elem in enumerate(don_supp):
330 if elem != -inf:
331 don_supp[idx] = 0.0
332
333 for idx,elem in enumerate(acc_supp):
334 if elem != -inf:
335 acc_supp[idx] = 0.0
336
337 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
338 if run['mode'] == 'using_quality_scores':
339 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
340 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
341 quality, qualityPlifs,run)
342 else:
343 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
344
345 dna_calc = dna_calc.replace('-','')
346
347 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
348 # Calculate the weights
349 trueWeightDon, trueWeightAcc, trueWeightIntron =\
350 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
351 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
352
353 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
354 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
355 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
356 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
357
358 if run['mode'] == 'using_quality_scores':
359 totalQualityPenalties = param[-totalQualSP:]
360 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
361
362 # Calculate w'phi(x,y) the total score of the alignment
363 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
364
365 # The allWeights vector is supposed to store the weight parameter
366 # of the true alignment as well as the weight parameters of the
367 # num_path[exampleIdx] other alignments
368 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
369 allWeights[:,0] = trueWeight[:,0]
370
371 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
372 AlignmentScores[0] = trueAlignmentScore
373
374 ################## Calculate wrong alignment(s) ######################
375 # Compute donor, acceptor with penalty_lookup_new
376 # returns two double lists
377 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
378
379 #myalign wants the acceptor site on the g of the ag
380 #acceptor = acceptor[1:]
381 #acceptor.append(-inf)
382
383 #donor = [-inf] + donor[:-1]
384
385 ps = h.convert2SWIG()
386
387 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
388 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
389 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
390 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
391
392 newSpliceAlign = _newSpliceAlign
393 newEstAlign = _newEstAlign
394 newWeightMatch = _newWeightMatch
395 newDPScores = _newDPScores
396 newQualityPlifsFeatures = _newQualityPlifsFeatures
397
398 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
399 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
400
401 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
402 # Calculate weights of the respective alignments. Note that we are
403 # calculating n-best alignments without hamming loss, so we
404 # have to keep track which of the n-best alignments correspond to
405 # the true one in order not to incorporate a true alignment in the
406 # constraints. To keep track of the true and false alignments we
407 # define an array true_map with a boolean indicating the
408 # equivalence to the true alignment for each decoded alignment.
409 true_map = [0]*(num_path[exampleIdx]+1)
410 true_map[0] = 1
411
412 for pathNr in range(num_path[exampleIdx]):
413 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
414 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
415 acc_supp)
416
417 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
418 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
419 # Gewichte in restliche Zeilen der Matrix speichern
420 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
421
422 hpen = mat(h.penalties).reshape(len(h.penalties),1)
423 dpen = mat(d.penalties).reshape(len(d.penalties),1)
424 apen = mat(a.penalties).reshape(len(a.penalties),1)
425 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
426
427 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
428
429 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
430
431 distinct_scores = False
432 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
433 distinct_scores = True
434
435 # Check wether scalar product + loss equals viterbi score
436 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
437 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
438 pdb.set_trace()
439
440 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
441 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
442
443 # if the pathNr-best alignment is very close to the true alignment consider it as true
444 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
445 true_map[pathNr+1] = 1
446
447 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
448 print "suboptimal_example %d\n" %exampleIdx
449 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
450 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
451
452 #pdb.set_trace()
453 suboptimal_example += 1
454 self.plog("suboptimal_example %d\n" %exampleIdx)
455
456 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
457 # this means that all n-best paths are to close to each other
458 # we have to extend the n-best search to a (n+1)-best
459 if len([elem for elem in true_map if elem == 1]) == len(true_map):
460 num_path[exampleIdx] = num_path[exampleIdx]+1
461
462 # Choose true and first false alignment for extending
463 firstFalseIdx = -1
464 for map_idx,elem in enumerate(true_map):
465 if elem == 0:
466 firstFalseIdx = map_idx
467 break
468
469 if False:
470 self.plog("Is considered as: %d\n" % true_map[1])
471
472 result_len = currentAlignment.getResultLength()
473 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
474 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
475
476 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
477
478 dna_array = [0.0]*result_len
479 est_array = [0.0]*result_len
480
481 for r_idx in range(result_len):
482 dna_array[r_idx] = c_dna_array[r_idx]
483 est_array[r_idx] = c_est_array[r_idx]
484
485 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
486 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
487
488 #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
489 #self.plog(line1+'\n')
490 #self.plog(line2+'\n')
491 #self.plog(line3+'\n')
492
493 # if there is at least one useful false alignment add the
494 # corresponding constraints to the optimization problem
495 if firstFalseIdx != -1:
496 firstFalseWeights = allWeights[:,firstFalseIdx]
497 differenceVector = trueWeight - firstFalseWeights
498 #pdb.set_trace()
499
500 #print 'NOT ADDING ANY CONSTRAINTS'
501 const_added = solver.addConstraint(differenceVector, exampleIdx)
502
503 const_added_ctr += 1
504 #
505 # end of one example processing
506 #
507
508 # call solver every nth example //added constraint
509 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
510 objValue,w,self.slacks = solver.solve()
511 solver_call_ctr += 1
512
513 if solver_call_ctr == 5:
514 numConstPerRound = 200
515 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
516
517 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
518 self.noImprovementCtr += 1
519
520 if self.noImprovementCtr == numExamples+1:
521 break
522
523 self.oldObjValue = objValue
524 print "objValue is %f" % objValue
525
526 sum_xis = 0
527 for elem in self.slacks:
528 sum_xis += elem
529
530 print 'sum of slacks is %f'% sum_xis
531 self.plog('sum of slacks is %f\n'% sum_xis)
532
533 for i in range(len(param)):
534 param[i] = w[i]
535
536 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
537 param_idx += 1
538 [h,d,a,mmatrix,qualityPlifs] =\
539 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
540
541 #
542 # end of one iteration through all examples
543 #
544
545 self.plog("suboptimal rounds %d\n" %suboptimal_example)
546
547 if self.noImprovementCtr == numExamples*2:
548 break
549
550 iteration_nr += 1
551
552 #
553 # end of optimization
554 #
555 print 'Training completed'
556
557 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
558 self.logfh.close()
559
560
561 ###############################################################################
562 #
563 # End of the code needed for training
564 #
565 # Begin of code for prediction
566 #
567 ###############################################################################
568
569 def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
570 """
571 Performing a prediction takes...
572 """
573 self.run = run
574 self.set_name = set_name
575
576 full_working_path = run['result_dir']
577
578 print 'full_working_path is %s' % full_working_path
579
580 #assert not os.path.exists(full_working_path)
581 if not os.path.exists(full_working_path):
582 os.mkdir(full_working_path)
583
584 assert os.path.exists(full_working_path)
585
586 # ATTENTION: Changing working directory
587 os.chdir(full_working_path)
588
589 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
590
591 # number of prediction instances
592 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
593
594 # load dataset and fetch instances that shall be predicted
595 dataset = cPickle.load(open(dataset_fn))
596
597 prediction_set = {}
598 for key in prediction_keys:
599 prediction_set[key] = dataset[key]
600
601 # we do not need the full dataset anymore
602 del dataset
603
604 # Load parameter vector to predict with
605 param = cPickle.load(open(param_fn))
606
607 self.predict(run,prediction_set,param,seqInfo)
608
609
610 def predict(self,run,prediction_set,param,seqInfo):
611 """
612 This method contains the prediction loop over all reads of the given dataset.
613 """
614
615 self.run = run
616 self.read_size = 38
617
618 self.use_quality_scores = False
619 if self.run['mode'] == 'using_quality_scores':
620 use_quality_scores = True
621
622 # Set the parameters such as limits/penalties for the Plifs
623 [h,d,a,mmatrix,qualityPlifs] =\
624 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
625
626 #############################################################################################
627 # Prediction
628 #############################################################################################
629
630 if not self.qpalma_debug_mode:
631 self.plog('Starting prediction...\n')
632
633 #donSP = self.run['numDonSuppPoints']
634 #accSP = self.run['numAccSuppPoints']
635 #lengthSP = self.run['numLengthSuppPoints']
636 #mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
637 #numq = self.run['numQualSuppPoints']
638 #totalQualSP = self.run['totalQualSuppPoints']
639 #totalQualityPenalties = zeros((totalQualSP,1))
640
641 problem_ctr = 0
642
643 # where we store the predictions
644 allPredictions = []
645
646 # we take the first quality vector of the tuple of quality vectors
647 quality_index = 0
648
649 # beginning of the prediction loop
650 for example_key in prediction_set.keys():
651 for example in prediction_set[example_key]:
652
653 currentSeqInfo,read,currentQualities = example
654
655 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
656 currentSeqInfo
657
658 if not self.qpalma_debug_mode:
659 self.plog('Loading example id: %d...\n'% int(id))
660
661 if use_quality_scores:
662 quality = currentQualities[quality_index]
663 else:
664 quality = [40]*len(read)
665
666 try:
667 currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
668 except:
669 problem_ctr += 1
670 continue
671
672 if not run['enable_splice_signals']:
673 for idx in range(len(currentDon)):
674 if currentDon[idx] != -inf:
675 currentDon[idx] = 0.0
676
677 if currentAcc[idx] != -inf:
678 currentAcc[idx] = 0.0
679
680 current_prediction = self.calc_alignment(currentDNASeq, read,\
681 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
682
683 current_prediction['id'] = id
684 current_prediction['chr'] = chromo
685 current_prediction['strand'] = strand
686 current_prediction['start_pos'] = genomicSeq_start
687
688 allPredictions.append(current_prediction)
689
690 if not self.qpalma_debug_mode:
691 self.finalizePrediction(allPredictions,problem_ctr)
692 else:
693 return allPredictions
694
695
696 def finalizePrediction(self,allPredictions,problem_ctr):
697 # end of the prediction loop we save all predictions in a pickle file and exit
698 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
699 print 'Prediction completed'
700 self.plog('Prediction completed\n')
701 mes = 'Problem ctr %d' % problem_ctr
702 print mes
703 self.plog(mes+'\n')
704 self.logfh.close()
705
706
707 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
708 """
709 Given two sequences and the parameters we calculate on alignment
710 """
711
712 run = self.run
713 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
714
715 dna = str(dna)
716 read = str(read)
717
718 if '-' in read:
719 self.plog('found gap\n')
720 read = read.replace('-','')
721 assert len(read) == self.read_size
722
723 dna_len = len(dna)
724 read_len = len(read)
725
726 ps = h.convert2SWIG()
727
728 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
729 newQualityPlifsFeatures, dna_array, read_array =\
730 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
731
732 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
733
734 # old code removed
735 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
736 newWeightMatch = newWeightMatch.reshape(1,mm_len)
737 true_map = [0]*2
738 true_map[0] = 1
739 pathNr = 0
740
741 _newSpliceAlign = array.array('B',newSpliceAlign.flatten().tolist()[0])
742 _newEstAlign = array.array('B',newEstAlign.flatten().tolist()[0])
743
744 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
745
746 dna_array = array.array('B',dna_array)
747 read_array = array.array('B',read_array)
748
749 newExons = self.calculatePredictedExons(newSpliceAlign)
750
751 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
752 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
753 'dna_array':dna_array, 'read_array':read_array }
754
755 return current_prediction
756
757
758 def calculatePredictedExons(self,SpliceAlign):
759 newExons = []
760 oldElem = -1
761 SpliceAlign = SpliceAlign.flatten().tolist()[0]
762 SpliceAlign.append(-1)
763 for pos,elem in enumerate(SpliceAlign):
764 if pos == 0:
765 oldElem = -1
766 else:
767 oldElem = SpliceAlign[pos-1]
768
769 if oldElem != 0 and elem == 0: # start of exon
770 newExons.append(pos)
771
772 if oldElem == 0 and elem != 0: # end of exon
773 newExons.append(pos)
774
775 return newExons