afac4b8ab38774cf147796eff4abdbb61af820ce
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import array
18 import cPickle
19 import os.path
20 import pdb
21 import sys
22
23 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
24
25 import numpy
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy.linalg import norm
28
29 import QPalmaDP
30 import qpalma
31
32 #from qpalma.SIQP_CPX import SIQPSolver
33 #from qpalma.SIQP_CVXOPT import SIQPSolver
34
35 from qpalma.computeSpliceWeights import *
36 from qpalma.set_param_palma import *
37 from qpalma.computeSpliceAlignWithQuality import *
38 from qpalma.TrainingParam import Param
39 from qpalma.Plif import Plf,compute_donacc
40
41 import QPalmaConfiguration as Conf
42
43 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
44
45 class SpliceSiteException:
46 pass
47
48
49 def getData(training_set,exampleKey,run):
50 currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
51 id,chr,strand,up_cut,down_cut = currentSeqInfo
52
53 est = original_est
54 est = "".join(est)
55 est = est.lower()
56 est = unbracket_est(est)
57 est = est.replace('-','')
58
59 assert len(est) == run['read_size'], pdb.set_trace()
60 est_len = len(est)
61
62 #original_est = OriginalEsts[exampleIdx]
63 original_est = "".join(original_est)
64 original_est = original_est.lower()
65
66 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
67 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
68
69 original_exons = currentExons
70 exons = original_exons - (up_cut-1)
71 exons[0,0] -= 1
72 exons[1,0] -= 1
73
74 if exons.shape == (2,2):
75 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
76
77 donor_elem = dna[exons[0,1]:exons[0,1]+2]
78 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
79
80 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
81 print 'invalid donor in example %d'% exampleKey
82 raise SpliceSiteException
83
84 if not ( acceptor_elem == 'ag' ):
85 print 'invalid acceptor in example %d'% exampleKey
86 raise SpliceSiteException
87
88 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
89
90 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
91
92
93
94 class QPalma:
95 """
96 This class wraps the training and prediction functions for
97 the alignment.
98 """
99
100 def __init__(self,dmode=False):
101 self.ARGS = Param()
102 self.qpalma_debug_mode = dmode
103
104
105 def plog(self,string):
106 self.logfh.write(string)
107 self.logfh.flush()
108
109
110 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
111 """
112 Given the needed input this method calls the QPalma C module which
113 calculates a dynamic programming in order to obtain an alignment
114 """
115 run = self.run
116
117 dna_len = len(dna)
118 est_len = len(est)
119
120 prb = QPalmaDP.createDoubleArrayFromList(quality)
121 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
122
123 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
124 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
125
126 d_len = len(donor)
127 donor = QPalmaDP.createDoubleArrayFromList(donor)
128 a_len = len(acceptor)
129 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
130
131 # Create the alignment object representing the interface to the C/C++ code.
132 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
133 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
134 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
135 currentAlignment.myalign( current_num_path, dna, dna_len,\
136 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
137 acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
138 run['print_matrix'] )
139
140 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
141 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
142 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
143 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
144
145 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
146
147 if prediction_mode:
148 # part that is only needed for prediction
149 result_len = currentAlignment.getResultLength()
150 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
151 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
152
153 currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
154 __dna_array,__est_array = currentAlignment.getAlignmentArrays()
155
156 dna_array = [0.0]*result_len
157 est_array = [0.0]*result_len
158
159 for r_idx in range(result_len):
160 dna_array[r_idx] = c_dna_array[r_idx]
161 est_array[r_idx] = c_est_array[r_idx]
162
163 else:
164 dna_array = None
165 est_array = None
166
167 assert dna_array == __dna_array
168 assert est_array == __est_array
169
170 currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
171 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
172
173 __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
174 currentAlignment.getAlignmentResults()
175
176 newSpliceAlign = zeros((current_num_path*dna_len,1))
177 newEstAlign = zeros((est_len*current_num_path,1))
178 newWeightMatch = zeros((current_num_path*mm_len,1))
179 newDPScores = zeros((current_num_path,1))
180 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
181
182 for i in range(dna_len*current_num_path):
183 newSpliceAlign[i] = c_SpliceAlign[i]
184
185 for i in range(est_len*current_num_path):
186 newEstAlign[i] = c_EstAlign[i]
187
188 for i in range(mm_len*current_num_path):
189 newWeightMatch[i] = c_WeightMatch[i]
190
191 for i in range(current_num_path):
192 newDPScores[i] = c_DPScores[i]
193
194 if self.use_quality_scores:
195 for i in range(run['totalQualSuppPoints']*current_num_path):
196 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
197
198 del c_SpliceAlign
199 del c_EstAlign
200 del c_WeightMatch
201 del c_DPScores
202 del c_qualityPlifsFeatures
203 del currentAlignment
204
205 assert newSpliceAlign = __newSpliceAlign
206 assert newEstAlign = __newEstAlign
207 assert newWeightMatch = __newWeightMatch
208 assert newDPScores = __newDPScores
209 assert newQualityPlifsFeatures = __newQualityPlifsFeatures
210
211 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
212 newQualityPlifsFeatures, dna_array, est_array
213
214
215 def train(self,run,training_set):
216 self.run = run
217
218 full_working_path = os.path.join(run['alignment_dir'],run['name'])
219
220 #assert not os.path.exists(full_working_path)
221 if not os.path.exists(full_working_path):
222 os.mkdir(full_working_path)
223
224 assert os.path.exists(full_working_path)
225
226 # ATTENTION: Changing working directory
227 os.chdir(full_working_path)
228
229 self.logfh = open('_qpalma_train.log','w+')
230 cPickle.dump(run,open('run_obj.pickle','w+'))
231
232 self.plog("Settings are:\n")
233 self.plog("%s\n"%str(run))
234
235 if self.run['mode'] == 'normal':
236 self.use_quality_scores = False
237
238 elif self.run['mode'] == 'using_quality_scores':
239 self.use_quality_scores = True
240 else:
241 assert(False)
242
243 numExamples = len(training_set)
244 self.plog('Number of training examples: %d\n'% numExamples)
245
246 self.noImprovementCtr = 0
247 self.oldObjValue = 1e8
248
249 iteration_steps = run['iter_steps']
250 remove_duplicate_scores = run['remove_duplicate_scores']
251 print_matrix = run['print_matrix']
252 anzpath = run['anzpath']
253
254 # Initialize parameter vector /
255 #param = Conf.fixedParam[:run['numFeatures']]
256 param = numpy.matlib.rand(run['numFeatures'],1)
257
258 lengthSP = run['numLengthSuppPoints']
259 donSP = run['numDonSuppPoints']
260 accSP = run['numAccSuppPoints']
261 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
262 numq = run['numQualSuppPoints']
263 totalQualSP = run['totalQualSuppPoints']
264
265 # no intron length model
266 if not run['enable_intron_length']:
267 param[:lengthSP] *= 0.0
268
269 # Set the parameters such as limits penalties for the Plifs
270 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
271
272 # Initialize solver
273 self.plog('Initializing problem...\n')
274
275 #try:
276 # solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
277 #except:
278 # self.plog('Got no license. Telling queue to reschedule job...\n')
279 # sys.exit(99)
280
281 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
282 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
283
284 # stores the number of alignments done for each example (best path, second-best path etc.)
285 num_path = [anzpath]*numExamples
286 # stores the gap for each example
287 gap = [0.0]*numExamples
288 #############################################################################################
289 # Training
290 #############################################################################################
291 self.plog('Starting training...\n')
292
293 currentPhi = zeros((run['numFeatures'],1))
294 totalQualityPenalties = zeros((totalQualSP,1))
295
296 numConstPerRound = run['numConstraintsPerRound']
297 solver_call_ctr = 0
298
299 suboptimal_example = 0
300 iteration_nr = 0
301 param_idx = 0
302 const_added_ctr = 0
303
304 featureVectors = zeros((run['numFeatures'],numExamples))
305
306 # the main training loop
307 while True:
308 if iteration_nr == iteration_steps:
309 break
310
311 for exampleIdx,example_key in enumerate(training_set.keys()):
312 print 'Current example %d' % example_key
313 try:
314 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
315 getData(training_set,example_key,run)
316 except SpliceSiteException:
317 continue
318
319 dna_len = len(dna)
320
321 if run['mode'] == 'normal':
322 quality = [40]*len(est)
323
324 if run['mode'] == 'using_quality_scores':
325 quality = currentQualities[0]
326
327 if not run['enable_quality_scores']:
328 quality = [40]*len(est)
329
330 if not run['enable_splice_signals']:
331 for idx,elem in enumerate(don_supp):
332 if elem != -inf:
333 don_supp[idx] = 0.0
334
335 for idx,elem in enumerate(acc_supp):
336 if elem != -inf:
337 acc_supp[idx] = 0.0
338
339 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
340 if run['mode'] == 'using_quality_scores':
341 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
342 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
343 quality, qualityPlifs,run)
344 else:
345 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
346
347 dna_calc = dna_calc.replace('-','')
348
349 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
350 # Calculate the weights
351 trueWeightDon, trueWeightAcc, trueWeightIntron =\
352 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
353 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
354
355 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
356 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
357 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
358 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
359
360 if run['mode'] == 'using_quality_scores':
361 totalQualityPenalties = param[-totalQualSP:]
362 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
363
364 # Calculate w'phi(x,y) the total score of the alignment
365 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
366
367 # The allWeights vector is supposed to store the weight parameter
368 # of the true alignment as well as the weight parameters of the
369 # num_path[exampleIdx] other alignments
370 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
371 allWeights[:,0] = trueWeight[:,0]
372
373 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
374 AlignmentScores[0] = trueAlignmentScore
375
376 ################## Calculate wrong alignment(s) ######################
377 # Compute donor, acceptor with penalty_lookup_new
378 # returns two double lists
379 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
380
381 #myalign wants the acceptor site on the g of the ag
382 #acceptor = acceptor[1:]
383 #acceptor.append(-inf)
384
385 #donor = [-inf] + donor[:-1]
386
387 ps = h.convert2SWIG()
388
389 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
390 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
391 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
392 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
393
394 newSpliceAlign = _newSpliceAlign
395 newEstAlign = _newEstAlign
396 newWeightMatch = _newWeightMatch
397 newDPScores = _newDPScores
398 newQualityPlifsFeatures = _newQualityPlifsFeatures
399
400 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
401 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
402
403 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
404 # Calculate weights of the respective alignments. Note that we are
405 # calculating n-best alignments without hamming loss, so we
406 # have to keep track which of the n-best alignments correspond to
407 # the true one in order not to incorporate a true alignment in the
408 # constraints. To keep track of the true and false alignments we
409 # define an array true_map with a boolean indicating the
410 # equivalence to the true alignment for each decoded alignment.
411 true_map = [0]*(num_path[exampleIdx]+1)
412 true_map[0] = 1
413
414 for pathNr in range(num_path[exampleIdx]):
415 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
416 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
417 acc_supp)
418
419 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
420 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
421 # Gewichte in restliche Zeilen der Matrix speichern
422 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
423
424 hpen = mat(h.penalties).reshape(len(h.penalties),1)
425 dpen = mat(d.penalties).reshape(len(d.penalties),1)
426 apen = mat(a.penalties).reshape(len(a.penalties),1)
427 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
428
429 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
430
431 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
432
433 distinct_scores = False
434 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
435 distinct_scores = True
436
437 # Check wether scalar product + loss equals viterbi score
438 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
439 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
440 pdb.set_trace()
441
442 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
443 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
444
445 # if the pathNr-best alignment is very close to the true alignment consider it as true
446 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
447 true_map[pathNr+1] = 1
448
449 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
450 print "suboptimal_example %d\n" %exampleIdx
451 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
452 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
453
454 #pdb.set_trace()
455 suboptimal_example += 1
456 self.plog("suboptimal_example %d\n" %exampleIdx)
457
458 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
459 # this means that all n-best paths are to close to each other
460 # we have to extend the n-best search to a (n+1)-best
461 if len([elem for elem in true_map if elem == 1]) == len(true_map):
462 num_path[exampleIdx] = num_path[exampleIdx]+1
463
464 # Choose true and first false alignment for extending
465 firstFalseIdx = -1
466 for map_idx,elem in enumerate(true_map):
467 if elem == 0:
468 firstFalseIdx = map_idx
469 break
470
471 if False:
472 self.plog("Is considered as: %d\n" % true_map[1])
473
474 result_len = currentAlignment.getResultLength()
475 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
476 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
477
478 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
479
480 dna_array = [0.0]*result_len
481 est_array = [0.0]*result_len
482
483 for r_idx in range(result_len):
484 dna_array[r_idx] = c_dna_array[r_idx]
485 est_array[r_idx] = c_est_array[r_idx]
486
487 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
488 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
489
490 #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
491 #self.plog(line1+'\n')
492 #self.plog(line2+'\n')
493 #self.plog(line3+'\n')
494
495 # if there is at least one useful false alignment add the
496 # corresponding constraints to the optimization problem
497 if firstFalseIdx != -1:
498 firstFalseWeights = allWeights[:,firstFalseIdx]
499 differenceVector = trueWeight - firstFalseWeights
500 #pdb.set_trace()
501
502 #print 'NOT ADDING ANY CONSTRAINTS'
503 const_added = solver.addConstraint(differenceVector, exampleIdx)
504
505 const_added_ctr += 1
506 #
507 # end of one example processing
508 #
509
510 # call solver every nth example //added constraint
511 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
512 objValue,w,self.slacks = solver.solve()
513 solver_call_ctr += 1
514
515 if solver_call_ctr == 5:
516 numConstPerRound = 200
517 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
518
519 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
520 self.noImprovementCtr += 1
521
522 if self.noImprovementCtr == numExamples+1:
523 break
524
525 self.oldObjValue = objValue
526 print "objValue is %f" % objValue
527
528 sum_xis = 0
529 for elem in self.slacks:
530 sum_xis += elem
531
532 print 'sum of slacks is %f'% sum_xis
533 self.plog('sum of slacks is %f\n'% sum_xis)
534
535 for i in range(len(param)):
536 param[i] = w[i]
537
538 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
539 param_idx += 1
540 [h,d,a,mmatrix,qualityPlifs] =\
541 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
542
543 #
544 # end of one iteration through all examples
545 #
546
547 self.plog("suboptimal rounds %d\n" %suboptimal_example)
548
549 if self.noImprovementCtr == numExamples*2:
550 break
551
552 iteration_nr += 1
553
554 #
555 # end of optimization
556 #
557 print 'Training completed'
558
559 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
560 self.logfh.close()
561
562
563 ###############################################################################
564 #
565 # End of the code needed for training
566 #
567 # Begin of code for prediction
568 #
569 ###############################################################################
570
571 def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
572 """
573 Performing a prediction takes...
574 """
575 self.run = run
576 self.set_name = set_name
577
578 full_working_path = run['result_dir']
579
580 print 'full_working_path is %s' % full_working_path
581
582 #assert not os.path.exists(full_working_path)
583 if not os.path.exists(full_working_path):
584 os.mkdir(full_working_path)
585
586 assert os.path.exists(full_working_path)
587
588 # ATTENTION: Changing working directory
589 os.chdir(full_working_path)
590
591 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
592
593 # number of prediction instances
594 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
595
596 # load dataset and fetch instances that shall be predicted
597 dataset = cPickle.load(open(dataset_fn))
598
599 prediction_set = {}
600 for key in prediction_keys:
601 prediction_set[key] = dataset[key]
602
603 # we do not need the full dataset anymore
604 del dataset
605
606 # Load parameter vector to predict with
607 param = cPickle.load(open(param_fn))
608
609 self.predict(run,prediction_set,param,seqInfo)
610
611
612 def predict(self,run,prediction_set,param,seqInfo):
613 """
614 This method contains the prediction loop over all reads of the given dataset.
615 """
616
617 self.run = run
618
619 self.use_quality_scores = False
620 if self.run['mode'] == 'using_quality_scores':
621 use_quality_scores = True
622
623 # Set the parameters such as limits/penalties for the Plifs
624 [h,d,a,mmatrix,qualityPlifs] =\
625 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
626
627 #############################################################################################
628 # Prediction
629 #############################################################################################
630
631 if not self.qpalma_debug_mode:
632 self.plog('Starting prediction...\n')
633
634 #donSP = self.run['numDonSuppPoints']
635 #accSP = self.run['numAccSuppPoints']
636 #lengthSP = self.run['numLengthSuppPoints']
637 #mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
638 #numq = self.run['numQualSuppPoints']
639 #totalQualSP = self.run['totalQualSuppPoints']
640 #totalQualityPenalties = zeros((totalQualSP,1))
641
642 problem_ctr = 0
643
644 # where we store the predictions
645 allPredictions = []
646
647 # we take the first quality vector of the tuple of quality vectors
648 quality_index = 0
649
650 # beginning of the prediction loop
651 for example_key in prediction_set.keys():
652 for example in prediction_set[example_key]:
653
654 currentSeqInfo,read,currentQualities = example
655
656 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
657 currentSeqInfo
658
659 if not self.qpalma_debug_mode:
660 self.plog('Loading example id: %d...\n'% int(id))
661
662 if use_quality_scores = True
663 quality = currentQualities[quality_index]
664 else:
665 quality = [40]*len(read)
666
667 try:
668 currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
669 except:
670 problem_ctr += 1
671 continue
672
673 if not run['enable_splice_signals']:
674 for idx in range(len(currentDon)):
675 if currentDon[idx] != -inf:
676 currentDon[idx] = 0.0
677
678 if currentAcc[idx] != -inf:
679 currentAcc[idx] = 0.0
680
681 current_prediction = self.calc_alignment(currentDNASeq, read,\
682 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
683
684 current_prediction['id'] = id
685 current_prediction['chr'] = chromo
686 current_prediction['strand'] = strand
687 current_prediction['start_pos'] = genomicSeq_start
688
689 allPredictions.append(current_prediction)
690
691 if not self.qpalma_debug_mode:
692 self.finalizePrediction(allPredictions,problem_ctr)
693 else:
694 return allPredictions
695
696
697 def finalizePrediction(self,allPredictions,problem_ctr):
698 # end of the prediction loop we save all predictions in a pickle file and exit
699 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
700 print 'Prediction completed'
701 self.plog('Prediction completed\n')
702 mes = 'Problem ctr %d' % problem_ctr
703 print mes
704 self.plog(mes+'\n')
705 self.logfh.close()
706
707
708 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
709 """
710 Given two sequences and the parameters we calculate on alignment
711 """
712
713 run = self.run
714 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
715
716 dna = str(dna)
717 read = str(read)
718
719 if '-' in read:
720 self.plog('found gap\n')
721 read = read.replace('-','')
722 assert len(read) == Conf.read_size
723
724 dna_len = len(dna)
725 read_len = len(read)
726
727 ps = h.convert2SWIG()
728
729 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
730 newQualityPlifsFeatures, dna_array, read_array =\
731 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
732
733 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
734
735 # old code removed
736 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
737 newWeightMatch = newWeightMatch.reshape(1,mm_len)
738 true_map = [0]*2
739 true_map[0] = 1
740 pathNr = 0
741
742 _newSpliceAlign = array.array('B',newSpliceAlign.flatten().tolist()[0])
743 _newEstAlign = array.array('B',newEstAlign.flatten().tolist()[0])
744
745 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
746
747 dna_array = array.array('B',dna_array)
748 read_array = array.array('B',read_array)
749
750 newExons = self.calculatePredictedExons(newSpliceAlign)
751
752 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
753 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
754 'dna_array':dna_array, 'read_array':read_array }
755
756 return current_prediction
757
758
759 def calculatePredictedExons(self,SpliceAlign):
760 newExons = []
761 oldElem = -1
762 SpliceAlign = SpliceAlign.flatten().tolist()[0]
763 SpliceAlign.append(-1)
764 for pos,elem in enumerate(SpliceAlign):
765 if pos == 0:
766 oldElem = -1
767 else:
768 oldElem = SpliceAlign[pos-1]
769
770 if oldElem != 0 and elem == 0: # start of exon
771 newExons.append(pos)
772
773 if oldElem == 0 and elem != 0: # end of exon
774 newExons.append(pos)
775
776 return newExons