b3349be4ae21f1ef0cc68416daafe453ad673734
[qpalma.git] / scripts / qpalma_main.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import os.path
12 import pdb
13 import sys
14
15 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
16
17 import numpy
18 from numpy.matlib import mat,zeros,ones,inf
19 from numpy.linalg import norm
20
21 #from qpalma.SIQP_CPX import SIQPSolver
22 #from qpalma.SIQP_CVXOPT import SIQPSolver
23
24 import QPalmaDP
25 import qpalma
26 from qpalma.computeSpliceWeights import *
27 from qpalma.set_param_palma import *
28 from qpalma.computeSpliceAlignWithQuality import *
29 from qpalma.TrainingParam import Param
30 from qpalma.Plif import Plf,compute_donacc
31
32 from qpalma.utils import pprint_alignment, get_alignment
33
34 class SpliceSiteException:
35 pass
36
37
38 def getData(training_set,exampleKey,run):
39 """ This function... """
40
41 currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
42 id,chr,strand,up_cut,down_cut = currentSeqInfo
43
44 est = original_est
45 est = "".join(est)
46 est = est.lower()
47 est = unbracket_est(est)
48 est = est.replace('-','')
49
50 assert len(est) == run['read_size'], pdb.set_trace()
51 est_len = len(est)
52
53 #original_est = OriginalEsts[exampleIdx]
54 original_est = "".join(original_est)
55 original_est = original_est.lower()
56
57 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
58 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
59
60 #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
61
62 original_exons = currentExons
63 exons = original_exons - (up_cut-1)
64 exons[0,0] -= 1
65 exons[1,0] -= 1
66
67 if exons.shape == (2,2):
68 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
69
70 donor_elem = dna[exons[0,1]:exons[0,1]+2]
71 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
72
73 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
74 print 'invalid donor in example %d'% exampleKey
75 raise SpliceSiteException
76
77 if not ( acceptor_elem == 'ag' ):
78 print 'invalid acceptor in example %d'% exampleKey
79 raise SpliceSiteException
80
81 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
82
83 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
84
85
86 class QPalma:
87 """
88 This class wraps the training and prediction functions for
89 the alignment.
90 """
91
92 def __init__(self,run,seqInfo,dmode=False):
93 self.ARGS = Param()
94 self.qpalma_debug_mode = dmode
95 self.run = run
96 self.seqInfo = seqInfo
97
98
99 def plog(self,string):
100 self.logfh.write(string)
101 self.logfh.flush()
102
103
104 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
105 """
106 Given the needed input this method calls the QPalma C module which
107 calculates a dynamic programming in order to obtain an alignment
108 """
109
110 dna_len = len(dna)
111 est_len = len(est)
112
113 prb = QPalmaDP.createDoubleArrayFromList(quality)
114 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
115
116 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
117 mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
118
119 d_len = len(donor)
120 donor = QPalmaDP.createDoubleArrayFromList(donor)
121 a_len = len(acceptor)
122 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
123
124 # Create the alignment object representing the interface to the C/C++ code.
125 currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
126 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
127 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
128 currentAlignment.myalign( current_num_path, dna, dna_len,\
129 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
130 acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
131 self.run['print_matrix'] )
132
133 if prediction_mode:
134 # part that is only needed for prediction
135 result_len = currentAlignment.getResultLength()
136 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
137 else:
138 dna_array = None
139 est_array = None
140
141 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
142 currentAlignment.getAlignmentResultsNew()
143
144 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
145 newQualityPlifsFeatures, dna_array, est_array
146
147
148 def init_train(self,training_set):
149 full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
150
151 #assert not os.path.exists(full_working_path)
152 if not os.path.exists(full_working_path):
153 os.mkdir(full_working_path)
154
155 assert os.path.exists(full_working_path)
156
157 # ATTENTION: Changing working directory
158 os.chdir(full_working_path)
159
160 self.logfh = open('_qpalma_train.log','w+')
161 cPickle.dump(self.run,open('run_obj.pickle','w+'))
162
163 self.plog("Settings are:\n")
164 self.plog("%s\n"%str(self.run))
165
166 if self.run['mode'] == 'normal':
167 self.use_quality_scores = False
168
169 elif self.run['mode'] == 'using_quality_scores':
170 self.use_quality_scores = True
171 else:
172 assert(False)
173
174
175 def setUpSolver(self):
176 # Initialize solver
177 self.plog('Initializing problem...\n')
178
179 try:
180 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
181 except:
182 self.plog('Got no license. Telling queue to reschedule job...\n')
183 sys.exit(99)
184
185 solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
186 solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
187
188 return solver
189
190
191 def train(self,training_set):
192 """
193 The mainloop for training.
194 """
195
196 numExamples = len(training_set)
197 self.plog('Number of training examples: %d\n'% numExamples)
198
199 self.noImprovementCtr = 0
200 self.oldObjValue = 1e8
201
202 iteration_steps = self.run['iter_steps']
203 remove_duplicate_scores = self.run['remove_duplicate_scores']
204 print_matrix = self.run['print_matrix']
205 anzpath = self.run['anzpath']
206
207 # Initialize parameter vector
208 param = numpy.matlib.rand(run['numFeatures'],1)
209
210 lengthSP = self.run['numLengthSuppPoints']
211 donSP = self.run['numDonSuppPoints']
212 accSP = self.run['numAccSuppPoints']
213 mmatrixSP = self.run['matchmatrixRows']*run['matchmatrixCols']
214 numq = self.run['numQualSuppPoints']
215 totalQualSP = self.run['totalQualSuppPoints']
216
217 # no intron length model
218 if not self.run['enable_intron_length']:
219 param[:lengthSP] *= 0.0
220
221 # Set the parameters such as limits penalties for the Plifs
222 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
223
224 solver = self.setUpSolver()
225
226 # stores the number of alignments done for each example (best path, second-best path etc.)
227 num_path = [anzpath]*numExamples
228
229 # stores the gap for each example
230 gap = [0.0]*numExamples
231
232 currentPhi = zeros((run['numFeatures'],1))
233 totalQualityPenalties = zeros((totalQualSP,1))
234
235 numConstPerRound = run['numConstraintsPerRound']
236 solver_call_ctr = 0
237
238 suboptimal_example = 0
239 iteration_nr = 0
240 param_idx = 0
241 const_added_ctr = 0
242
243 featureVectors = zeros((run['numFeatures'],numExamples))
244
245 self.plog('Starting training...\n')
246 # the main training loop
247 while True:
248 if iteration_nr == iteration_steps:
249 break
250
251 for exampleIdx,example_key in enumerate(training_set.keys()):
252 print 'Current example %d' % example_key
253 try:
254 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
255 getData(training_set,example_key,run)
256 except SpliceSiteException:
257 continue
258
259 dna_len = len(dna)
260
261 if run['enable_quality_scores']:
262 quality = currentQualities[quality_index]
263 else:
264 quality = [40]*len(read)
265
266 if not run['enable_splice_signals']:
267 for idx,elem in enumerate(don_supp):
268 if elem != -inf:
269 don_supp[idx] = 0.0
270
271 for idx,elem in enumerate(acc_supp):
272 if elem != -inf:
273 acc_supp[idx] = 0.0
274
275 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
276 if run['mode'] == 'using_quality_scores':
277 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
278 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
279 quality, qualityPlifs,run)
280 else:
281 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
282
283 dna_calc = dna_calc.replace('-','')
284
285 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
286 # Calculate the weights
287 trueWeightDon, trueWeightAcc, trueWeightIntron =\
288 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
289 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
290
291 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
292 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
293 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
294 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
295
296 if run['mode'] == 'using_quality_scores':
297 totalQualityPenalties = param[-totalQualSP:]
298 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
299
300 # Calculate w'phi(x,y) the total score of the alignment
301 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
302
303 # The allWeights vector is supposed to store the weight parameter
304 # of the true alignment as well as the weight parameters of the
305 # num_path[exampleIdx] other alignments
306 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
307 allWeights[:,0] = trueWeight[:,0]
308
309 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
310 AlignmentScores[0] = trueAlignmentScore
311
312 ################## Calculate wrong alignment(s) ######################
313 # Compute donor, acceptor with penalty_lookup_new
314 # returns two double lists
315 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
316
317 ps = h.convert2SWIG()
318
319 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
320 newQualityPlifsFeatures, unneeded1, unneeded2 =\
321 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
322 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
323
324 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
325 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
326
327 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
328 # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without
329 # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order
330 # not to incorporate a true alignment in the
331 # constraints. To keep track of the true and false alignments we
332 # define an array true_map with a boolean indicating the
333 # equivalence to the true alignment for each decoded alignment.
334 true_map = [0]*(num_path[exampleIdx]+1)
335 true_map[0] = 1
336
337 for pathNr in range(num_path[exampleIdx]):
338 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
339 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
340 acc_supp)
341
342 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
343 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
344 # Gewichte in restliche Zeilen der Matrix speichern
345 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
346
347 hpen = mat(h.penalties).reshape(len(h.penalties),1)
348 dpen = mat(d.penalties).reshape(len(d.penalties),1)
349 apen = mat(a.penalties).reshape(len(a.penalties),1)
350 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
351
352 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
353
354 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
355
356 distinct_scores = False
357 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
358 distinct_scores = True
359
360 # Check wether scalar product + loss equals viterbi score
361 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
362 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
363 pdb.set_trace()
364
365 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
366 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
367
368 # if the pathNr-best alignment is very close to the true alignment consider it as true
369 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
370 true_map[pathNr+1] = 1
371
372 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
373 print "suboptimal_example %d\n" %exampleIdx
374 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
375 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
376
377 #pdb.set_trace()
378 suboptimal_example += 1
379 self.plog("suboptimal_example %d\n" %exampleIdx)
380
381 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
382 # this means that all n-best paths are to close to each other
383 # we have to extend the n-best search to a (n+1)-best
384 if len([elem for elem in true_map if elem == 1]) == len(true_map):
385 num_path[exampleIdx] = num_path[exampleIdx]+1
386
387 # Choose true and first false alignment for extending
388 firstFalseIdx = -1
389 for map_idx,elem in enumerate(true_map):
390 if elem == 0:
391 firstFalseIdx = map_idx
392 break
393
394 if False:
395 self.plog("Is considered as: %d\n" % true_map[1])
396
397 #result_len = currentAlignment.getResultLength()
398
399 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
400
401 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
402 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
403
404 # if there is at least one useful false alignment add the
405 # corresponding constraints to the optimization problem
406 if firstFalseIdx != -1:
407 firstFalseWeights = allWeights[:,firstFalseIdx]
408 differenceVector = trueWeight - firstFalseWeights
409 #pdb.set_trace()
410
411 const_added = solver.addConstraint(differenceVector, exampleIdx)
412 const_added_ctr += 1
413
414 # end of one example processing
415
416 # call solver every nth example //added constraint
417 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
418 objValue,w,self.slacks = solver.solve()
419 solver_call_ctr += 1
420
421 if solver_call_ctr == 5:
422 numConstPerRound = 200
423 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
424
425 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
426 self.noImprovementCtr += 1
427
428 if self.noImprovementCtr == numExamples+1:
429 break
430
431 self.oldObjValue = objValue
432 print "objValue is %f" % objValue
433
434 sum_xis = 0
435 for elem in self.slacks:
436 sum_xis += elem
437
438 print 'sum of slacks is %f'% sum_xis
439 self.plog('sum of slacks is %f\n'% sum_xis)
440
441 for i in range(len(param)):
442 param[i] = w[i]
443
444 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
445 param_idx += 1
446 [h,d,a,mmatrix,qualityPlifs] =\
447 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
448
449 ##############################################
450 # end of one iteration through all examples #
451 ##############################################
452
453 self.plog("suboptimal rounds %d\n" %suboptimal_example)
454
455 if self.noImprovementCtr == numExamples*2:
456 FinalizeTraining(param,'param_%d.pickle'%param_idx)
457
458 iteration_nr += 1
459
460 #
461 # end of optimization
462 #
463 FinalizeTraining(param,'param_%d.pickle'%param_idx)
464
465
466 def FinalizeTraining(self,vector,name):
467 self.plog("Training completed")
468 cPickle.dump(param,open(name,'w+'))
469 self.logfh.close()
470
471
472 ###############################################################################
473 #
474 # End of the code needed for training
475 #
476 # Begin of code for prediction
477 #
478 ###############################################################################
479
480
481 def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
482 """
483 Performing a prediction takes...
484 """
485 self.set_name = set_name
486
487 #full_working_path = os.path.join(run['alignment_dir'],run['name'])
488 full_working_path = self.run['result_dir']
489
490 print 'full_working_path is %s' % full_working_path
491
492 #assert not os.path.exists(full_working_path)
493 if not os.path.exists(full_working_path):
494 os.mkdir(full_working_path)
495
496 assert os.path.exists(full_working_path)
497
498 # ATTENTION: Changing working directory
499 os.chdir(full_working_path)
500
501 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
502
503 # number of prediction instances
504 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
505
506 # load dataset and fetch instances that shall be predicted
507 dataset = cPickle.load(open(dataset_fn))
508
509 prediction_set = {}
510 for key in prediction_keys:
511 prediction_set[key] = dataset[key]
512
513 # we do not need the full dataset anymore
514 del dataset
515
516 # Load parameter vector to predict with
517 param = cPickle.load(open(param_fn))
518
519 self.predict(prediction_set,param)
520
521
522 def predict(self,prediction_set,param):
523 """
524 This method...
525 """
526
527 if self.run['mode'] == 'normal':
528 self.use_quality_scores = False
529
530 elif self.run['mode'] == 'using_quality_scores':
531 self.use_quality_scores = True
532 else:
533 assert(False)
534
535 # Set the parameters such as limits/penalties for the Plifs
536 [h,d,a,mmatrix,qualityPlifs] =\
537 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
538
539 if not self.qpalma_debug_mode:
540 self.plog('Starting prediction...\n')
541
542 self.problem_ctr = 0
543
544 # where we store the predictions
545 allPredictions = []
546
547 # we take the first quality vector of the tuple of quality vectors
548 quality_index = 0
549
550 # beginning of the prediction loop
551 for example_key in prediction_set.keys():
552 print 'Current example %d' % example_key
553 for example in prediction_set[example_key]:
554
555 currentSeqInfo,read,currentQualities = example
556
557 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
558 currentSeqInfo
559
560 if not self.qpalma_debug_mode:
561 self.plog('Loading example id: %d...\n'% int(id))
562
563 if self.run['enable_quality_scores']:
564 quality = currentQualities[quality_index]
565 else:
566 quality = [40]*len(read)
567
568 try:
569 currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
570 except:
571 self.problem_ctr += 1
572 continue
573
574 if not self.run['enable_splice_signals']:
575 for idx,elem in enumerate(currentDon):
576 if elem != -inf:
577 currentDon[idx] = 0.0
578
579 for idx,elem in enumerate(currentAcc):
580 if elem != -inf:
581 currentAcc[idx] = 0.0
582
583 current_prediction = self.calc_alignment(currentDNASeq, read,\
584 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
585
586 current_prediction['id'] = id
587 current_prediction['chr'] = chromo
588 current_prediction['strand'] = strand
589 current_prediction['start_pos'] = genomicSeq_start
590
591 allPredictions.append(current_prediction)
592
593 if not self.qpalma_debug_mode:
594 self.FinalizePrediction(allPredictions)
595 else:
596 return allPredictions
597
598
599 def FinalizePrediction(self,allPredictions):
600 """ End of the prediction loop we save all predictions in a pickle file and exit """
601
602 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
603 self.plog('Prediction completed\n')
604 mes = 'Problem ctr %d' % self.problem_ctr
605 self.plog(mes+'\n')
606 self.logfh.close()
607
608
609 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
610 """
611 Given two sequences and the parameters we calculate on alignment
612 """
613
614 run = self.run
615 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
616
617 if '-' in read:
618 self.plog('found gap\n')
619 read = read.replace('-','')
620 assert len(read) == Conf.read_size
621
622 ps = h.convert2SWIG()
623
624 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
625 newQualityPlifsFeatures, dna_array, read_array =\
626 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
627
628 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
629
630 true_map = [0]*2
631 true_map[0] = 1
632 pathNr = 0
633
634 _newSpliceAlign = array.array('B',newSpliceAlign)
635 _newEstAlign = array.array('B',newEstAlign)
636
637 #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
638 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array)
639
640 dna_array = array.array('B',dna_array)
641 read_array = array.array('B',read_array)
642
643 newExons = self.calculatePredictedExons(newSpliceAlign)
644
645 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
646 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
647 'dna_array':dna_array, 'read_array':read_array }
648
649 return current_prediction
650
651
652 def calculatePredictedExons(self,SpliceAlign):
653 newExons = []
654 oldElem = -1
655 SpliceAlign.append(-1)
656 for pos,elem in enumerate(SpliceAlign):
657 if pos == 0:
658 oldElem = -1
659 else:
660 oldElem = SpliceAlign[pos-1]
661
662 if oldElem != 0 and elem == 0: # start of exon
663 newExons.append(pos)
664
665 if oldElem == 0 and elem != 0: # end of exon
666 newExons.append(pos)
667
668 return newExons