+ added framework code for training modus
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 import array
13 import cPickle
14 import os.path
15 import pdb
16 import sys
17
18 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
19
20 import numpy
21 from numpy.matlib import mat,zeros,ones,inf
22 from numpy.linalg import norm
23
24 #from qpalma.SIQP_CPX import SIQPSolver
25 #from qpalma.SIQP_CVXOPT import SIQPSolver
26
27 import QPalmaDP
28 import qpalma
29 from qpalma.computeSpliceWeights import *
30 from qpalma.set_param_palma import *
31 from qpalma.computeSpliceAlignWithQuality import *
32 from qpalma.TrainingParam import Param
33 from qpalma.Plif import Plf,compute_donacc
34
35 from Utils import pprint_alignment, get_alignment
36
37 class SpliceSiteException:
38 pass
39
40
41 def getData(training_set,exampleKey,run):
42 """ This function... """
43
44 currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
45 id,chr,strand,up_cut,down_cut = currentSeqInfo
46
47 est = original_est
48 est = "".join(est)
49 est = est.lower()
50 est = unbracket_est(est)
51 est = est.replace('-','')
52
53 assert len(est) == run['read_size'], pdb.set_trace()
54 est_len = len(est)
55
56 #original_est = OriginalEsts[exampleIdx]
57 original_est = "".join(original_est)
58 original_est = original_est.lower()
59
60 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
61 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
62
63 #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
64
65 original_exons = currentExons
66 exons = original_exons - (up_cut-1)
67 exons[0,0] -= 1
68 exons[1,0] -= 1
69
70 if exons.shape == (2,2):
71 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
72
73 donor_elem = dna[exons[0,1]:exons[0,1]+2]
74 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
75
76 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
77 print 'invalid donor in example %d'% exampleKey
78 raise SpliceSiteException
79
80 if not ( acceptor_elem == 'ag' ):
81 print 'invalid acceptor in example %d'% exampleKey
82 raise SpliceSiteException
83
84 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
85
86 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
87
88
89 class QPalma:
90 """
91 This class wraps the training and prediction functions for
92 the alignment.
93 """
94
95 def __init__(self,run,seqInfo,dmode=False):
96 self.ARGS = Param()
97 self.qpalma_debug_mode = dmode
98 self.run = run
99 self.seqInfo = seqInfo
100
101
102 def plog(self,string):
103 self.logfh.write(string)
104 self.logfh.flush()
105
106
107 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
108 """
109 Given the needed input this method calls the QPalma C module which
110 calculates a dynamic programming in order to obtain an alignment
111 """
112
113 dna_len = len(dna)
114 est_len = len(est)
115
116 prb = QPalmaDP.createDoubleArrayFromList(quality)
117 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
118
119 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
120 mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
121
122 d_len = len(donor)
123 donor = QPalmaDP.createDoubleArrayFromList(donor)
124 a_len = len(acceptor)
125 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
126
127 # Create the alignment object representing the interface to the C/C++ code.
128 currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
129 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
130 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
131 currentAlignment.myalign( current_num_path, dna, dna_len,\
132 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
133 acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
134 self.run['print_matrix'] )
135
136 if prediction_mode:
137 # part that is only needed for prediction
138 result_len = currentAlignment.getResultLength()
139 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
140 else:
141 dna_array = None
142 est_array = None
143
144 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
145 currentAlignment.getAlignmentResultsNew()
146
147 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
148 newQualityPlifsFeatures, dna_array, est_array
149
150
151 def init_train(self,training_set):
152 full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
153
154 #assert not os.path.exists(full_working_path)
155 if not os.path.exists(full_working_path):
156 os.mkdir(full_working_path)
157
158 assert os.path.exists(full_working_path)
159
160 # ATTENTION: Changing working directory
161 os.chdir(full_working_path)
162
163 self.logfh = open('_qpalma_train.log','w+')
164 cPickle.dump(self.run,open('run_obj.pickle','w+'))
165
166 self.plog("Settings are:\n")
167 self.plog("%s\n"%str(self.run))
168
169 if self.run['mode'] == 'normal':
170 self.use_quality_scores = False
171
172 elif self.run['mode'] == 'using_quality_scores':
173 self.use_quality_scores = True
174 else:
175 assert(False)
176
177
178 def setUpSolver(self):
179 # Initialize solver
180 self.plog('Initializing problem...\n')
181
182 try:
183 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
184 except:
185 self.plog('Got no license. Telling queue to reschedule job...\n')
186 sys.exit(99)
187
188 solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
189 solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
190
191 return solver
192
193
194 def train(self,training_set):
195 numExamples = len(training_set)
196 self.plog('Number of training examples: %d\n'% numExamples)
197
198 self.noImprovementCtr = 0
199 self.oldObjValue = 1e8
200
201 iteration_steps = self.run['iter_steps']
202 remove_duplicate_scores = self.run['remove_duplicate_scores']
203 print_matrix = self.run['print_matrix']
204 anzpath = self.run['anzpath']
205
206 # Initialize parameter vector
207 param = numpy.matlib.rand(run['numFeatures'],1)
208
209 lengthSP = self.run['numLengthSuppPoints']
210 donSP = self.run['numDonSuppPoints']
211 accSP = self.run['numAccSuppPoints']
212 mmatrixSP = self.run['matchmatrixRows']*run['matchmatrixCols']
213 numq = self.run['numQualSuppPoints']
214 totalQualSP = self.run['totalQualSuppPoints']
215
216 # no intron length model
217 if not self.run['enable_intron_length']:
218 param[:lengthSP] *= 0.0
219
220 # Set the parameters such as limits penalties for the Plifs
221 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
222
223 solver = self.setUpSolver()
224
225 # stores the number of alignments done for each example (best path, second-best path etc.)
226 num_path = [anzpath]*numExamples
227
228 # stores the gap for each example
229 gap = [0.0]*numExamples
230
231 currentPhi = zeros((run['numFeatures'],1))
232 totalQualityPenalties = zeros((totalQualSP,1))
233
234 numConstPerRound = run['numConstraintsPerRound']
235 solver_call_ctr = 0
236
237 suboptimal_example = 0
238 iteration_nr = 0
239 param_idx = 0
240 const_added_ctr = 0
241
242 featureVectors = zeros((run['numFeatures'],numExamples))
243
244 self.plog('Starting training...\n')
245 # the main training loop
246 while True:
247 if iteration_nr == iteration_steps:
248 break
249
250 for exampleIdx,example_key in enumerate(training_set.keys()):
251 print 'Current example %d' % example_key
252 try:
253 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
254 getData(training_set,example_key,run)
255 except SpliceSiteException:
256 continue
257
258 dna_len = len(dna)
259
260 if run['enable_quality_scores']:
261 quality = currentQualities[quality_index]
262 else:
263 quality = [40]*len(read)
264
265 if not run['enable_splice_signals']:
266 for idx,elem in enumerate(don_supp):
267 if elem != -inf:
268 don_supp[idx] = 0.0
269
270 for idx,elem in enumerate(acc_supp):
271 if elem != -inf:
272 acc_supp[idx] = 0.0
273
274 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
275 if run['mode'] == 'using_quality_scores':
276 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
277 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
278 quality, qualityPlifs,run)
279 else:
280 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
281
282 dna_calc = dna_calc.replace('-','')
283
284 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
285 # Calculate the weights
286 trueWeightDon, trueWeightAcc, trueWeightIntron =\
287 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
288 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
289
290 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
291 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
292 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
293 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
294
295 if run['mode'] == 'using_quality_scores':
296 totalQualityPenalties = param[-totalQualSP:]
297 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
298
299 # Calculate w'phi(x,y) the total score of the alignment
300 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
301
302 # The allWeights vector is supposed to store the weight parameter
303 # of the true alignment as well as the weight parameters of the
304 # num_path[exampleIdx] other alignments
305 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
306 allWeights[:,0] = trueWeight[:,0]
307
308 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
309 AlignmentScores[0] = trueAlignmentScore
310
311 ################## Calculate wrong alignment(s) ######################
312 # Compute donor, acceptor with penalty_lookup_new
313 # returns two double lists
314 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
315
316 ps = h.convert2SWIG()
317
318 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
319 newQualityPlifsFeatures, unneeded1, unneeded2 =\
320 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
321 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
322
323 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
324 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
325
326 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
327 # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without
328 # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order
329 # not to incorporate a true alignment in the
330 # constraints. To keep track of the true and false alignments we
331 # define an array true_map with a boolean indicating the
332 # equivalence to the true alignment for each decoded alignment.
333 true_map = [0]*(num_path[exampleIdx]+1)
334 true_map[0] = 1
335
336 for pathNr in range(num_path[exampleIdx]):
337 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
338 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
339 acc_supp)
340
341 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
342 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
343 # Gewichte in restliche Zeilen der Matrix speichern
344 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
345
346 hpen = mat(h.penalties).reshape(len(h.penalties),1)
347 dpen = mat(d.penalties).reshape(len(d.penalties),1)
348 apen = mat(a.penalties).reshape(len(a.penalties),1)
349 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
350
351 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
352
353 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
354
355 distinct_scores = False
356 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
357 distinct_scores = True
358
359 # Check wether scalar product + loss equals viterbi score
360 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
361 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
362 pdb.set_trace()
363
364 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
365 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
366
367 # if the pathNr-best alignment is very close to the true alignment consider it as true
368 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
369 true_map[pathNr+1] = 1
370
371 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
372 print "suboptimal_example %d\n" %exampleIdx
373 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
374 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
375
376 #pdb.set_trace()
377 suboptimal_example += 1
378 self.plog("suboptimal_example %d\n" %exampleIdx)
379
380 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
381 # this means that all n-best paths are to close to each other
382 # we have to extend the n-best search to a (n+1)-best
383 if len([elem for elem in true_map if elem == 1]) == len(true_map):
384 num_path[exampleIdx] = num_path[exampleIdx]+1
385
386 # Choose true and first false alignment for extending
387 firstFalseIdx = -1
388 for map_idx,elem in enumerate(true_map):
389 if elem == 0:
390 firstFalseIdx = map_idx
391 break
392
393 if False:
394 self.plog("Is considered as: %d\n" % true_map[1])
395
396 #result_len = currentAlignment.getResultLength()
397
398 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
399
400 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
401 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
402
403 # if there is at least one useful false alignment add the
404 # corresponding constraints to the optimization problem
405 if firstFalseIdx != -1:
406 firstFalseWeights = allWeights[:,firstFalseIdx]
407 differenceVector = trueWeight - firstFalseWeights
408 #pdb.set_trace()
409
410 const_added = solver.addConstraint(differenceVector, exampleIdx)
411 const_added_ctr += 1
412
413 # end of one example processing
414
415 # call solver every nth example //added constraint
416 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
417 objValue,w,self.slacks = solver.solve()
418 solver_call_ctr += 1
419
420 if solver_call_ctr == 5:
421 numConstPerRound = 200
422 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
423
424 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
425 self.noImprovementCtr += 1
426
427 if self.noImprovementCtr == numExamples+1:
428 break
429
430 self.oldObjValue = objValue
431 print "objValue is %f" % objValue
432
433 sum_xis = 0
434 for elem in self.slacks:
435 sum_xis += elem
436
437 print 'sum of slacks is %f'% sum_xis
438 self.plog('sum of slacks is %f\n'% sum_xis)
439
440 for i in range(len(param)):
441 param[i] = w[i]
442
443 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
444 param_idx += 1
445 [h,d,a,mmatrix,qualityPlifs] =\
446 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
447
448 ##############################################
449 # end of one iteration through all examples #
450 ##############################################
451
452 self.plog("suboptimal rounds %d\n" %suboptimal_example)
453
454 if self.noImprovementCtr == numExamples*2:
455 FinalizeTraining(param,'param_%d.pickle'%param_idx)
456
457 iteration_nr += 1
458
459 #
460 # end of optimization
461 #
462 FinalizeTraining(param,'param_%d.pickle'%param_idx)
463
464
465 def FinalizeTraining(self,vector,name):
466 self.plog("Training completed")
467 cPickle.dump(param,open(name,'w+'))
468 self.logfh.close()
469 sys.exit(0)
470
471
472 ###############################################################################
473 #
474 # End of the code needed for training
475 #
476 # Begin of code for prediction
477 #
478 ###############################################################################
479
480 def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
481 """
482 Performing a prediction takes...
483 """
484 self.set_name = set_name
485
486 #full_working_path = os.path.join(run['alignment_dir'],run['name'])
487 full_working_path = self.run['result_dir']
488
489 print 'full_working_path is %s' % full_working_path
490
491 #assert not os.path.exists(full_working_path)
492 if not os.path.exists(full_working_path):
493 os.mkdir(full_working_path)
494
495 assert os.path.exists(full_working_path)
496
497 # ATTENTION: Changing working directory
498 os.chdir(full_working_path)
499
500 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
501
502 # number of prediction instances
503 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
504
505 # load dataset and fetch instances that shall be predicted
506 dataset = cPickle.load(open(dataset_fn))
507
508 prediction_set = {}
509 for key in prediction_keys:
510 prediction_set[key] = dataset[key]
511
512 # we do not need the full dataset anymore
513 del dataset
514
515 # Load parameter vector to predict with
516 param = cPickle.load(open(param_fn))
517
518 self.predict(prediction_set,param)
519
520
521 def predict(self,prediction_set,param):
522 """
523 This method...
524 """
525
526 if self.run['mode'] == 'normal':
527 self.use_quality_scores = False
528
529 elif self.run['mode'] == 'using_quality_scores':
530 self.use_quality_scores = True
531 else:
532 assert(False)
533
534 # Set the parameters such as limits/penalties for the Plifs
535 [h,d,a,mmatrix,qualityPlifs] =\
536 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
537
538 if not self.qpalma_debug_mode:
539 self.plog('Starting prediction...\n')
540
541 self.problem_ctr = 0
542
543 # where we store the predictions
544 allPredictions = []
545
546 # we take the first quality vector of the tuple of quality vectors
547 quality_index = 0
548
549 # beginning of the prediction loop
550 for example_key in prediction_set.keys():
551 print 'Current example %d' % example_key
552 for example in prediction_set[example_key]:
553
554 currentSeqInfo,read,currentQualities = example
555
556 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
557 currentSeqInfo
558
559 if not self.qpalma_debug_mode:
560 self.plog('Loading example id: %d...\n'% int(id))
561
562 if self.run['enable_quality_scores']:
563 quality = currentQualities[quality_index]
564 else:
565 quality = [40]*len(read)
566
567 try:
568 currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
569 except:
570 self.problem_ctr += 1
571 continue
572
573 if not self.run['enable_splice_signals']:
574 for idx,elem in enumerate(currentDon):
575 if elem != -inf:
576 currentDon[idx] = 0.0
577
578 for idx,elem in enumerate(currentAcc):
579 if elem != -inf:
580 currentAcc[idx] = 0.0
581
582 current_prediction = self.calc_alignment(currentDNASeq, read,\
583 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
584
585 current_prediction['id'] = id
586 current_prediction['chr'] = chromo
587 current_prediction['strand'] = strand
588 current_prediction['start_pos'] = genomicSeq_start
589
590 allPredictions.append(current_prediction)
591
592 if not self.qpalma_debug_mode:
593 self.FinalizePrediction(allPredictions)
594 else:
595 return allPredictions
596
597
598 def FinalizePrediction(self,allPredictions):
599 """ End of the prediction loop we save all predictions in a pickle file and exit """
600
601 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
602 self.plog('Prediction completed\n')
603 mes = 'Problem ctr %d' % self.problem_ctr
604 self.plog(mes+'\n')
605 self.logfh.close()
606 sys.exit(0)
607
608
609 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
610 """
611 Given two sequences and the parameters we calculate on alignment
612 """
613
614 run = self.run
615 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
616
617 if '-' in read:
618 self.plog('found gap\n')
619 read = read.replace('-','')
620 assert len(read) == Conf.read_size
621
622 ps = h.convert2SWIG()
623
624 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
625 newQualityPlifsFeatures, dna_array, read_array =\
626 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
627
628 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
629
630 true_map = [0]*2
631 true_map[0] = 1
632 pathNr = 0
633
634 _newSpliceAlign = array.array('B',newSpliceAlign)
635 _newEstAlign = array.array('B',newEstAlign)
636
637 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
638
639 dna_array = array.array('B',dna_array)
640 read_array = array.array('B',read_array)
641
642 newExons = self.calculatePredictedExons(newSpliceAlign)
643
644 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
645 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
646 'dna_array':dna_array, 'read_array':read_array }
647
648 return current_prediction
649
650
651 def calculatePredictedExons(self,SpliceAlign):
652 newExons = []
653 oldElem = -1
654 SpliceAlign.append(-1)
655 for pos,elem in enumerate(SpliceAlign):
656 if pos == 0:
657 oldElem = -1
658 else:
659 oldElem = SpliceAlign[pos-1]
660
661 if oldElem != 0 and elem == 0: # start of exon
662 newExons.append(pos)
663
664 if oldElem == 0 and elem != 0: # end of exon
665 newExons.append(pos)
666
667 return newExons