+ got rid of some unneccessary code
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import array
18 import cPickle
19 import os.path
20 import pdb
21 import sys
22
23 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
24
25 import numpy
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy.linalg import norm
28
29 #from qpalma.SIQP_CPX import SIQPSolver
30 #from qpalma.SIQP_CVXOPT import SIQPSolver
31
32 import QPalmaDP
33 import qpalma
34 from qpalma.computeSpliceWeights import *
35 from qpalma.set_param_palma import *
36 from qpalma.computeSpliceAlignWithQuality import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf,compute_donacc
39
40 from Utils import pprint_alignment, get_alignment
41
42 class SpliceSiteException:
43 pass
44
45
46 def getData(training_set,exampleKey,run):
47 currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
48 id,chr,strand,up_cut,down_cut = currentSeqInfo
49
50 est = original_est
51 est = "".join(est)
52 est = est.lower()
53 est = unbracket_est(est)
54 est = est.replace('-','')
55
56 assert len(est) == run['read_size'], pdb.set_trace()
57 est_len = len(est)
58
59 #original_est = OriginalEsts[exampleIdx]
60 original_est = "".join(original_est)
61 original_est = original_est.lower()
62
63 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
64 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
65
66 original_exons = currentExons
67 exons = original_exons - (up_cut-1)
68 exons[0,0] -= 1
69 exons[1,0] -= 1
70
71 if exons.shape == (2,2):
72 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
73
74 donor_elem = dna[exons[0,1]:exons[0,1]+2]
75 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
76
77 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
78 print 'invalid donor in example %d'% exampleKey
79 raise SpliceSiteException
80
81 if not ( acceptor_elem == 'ag' ):
82 print 'invalid acceptor in example %d'% exampleKey
83 raise SpliceSiteException
84
85 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
86
87 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
88
89
90
91 class QPalma:
92 """
93 This class wraps the training and prediction functions for
94 the alignment.
95 """
96
97 def __init__(self,dmode=False):
98 self.ARGS = Param()
99 self.qpalma_debug_mode = dmode
100
101
102 def plog(self,string):
103 self.logfh.write(string)
104 self.logfh.flush()
105
106
107 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
108 """
109 Given the needed input this method calls the QPalma C module which
110 calculates a dynamic programming in order to obtain an alignment
111 """
112 run = self.run
113
114 dna_len = len(dna)
115 est_len = len(est)
116
117 prb = QPalmaDP.createDoubleArrayFromList(quality)
118 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
119
120 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
121 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
122
123 d_len = len(donor)
124 donor = QPalmaDP.createDoubleArrayFromList(donor)
125 a_len = len(acceptor)
126 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
127
128 # Create the alignment object representing the interface to the C/C++ code.
129 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
130 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
131 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
132 currentAlignment.myalign( current_num_path, dna, dna_len,\
133 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
134 acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
135 run['print_matrix'] )
136
137 if prediction_mode:
138 # part that is only needed for prediction
139 result_len = currentAlignment.getResultLength()
140 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
141 else:
142 dna_array = None
143 est_array = None
144
145 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
146 currentAlignment.getAlignmentResultsNew()
147
148 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
149 newQualityPlifsFeatures, dna_array, est_array
150
151
152 def train(self,run,training_set):
153 self.run = run
154
155 full_working_path = os.path.join(run['alignment_dir'],run['name'])
156
157 #assert not os.path.exists(full_working_path)
158 if not os.path.exists(full_working_path):
159 os.mkdir(full_working_path)
160
161 assert os.path.exists(full_working_path)
162
163 # ATTENTION: Changing working directory
164 os.chdir(full_working_path)
165
166 self.logfh = open('_qpalma_train.log','w+')
167 cPickle.dump(run,open('run_obj.pickle','w+'))
168
169 self.plog("Settings are:\n")
170 self.plog("%s\n"%str(run))
171
172 if self.run['mode'] == 'normal':
173 self.use_quality_scores = False
174
175 elif self.run['mode'] == 'using_quality_scores':
176 self.use_quality_scores = True
177 else:
178 assert(False)
179
180 numExamples = len(training_set)
181 self.plog('Number of training examples: %d\n'% numExamples)
182
183 self.noImprovementCtr = 0
184 self.oldObjValue = 1e8
185
186 iteration_steps = run['iter_steps']
187 remove_duplicate_scores = run['remove_duplicate_scores']
188 print_matrix = run['print_matrix']
189 anzpath = run['anzpath']
190
191 # Initialize parameter vector /
192 #param = Conf.fixedParam[:run['numFeatures']]
193 param = numpy.matlib.rand(run['numFeatures'],1)
194
195 lengthSP = run['numLengthSuppPoints']
196 donSP = run['numDonSuppPoints']
197 accSP = run['numAccSuppPoints']
198 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
199 numq = run['numQualSuppPoints']
200 totalQualSP = run['totalQualSuppPoints']
201
202 # no intron length model
203 if not run['enable_intron_length']:
204 param[:lengthSP] *= 0.0
205
206 # Set the parameters such as limits penalties for the Plifs
207 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
208
209 # Initialize solver
210 self.plog('Initializing problem...\n')
211
212 #try:
213 # solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
214 #except:
215 # self.plog('Got no license. Telling queue to reschedule job...\n')
216 # sys.exit(99)
217
218 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
219 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
220
221 # stores the number of alignments done for each example (best path, second-best path etc.)
222 num_path = [anzpath]*numExamples
223 # stores the gap for each example
224 gap = [0.0]*numExamples
225 #############################################################################################
226 # Training
227 #############################################################################################
228 self.plog('Starting training...\n')
229
230 currentPhi = zeros((run['numFeatures'],1))
231 totalQualityPenalties = zeros((totalQualSP,1))
232
233 numConstPerRound = run['numConstraintsPerRound']
234 solver_call_ctr = 0
235
236 suboptimal_example = 0
237 iteration_nr = 0
238 param_idx = 0
239 const_added_ctr = 0
240
241 featureVectors = zeros((run['numFeatures'],numExamples))
242
243 # the main training loop
244 while True:
245 if iteration_nr == iteration_steps:
246 break
247
248 for exampleIdx,example_key in enumerate(training_set.keys()):
249 print 'Current example %d' % example_key
250 try:
251 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
252 getData(training_set,example_key,run)
253 except SpliceSiteException:
254 continue
255
256 dna_len = len(dna)
257
258 if run['mode'] == 'normal':
259 quality = [40]*len(est)
260
261 if run['mode'] == 'using_quality_scores':
262 quality = currentQualities[0]
263
264 if not run['enable_quality_scores']:
265 quality = [40]*len(est)
266
267 if not run['enable_splice_signals']:
268 for idx,elem in enumerate(don_supp):
269 if elem != -inf:
270 don_supp[idx] = 0.0
271
272 for idx,elem in enumerate(acc_supp):
273 if elem != -inf:
274 acc_supp[idx] = 0.0
275
276 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
277 if run['mode'] == 'using_quality_scores':
278 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
279 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
280 quality, qualityPlifs,run)
281 else:
282 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
283
284 dna_calc = dna_calc.replace('-','')
285
286 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
287 # Calculate the weights
288 trueWeightDon, trueWeightAcc, trueWeightIntron =\
289 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
290 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
291
292 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
293 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
294 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
295 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
296
297 if run['mode'] == 'using_quality_scores':
298 totalQualityPenalties = param[-totalQualSP:]
299 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
300
301 # Calculate w'phi(x,y) the total score of the alignment
302 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
303
304 # The allWeights vector is supposed to store the weight parameter
305 # of the true alignment as well as the weight parameters of the
306 # num_path[exampleIdx] other alignments
307 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
308 allWeights[:,0] = trueWeight[:,0]
309
310 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
311 AlignmentScores[0] = trueAlignmentScore
312
313 ################## Calculate wrong alignment(s) ######################
314 # Compute donor, acceptor with penalty_lookup_new
315 # returns two double lists
316 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
317
318 #myalign wants the acceptor site on the g of the ag
319 #acceptor = acceptor[1:]
320 #acceptor.append(-inf)
321
322 #donor = [-inf] + donor[:-1]
323
324 ps = h.convert2SWIG()
325
326 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
327 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
328 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
329 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
330
331 newSpliceAlign = _newSpliceAlign
332 newEstAlign = _newEstAlign
333 newWeightMatch = _newWeightMatch
334 newDPScores = _newDPScores
335 newQualityPlifsFeatures = _newQualityPlifsFeatures
336
337 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
338 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
339
340 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
341 # Calculate weights of the respective alignments. Note that we are
342 # calculating n-best alignments without hamming loss, so we
343 # have to keep track which of the n-best alignments correspond to
344 # the true one in order not to incorporate a true alignment in the
345 # constraints. To keep track of the true and false alignments we
346 # define an array true_map with a boolean indicating the
347 # equivalence to the true alignment for each decoded alignment.
348 true_map = [0]*(num_path[exampleIdx]+1)
349 true_map[0] = 1
350
351 for pathNr in range(num_path[exampleIdx]):
352 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
353 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
354 acc_supp)
355
356 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
357 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
358 # Gewichte in restliche Zeilen der Matrix speichern
359 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
360
361 hpen = mat(h.penalties).reshape(len(h.penalties),1)
362 dpen = mat(d.penalties).reshape(len(d.penalties),1)
363 apen = mat(a.penalties).reshape(len(a.penalties),1)
364 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
365
366 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
367
368 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
369
370 distinct_scores = False
371 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
372 distinct_scores = True
373
374 # Check wether scalar product + loss equals viterbi score
375 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
376 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
377 pdb.set_trace()
378
379 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
380 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
381
382 # if the pathNr-best alignment is very close to the true alignment consider it as true
383 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
384 true_map[pathNr+1] = 1
385
386 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
387 print "suboptimal_example %d\n" %exampleIdx
388 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
389 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
390
391 #pdb.set_trace()
392 suboptimal_example += 1
393 self.plog("suboptimal_example %d\n" %exampleIdx)
394
395 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
396 # this means that all n-best paths are to close to each other
397 # we have to extend the n-best search to a (n+1)-best
398 if len([elem for elem in true_map if elem == 1]) == len(true_map):
399 num_path[exampleIdx] = num_path[exampleIdx]+1
400
401 # Choose true and first false alignment for extending
402 firstFalseIdx = -1
403 for map_idx,elem in enumerate(true_map):
404 if elem == 0:
405 firstFalseIdx = map_idx
406 break
407
408 if False:
409 self.plog("Is considered as: %d\n" % true_map[1])
410
411 result_len = currentAlignment.getResultLength()
412 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
413 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
414
415 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
416
417 dna_array = [0.0]*result_len
418 est_array = [0.0]*result_len
419
420 for r_idx in range(result_len):
421 dna_array[r_idx] = c_dna_array[r_idx]
422 est_array[r_idx] = c_est_array[r_idx]
423
424 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
425 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
426
427 #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
428 #self.plog(line1+'\n')
429 #self.plog(line2+'\n')
430 #self.plog(line3+'\n')
431
432 # if there is at least one useful false alignment add the
433 # corresponding constraints to the optimization problem
434 if firstFalseIdx != -1:
435 firstFalseWeights = allWeights[:,firstFalseIdx]
436 differenceVector = trueWeight - firstFalseWeights
437 #pdb.set_trace()
438
439 #print 'NOT ADDING ANY CONSTRAINTS'
440 const_added = solver.addConstraint(differenceVector, exampleIdx)
441
442 const_added_ctr += 1
443 #
444 # end of one example processing
445 #
446
447 # call solver every nth example //added constraint
448 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
449 objValue,w,self.slacks = solver.solve()
450 solver_call_ctr += 1
451
452 if solver_call_ctr == 5:
453 numConstPerRound = 200
454 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
455
456 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
457 self.noImprovementCtr += 1
458
459 if self.noImprovementCtr == numExamples+1:
460 break
461
462 self.oldObjValue = objValue
463 print "objValue is %f" % objValue
464
465 sum_xis = 0
466 for elem in self.slacks:
467 sum_xis += elem
468
469 print 'sum of slacks is %f'% sum_xis
470 self.plog('sum of slacks is %f\n'% sum_xis)
471
472 for i in range(len(param)):
473 param[i] = w[i]
474
475 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
476 param_idx += 1
477 [h,d,a,mmatrix,qualityPlifs] =\
478 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
479
480 #
481 # end of one iteration through all examples
482 #
483
484 self.plog("suboptimal rounds %d\n" %suboptimal_example)
485
486 if self.noImprovementCtr == numExamples*2:
487 break
488
489 iteration_nr += 1
490
491 #
492 # end of optimization
493 #
494 print 'Training completed'
495
496 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
497 self.logfh.close()
498
499 ###############################################################################
500 #
501 # End of the code needed for training
502 #
503 # Begin of code for prediction
504 #
505 ###############################################################################
506
507 def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name,):
508 """
509 Performing a prediction takes...
510 """
511 self.run = run
512 self.set_name = set_name
513
514 #full_working_path = os.path.join(run['alignment_dir'],run['name'])
515 full_working_path = run['result_dir']
516
517 print 'full_working_path is %s' % full_working_path
518
519 #assert not os.path.exists(full_working_path)
520 if not os.path.exists(full_working_path):
521 os.mkdir(full_working_path)
522
523 assert os.path.exists(full_working_path)
524
525 # ATTENTION: Changing working directory
526 os.chdir(full_working_path)
527
528 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
529
530 # number of prediction instances
531 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
532
533 # load dataset and fetch instances that shall be predicted
534 dataset = cPickle.load(open(dataset_fn))
535
536 prediction_set = {}
537 for key in prediction_keys:
538 prediction_set[key] = dataset[key]
539
540 # we do not need the full dataset anymore
541 del dataset
542
543 # Load parameter vector to predict with
544 param = cPickle.load(open(param_fn))
545
546 self.predict(run,prediction_set,param,seqInfo)
547
548
549 def predict(self,run,prediction_set,param,seqInfo):
550 """
551 This method...
552 """
553
554 self.run = run
555
556 if self.run['mode'] == 'normal':
557 self.use_quality_scores = False
558
559 elif self.run['mode'] == 'using_quality_scores':
560 self.use_quality_scores = True
561 else:
562 assert(False)
563
564 # Set the parameters such as limits/penalties for the Plifs
565 [h,d,a,mmatrix,qualityPlifs] =\
566 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
567
568 if not self.qpalma_debug_mode:
569 self.plog('Starting prediction...\n')
570
571 self.problem_ctr = 0
572
573 # where we store the predictions
574 allPredictions = []
575
576 # we take the first quality vector of the tuple of quality vectors
577 quality_index = 0
578
579 # beginning of the prediction loop
580 for example_key in prediction_set.keys():
581 print 'Current example %d' % example_key
582 for example in prediction_set[example_key]:
583
584 currentSeqInfo,read,currentQualities = example
585
586 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
587 currentSeqInfo
588
589 if not self.qpalma_debug_mode:
590 self.plog('Loading example id: %d...\n'% int(id))
591
592 if run['enable_quality_scores']:
593 quality = currentQualities[quality_index]
594 else:
595 quality = [40]*len(read)
596
597 try:
598 currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
599 except:
600 self.problem_ctr += 1
601 continue
602
603 if not run['enable_splice_signals']:
604 for idx,elem in enumerate(currentDon):
605 if elem != -inf:
606 currentDon[idx] = 0.0
607
608 for idx,elem in enumerate(currentAcc):
609 if elem != -inf:
610 currentAcc[idx] = 0.0
611
612 current_prediction = self.calc_alignment(currentDNASeq, read,\
613 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
614
615 current_prediction['id'] = id
616 current_prediction['chr'] = chromo
617 current_prediction['strand'] = strand
618 current_prediction['start_pos'] = genomicSeq_start
619
620 allPredictions.append(current_prediction)
621
622 if not self.qpalma_debug_mode:
623 self.finalizePrediction(allPredictions)
624 else:
625 return allPredictions
626
627
628 def finalizePrediction(self,allPredictions):
629 # end of the prediction loop we save all predictions in a pickle file and exit
630 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
631 print 'Prediction completed'
632 self.plog('Prediction completed\n')
633 mes = 'Problem ctr %d' % self.problem_ctr
634 print mes
635 self.plog(mes+'\n')
636 self.logfh.close()
637
638
639 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
640 """
641 Given two sequences and the parameters we calculate on alignment
642 """
643
644 run = self.run
645 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
646
647 if '-' in read:
648 self.plog('found gap\n')
649 read = read.replace('-','')
650 assert len(read) == Conf.read_size
651
652 ps = h.convert2SWIG()
653
654 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
655 newQualityPlifsFeatures, dna_array, read_array =\
656 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
657
658 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
659
660 true_map = [0]*2
661 true_map[0] = 1
662 pathNr = 0
663
664 _newSpliceAlign = array.array('B',newSpliceAlign)
665 _newEstAlign = array.array('B',newEstAlign)
666
667 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
668
669 dna_array = array.array('B',dna_array)
670 read_array = array.array('B',read_array)
671
672 newExons = self.calculatePredictedExons(newSpliceAlign)
673
674 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
675 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
676 'dna_array':dna_array, 'read_array':read_array }
677
678 return current_prediction
679
680
681 def calculatePredictedExons(self,SpliceAlign):
682 newExons = []
683 oldElem = -1
684 SpliceAlign.append(-1)
685 for pos,elem in enumerate(SpliceAlign):
686 if pos == 0:
687 oldElem = -1
688 else:
689 oldElem = SpliceAlign[pos-1]
690
691 if oldElem != 0 and elem == 0: # start of exon
692 newExons.append(pos)
693
694 if oldElem == 0 and elem != 0: # end of exon
695 newExons.append(pos)
696
697 return newExons