afc32d18cf0eb71af272b2dfc5d0854149f067cf
[qpalma.git] / qpalma / qpalma_main.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import array
10 import cPickle
11 import os.path
12 import pdb
13 import sys
14
15 import numpy
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 #from qpalma.SIQP_CPX import SIQPSolver
20 from qpalma.SIQP_CVXOPT import SIQPSolver
21
22 import QPalmaDP
23 import qpalma
24 from qpalma.computeSpliceWeights import *
25 from qpalma.set_param_palma import *
26 from qpalma.computeSpliceAlignWithQuality import *
27 from qpalma.TrainingParam import Param
28 from qpalma.Plif import Plf,compute_donacc
29
30 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
31 from qpalma.utils import pprint_alignment, get_alignment
32
33 jp = os.path.join
34 dotProd = lambda x,y: (x.T * y)[0,0]
35
36
37 class SpliceSiteException:
38 pass
39
40
41 def preprocessExample(training_set,exampleKey,seqInfo,settings):
42 """
43 This function...
44 """
45
46 currentSeqInfo,originalRead,currentQualities,currentExons = training_set[exampleKey]
47 id,chr,strand,up_cut,down_cut = currentSeqInfo
48
49 read = unbracket_seq(originalRead)
50 read = read.replace('-','')
51
52 dna, acc_supp, don_supp = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut)
53
54 exons = currentExons - up_cut
55
56 if exons.shape == (2,2):
57 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
58 print fetched_dna_subseq
59
60 donor_elem = dna[exons[0,1]:exons[0,1]+2]
61 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
62
63 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
64 print 'invalid donor in example %d'% exampleKey
65 raise SpliceSiteException
66
67 if not ( acceptor_elem == 'ag' ):
68 print 'invalid acceptor in example %d'% exampleKey
69 raise SpliceSiteException
70
71 assert len(fetched_dna_subseq) == len(read), pdb.set_trace()
72
73 return dna,read,acc_supp,don_supp,exons,originalRead,currentQualities
74
75
76 def performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode,settings):
77 """
78 Given the needed input this method calls the QPalma C module which
79 calculates a dynamic programming in order to obtain an alignment
80 """
81
82 prb = QPalmaDP.createDoubleArrayFromList(quality)
83 chastity = QPalmaDP.createDoubleArrayFromList([.0]*len(read))
84
85 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
86 mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
87
88 don_len = len(donor)
89 donor = QPalmaDP.createDoubleArrayFromList(donor)
90 acc_len = len(acceptor)
91 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
92
93 # Create the alignment object representing the interface to the C/C++ code.
94 currentAlignment = QPalmaDP.Alignment(settings['numQualPlifs'],settings['numQualSuppPoints'], settings['enable_quality_scores'])
95
96 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
97
98 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
99 currentAlignment.myalign( current_num_path, dna, len(dna),\
100 read, len(read), prb, chastity, ps, matchmatrix, mm_len, donor, don_len,\
101 acceptor, acc_len, c_qualityPlifs, settings['remove_duplicate_scores'],\
102 settings['print_matrix'] )
103
104 if prediction_mode:
105 # part that is only needed for prediction
106 result_len = currentAlignment.getResultLength()
107 dna_array,read_array = currentAlignment.getAlignmentArraysNew()
108 else:
109 dna_array = None
110 read_array = None
111
112 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
113 currentAlignment.getAlignmentResultsNew()
114
115 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
116 newQualityPlifsFeatures, dna_array, read_array
117
118
119
120 class QPalma:
121 """
122 This class wraps the training and prediction functions for
123 the alignment.
124 """
125
126 def __init__(self,seqInfo,dmode=False):
127 self.ARGS = Param()
128 self.qpalma_debug_mode = dmode
129 self.seqInfo = seqInfo
130 self.logfh = None
131
132
133 def plog(self,string):
134 if self.logfh != None:
135 self.logfh.write(string)
136 self.logfh.flush()
137 else:
138 print string
139
140
141 def init_training(self,dataset_fn,training_keys,settings,set_name):
142 full_working_path = jp(settings['training_dir'],run_name)
143
144 #assert not os.path.exists(full_working_path)
145 if not os.path.exists(full_working_path):
146 os.mkdir(full_working_path)
147
148 assert os.path.exists(full_working_path)
149
150 # ATTENTION: Changing working directory
151 os.chdir(full_working_path)
152
153 self.logfh = open('_qpalma_train.log','w+')
154
155 training_set = cPickle.load(open(dataset_fn))
156
157 self.train(training_set,settings,set_name)
158
159
160 def train(self,training_set,settings,set_name):
161 """
162 The mainloop for training.
163 """
164
165 numExamples = len(training_set)
166 self.plog('Number of training examples: %d\n'% numExamples)
167
168 self.noImprovementCtr = 0
169 self.oldObjValue = 1e8
170
171 remove_duplicate_scores = settings['remove_duplicate_scores']
172 print_matrix = settings['print_matrix']
173
174 lengthSP = settings['numLengthSuppPoints']
175 donSP = settings['numDonSuppPoints']
176 accSP = settings['numAccSuppPoints']
177 mmatrixSP = settings['matchmatrixRows']*settings['matchmatrixCols']
178 numq = settings['numQualSuppPoints']
179 totalQualSP = settings['totalQualSuppPoints']
180
181 # calculate the total number of features
182 numFeatures = settings['numFeatures']
183
184 # Initialize parameter vector
185 param = numpy.matlib.rand(numFeatures,1)
186
187 # we take the first quality vector of the tuple of quality vectors
188 quality_index = 0
189
190 # no intron length model
191 if not settings['enable_intron_length']:
192 param[:lengthSP] *= 0.0
193
194 # Set the parameters such as limits penalties for the Plifs
195 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
196
197 # Initialize solver
198 self.plog('Initializing problem...\n')
199
200 try:
201 solver = SIQPSolver(numFeatures,numExamples,settings['C'],self.logfh,settings)
202 except:
203 self.plog('Got no license. Telling queue to reschedule job...\n')
204 self.plog(sys.exc_info())
205 sys.exit(99)
206
207 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
208 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
209
210 # stores the number of alignments done for each example (best path, second-best path etc.)
211 num_path = [settings['anzpath']]*numExamples
212
213 currentPhi = zeros((numFeatures,1))
214 totalQualityPenalties = zeros((totalQualSP,1))
215
216 numConstPerRound = settings['numConstraintsPerRound']
217 solver_call_ctr = 0
218
219 suboptimal_example = 0
220 iteration_nr = 0
221 param_idx = 0
222 const_added_ctr = 0
223
224 featureVectors = zeros((numFeatures,numExamples))
225
226 self.plog('Starting training...\n')
227 # the main training loop
228 while True:
229 if iteration_nr == settings['iter_steps']:
230 break
231
232 #print 'Got %d examples' % len(training_set.keys())
233 for exampleIdx,example_key in enumerate(training_set.keys()):
234 print 'Current example %d' % example_key
235
236 try:
237 dna,read,acc_supp,don_supp,exons,original_read,currentQualities =\
238 preprocessExample(training_set,example_key,self.seqInfo,settings)
239 except SpliceSiteException:
240 continue
241
242 if settings['enable_quality_scores']:
243 quality = currentQualities[quality_index]
244 else:
245 quality = [40]*len(read)
246
247 if not settings['enable_splice_scores']:
248 for idx,elem in enumerate(don_supp):
249 if elem != -inf:
250 don_supp[idx] = 0.0
251
252 for idx,elem in enumerate(acc_supp):
253 if elem != -inf:
254 acc_supp[idx] = 0.0
255
256 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
257 if settings['enable_quality_scores']:
258 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
259 computeSpliceAlignWithQuality(dna, exons, read, original_read,\
260 quality, qualityPlifs,settings)
261 else:
262 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
263
264 dna_calc = dna_calc.replace('-','')
265
266 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
267 # Calculate the weights
268 trueWeightDon, trueWeightAcc, trueWeightIntron =\
269 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
270 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
271
272 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
273 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
274 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
275 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
276
277 if settings['enable_quality_scores']:
278 totalQualityPenalties = param[-totalQualSP:]
279 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
280
281 # Calculate w'phi(x,y) the total score of the alignment
282 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
283
284 # The allWeights vector is supposed to store the weight parameter
285 # of the true alignment as well as the weight parameters of the
286 # num_path[exampleIdx] other alignments
287 allWeights = zeros((numFeatures,num_path[exampleIdx]+1))
288 allWeights[:,0] = trueWeight[:,0]
289
290 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
291 AlignmentScores[0] = trueAlignmentScore
292
293 ################## Calculate wrong alignment(s) ######################
294 # Compute donor, acceptor with penalty_lookup_new
295 # returns two double lists
296 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
297
298 ps = h.convert2SWIG()
299
300 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
301 newQualityPlifsFeatures, unneeded1, unneeded2 =\
302 performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False,settings)
303 mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
304
305 # check for correct reshaping !!
306 newSpliceAlign = numpy.mat(newSpliceAlign).reshape(num_path[exampleIdx],len(dna))
307 newWeightMatch = numpy.mat(newWeightMatch).reshape(num_path[exampleIdx],mm_len)
308
309 newQualityPlifsFeatures = numpy.mat(newQualityPlifsFeatures).reshape(num_path[exampleIdx],settings['totalQualSuppPoints'])
310 # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without
311 # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order
312 # not to incorporate a true alignment in the
313 # constraints. To keep track of the true and false alignments we
314 # define an array true_map with a boolean indicating the
315 # equivalence to the true alignment for each decoded alignment.
316 true_map = [0]*(num_path[exampleIdx]+1)
317 true_map[0] = 1
318
319 for pathNr in range(num_path[exampleIdx]):
320 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
321 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
322 acc_supp)
323
324 decodedQualityFeatures = zeros((settings['totalQualSuppPoints'],1))
325 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
326 # Gewichte in restliche Zeilen der Matrix speichern
327 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
328
329 hpen = mat(h.penalties).reshape(len(h.penalties),1)
330 dpen = mat(d.penalties).reshape(len(d.penalties),1)
331 apen = mat(a.penalties).reshape(len(a.penalties),1)
332 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
333
334 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
335
336 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
337
338 distinct_scores = False
339 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
340 distinct_scores = True
341
342 # Check wether scalar product + loss equals viterbi score
343 if not math.fabs(newDPScores[pathNr] - AlignmentScores[pathNr+1]) <= 1e-5:
344 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
345 pdb.set_trace()
346
347 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
348 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr],AlignmentScores[pathNr+1]))
349
350 # if the pathNr-best alignment is very close to the true alignment consider it as true
351 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
352 true_map[pathNr+1] = 1
353
354 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
355 print "suboptimal_example %d\n" %exampleIdx
356 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
357 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
358
359 #pdb.set_trace()
360 suboptimal_example += 1
361 self.plog("suboptimal_example %d\n" %exampleIdx)
362
363 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
364 # this means that all n-best paths are to close to each other
365 # we have to extend the n-best search to a (n+1)-best
366 if len([elem for elem in true_map if elem == 1]) == len(true_map):
367 num_path[exampleIdx] = num_path[exampleIdx]+1
368
369 # Choose true and first false alignment for extending
370 firstFalseIdx = -1
371 for map_idx,elem in enumerate(true_map):
372 if elem == 0:
373 firstFalseIdx = map_idx
374 break
375
376 if False:
377 self.plog("Is considered as: %d\n" % true_map[1])
378
379 #result_len = currentAlignment.getResultLength()
380
381 dna_array,est_array = currentAlignment.getAlignmentArraysNew()
382
383 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
384 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
385
386 # if there is at least one useful false alignment add the
387 # corresponding constraints to the optimization problem
388 if firstFalseIdx != -1:
389 firstFalseWeights = allWeights[:,firstFalseIdx]
390 differenceVector = trueWeight - firstFalseWeights
391 #pdb.set_trace()
392
393 const_added = solver.addConstraint(differenceVector, exampleIdx)
394 const_added_ctr += 1
395
396 # end of one example processing
397
398 # call solver every nth example / added constraint
399 #if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
400 objValue,w,self.slacks = solver.solve()
401 solver_call_ctr += 1
402
403 if solver_call_ctr == 5:
404 numConstPerRound = 200
405 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
406
407 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
408 self.noImprovementCtr += 1
409
410 if self.noImprovementCtr == numExamples+1:
411 break
412
413 self.oldObjValue = objValue
414 print "objValue is %f" % objValue
415
416 sum_xis = 0
417 for elem in self.slacks:
418 sum_xis += elem
419
420 print 'sum of slacks is %f'% sum_xis
421 self.plog('sum of slacks is %f\n'% sum_xis)
422
423 for i in range(len(param)):
424 param[i] = w[i]
425
426 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
427 param_idx += 1
428 [h,d,a,mmatrix,qualityPlifs] =\
429 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
430
431 # end of one iteration through all examples
432 self.plog("suboptimal rounds %d\n" %suboptimal_example)
433
434 if self.noImprovementCtr == numExamples*2:
435 self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
436
437 iteration_nr += 1
438
439 #
440 # end of optimization
441 #
442 self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
443
444
445 def FinalizeTraining(self,param,name):
446 self.plog("Training completed")
447 cPickle.dump(param,open(name,'w+'))
448 if self.logfh != None:
449 self.logfh.close()
450
451
452 ###############################################################################
453 #
454 # End of the code needed for training
455 #
456 # Begin of code for prediction
457 #
458 ###############################################################################
459
460
461 def init_prediction(self,dataset_fn,prediction_keys,settings,set_name):
462 """
463 Performing a prediction takes...
464 """
465 self.set_name = set_name
466
467 full_working_path = settings['prediction_dir']
468
469 print 'full_working_path is %s' % full_working_path
470
471 #assert not os.path.exists(full_working_path)
472 if not os.path.exists(full_working_path):
473 os.mkdir(full_working_path)
474
475 assert os.path.exists(full_working_path)
476
477 # ATTENTION: Changing working directory
478 os.chdir(full_working_path)
479
480 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
481
482 # number of prediction instances
483 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
484
485 # load dataset and fetch instances that shall be predicted
486 dataset = cPickle.load(open(dataset_fn))
487
488 prediction_set = {}
489 for key in prediction_keys:
490 prediction_set[key] = dataset[key]
491
492 # we do not need the full dataset anymore
493 del dataset
494
495 self.predict(prediction_set,settings)
496
497
498 def predict(self,prediction_set,settings):
499 """
500 This method...
501 """
502
503 # Load parameter vector to predict with
504 param = cPickle.load(open(settings['prediction_param_fn']))
505
506 # Set the parameters such as limits/penalties for the Plifs
507 [h,d,a,mmatrix,qualityPlifs] =\
508 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
509
510 if not self.qpalma_debug_mode:
511 self.plog('Starting prediction...\n')
512
513 self.problem_ctr = 0
514
515 # where we store the predictions
516 allPredictions = []
517
518 # we take the first quality vector of the tuple of quality vectors
519 quality_index = 0
520
521 # beginning of the prediction loop
522 for example_key in prediction_set.keys():
523 print 'Current example %d' % example_key
524 for example in prediction_set[example_key]:
525
526 currentSeqInfo,read,currentQualities = example
527
528 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
529 currentSeqInfo
530
531 if not self.qpalma_debug_mode:
532 self.plog('Loading example id: %d...\n'% int(id))
533
534 if settings['enable_quality_scores']:
535 quality = currentQualities[quality_index]
536 else:
537 quality = [40]*len(read)
538
539 try:
540 currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
541 except:
542 self.problem_ctr += 1
543 print sys.exc_info()
544 continue
545
546 if not settings['enable_splice_scores']:
547 for idx,elem in enumerate(currentDon):
548 if elem != -inf:
549 currentDon[idx] = 0.0
550
551 for idx,elem in enumerate(currentAcc):
552 if elem != -inf:
553 currentAcc[idx] = 0.0
554
555 current_prediction = self.calc_alignment(currentDNASeq, read,\
556 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs,settings)
557
558 current_prediction['id'] = id
559 current_prediction['chr'] = chromo
560 current_prediction['strand'] = strand
561 current_prediction['start_pos'] = genomicSeq_start
562
563 allPredictions.append(current_prediction)
564
565 if not self.qpalma_debug_mode:
566 self.FinalizePrediction(allPredictions)
567 else:
568 return allPredictions
569
570
571 def FinalizePrediction(self,allPredictions):
572 """ End of the prediction loop we save all predictions in a pickle file and exit """
573
574 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
575 self.plog('Prediction completed\n')
576 mes = 'Problem ctr %d' % self.problem_ctr
577 self.plog(mes+'\n')
578 self.logfh.close()
579
580
581 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs,settings):
582 """
583 Given two sequences and the parameters we calculate on alignment
584 """
585
586 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
587
588 if '-' in read:
589 self.plog('found gap\n')
590 read = read.replace('-','')
591 assert len(read) == Conf.read_size
592
593 ps = h.convert2SWIG()
594
595 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
596 newQualityPlifsFeatures, dna_array, read_array =\
597 performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True,settings)
598
599 mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
600
601 true_map = [0]*2
602 true_map[0] = 1
603 pathNr = 0
604
605 _newSpliceAlign = array.array('B',newSpliceAlign)
606 _newEstAlign = array.array('B',newEstAlign)
607
608 #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
609 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array)
610
611 dna_array = array.array('B',dna_array)
612 read_array = array.array('B',read_array)
613
614 newExons = self.calculatePredictedExons(newSpliceAlign)
615
616 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
617 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
618 'dna_array':dna_array, 'read_array':read_array }
619
620 return current_prediction
621
622
623 def calculatePredictedExons(self,SpliceAlign):
624 newExons = []
625 oldElem = -1
626 SpliceAlign.append(-1)
627 for pos,elem in enumerate(SpliceAlign):
628 if pos == 0:
629 oldElem = -1
630 else:
631 oldElem = SpliceAlign[pos-1]
632
633 if oldElem != 0 and elem == 0: # start of exon
634 newExons.append(pos)
635
636 if oldElem == 0 and elem != 0: # end of exon
637 newExons.append(pos)
638
639 return newExons