+ minor changes in the paths
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to Python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import array
18 import cPickle
19 import os.path
20 import pdb
21 import sys
22
23 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
24
25 import numpy
26 from numpy.matlib import mat,zeros,ones,inf
27 from numpy.linalg import norm
28
29 import QPalmaDP
30 import qpalma
31
32 #from qpalma.SIQP_CPX import SIQPSolver
33 #from qpalma.SIQP_CVXOPT import SIQPSolver
34
35 from qpalma.computeSpliceWeights import *
36 from qpalma.set_param_palma import *
37 from qpalma.computeSpliceAlignWithQuality import *
38 from qpalma.TrainingParam import Param
39 from qpalma.Plif import Plf,compute_donacc
40
41 import QPalmaConfiguration as Conf
42
43 # these two imports are needed for the load genomic resp. interval query
44 # functions
45 #from Genefinding import *
46 #from genome_utils import load_genomic
47 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
48
49 class SpliceSiteException:
50 pass
51
52
53 def getData(training_set,exampleKey,run):
54 currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
55 id,chr,strand,up_cut,down_cut = currentSeqInfo
56
57 est = original_est
58 est = "".join(est)
59 est = est.lower()
60 est = unbracket_est(est)
61 est = est.replace('-','')
62
63 assert len(est) == run['read_size'], pdb.set_trace()
64 est_len = len(est)
65
66 #original_est = OriginalEsts[exampleIdx]
67 original_est = "".join(original_est)
68 original_est = original_est.lower()
69
70 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
71 dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
72
73 # splice score is located at g of ag
74 #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
75 #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
76 #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
77 #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
78
79 original_exons = currentExons
80 exons = original_exons - (up_cut-1)
81 exons[0,0] -= 1
82 exons[1,0] -= 1
83
84 if exons.shape == (2,2):
85 fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
86
87 donor_elem = dna[exons[0,1]:exons[0,1]+2]
88 acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
89
90 if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
91 print 'invalid donor in example %d'% exampleKey
92 raise SpliceSiteException
93
94 if not ( acceptor_elem == 'ag' ):
95 print 'invalid acceptor in example %d'% exampleKey
96 raise SpliceSiteException
97
98 assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
99
100 return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
101
102
103
104 class QPalma:
105 """
106 This class wraps the training and prediction functions for
107 the alignment.
108 """
109
110 def __init__(self,dmode=False):
111 self.ARGS = Param()
112 self.qpalma_debug_mode = dmode
113
114
115 def plog(self,string):
116 self.logfh.write(string)
117 self.logfh.flush()
118
119
120 def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
121 """
122 Given the needed input this method calls the QPalma C module which
123 calculates a dynamic programming in order to obtain an alignment
124 """
125 run = self.run
126
127 dna_len = len(dna)
128 est_len = len(est)
129
130 prb = QPalmaDP.createDoubleArrayFromList(quality)
131 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
132
133 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
134 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
135
136 d_len = len(donor)
137 donor = QPalmaDP.createDoubleArrayFromList(donor)
138 a_len = len(acceptor)
139 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
140
141 # Create the alignment object representing the interface to the C/C++ code.
142 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
143 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
144 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
145 currentAlignment.myalign( current_num_path, dna, dna_len,\
146 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
147 acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
148 run['print_matrix'] )
149
150 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
151 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
152 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
153 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
154
155 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
156
157 if prediction_mode:
158 # part that is only needed for prediction
159 result_len = currentAlignment.getResultLength()
160 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
161 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
162
163 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
164
165 dna_array = [0.0]*result_len
166 est_array = [0.0]*result_len
167
168 for r_idx in range(result_len):
169 dna_array[r_idx] = c_dna_array[r_idx]
170 est_array[r_idx] = c_est_array[r_idx]
171
172 else:
173 dna_array = None
174 est_array = None
175
176 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
177 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
178
179 newSpliceAlign = zeros((current_num_path*dna_len,1))
180 newEstAlign = zeros((est_len*current_num_path,1))
181 newWeightMatch = zeros((current_num_path*mm_len,1))
182 newDPScores = zeros((current_num_path,1))
183 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
184
185 for i in range(dna_len*current_num_path):
186 newSpliceAlign[i] = c_SpliceAlign[i]
187
188 for i in range(est_len*current_num_path):
189 newEstAlign[i] = c_EstAlign[i]
190
191 for i in range(mm_len*current_num_path):
192 newWeightMatch[i] = c_WeightMatch[i]
193
194 for i in range(current_num_path):
195 newDPScores[i] = c_DPScores[i]
196
197 if self.use_quality_scores:
198 for i in range(run['totalQualSuppPoints']*current_num_path):
199 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
200
201 del c_SpliceAlign
202 del c_EstAlign
203 del c_WeightMatch
204 del c_DPScores
205 del c_qualityPlifsFeatures
206 del currentAlignment
207
208 return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
209 newQualityPlifsFeatures, dna_array, est_array
210
211
212 def train(self,run,training_set):
213 self.run = run
214
215 full_working_path = os.path.join(run['alignment_dir'],run['name'])
216
217 #assert not os.path.exists(full_working_path)
218 if not os.path.exists(full_working_path):
219 os.mkdir(full_working_path)
220
221 assert os.path.exists(full_working_path)
222
223 # ATTENTION: Changing working directory
224 os.chdir(full_working_path)
225
226 self.logfh = open('_qpalma_train.log','w+')
227 cPickle.dump(run,open('run_obj.pickle','w+'))
228
229 self.plog("Settings are:\n")
230 self.plog("%s\n"%str(run))
231
232 if self.run['mode'] == 'normal':
233 self.use_quality_scores = False
234
235 elif self.run['mode'] == 'using_quality_scores':
236 self.use_quality_scores = True
237 else:
238 assert(False)
239
240 numExamples = len(training_set)
241 self.plog('Number of training examples: %d\n'% numExamples)
242
243 self.noImprovementCtr = 0
244 self.oldObjValue = 1e8
245
246 iteration_steps = run['iter_steps']
247 remove_duplicate_scores = run['remove_duplicate_scores']
248 print_matrix = run['print_matrix']
249 anzpath = run['anzpath']
250
251 # Initialize parameter vector /
252 #param = Conf.fixedParam[:run['numFeatures']]
253 param = numpy.matlib.rand(run['numFeatures'],1)
254
255 lengthSP = run['numLengthSuppPoints']
256 donSP = run['numDonSuppPoints']
257 accSP = run['numAccSuppPoints']
258 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
259 numq = run['numQualSuppPoints']
260 totalQualSP = run['totalQualSuppPoints']
261
262 # no intron length model
263 if not run['enable_intron_length']:
264 param[:lengthSP] *= 0.0
265
266 # Set the parameters such as limits penalties for the Plifs
267 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
268
269 # Initialize solver
270 self.plog('Initializing problem...\n')
271
272 #try:
273 # solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
274 #except:
275 # self.plog('Got no license. Telling queue to reschedule job...\n')
276 # sys.exit(99)
277
278 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
279 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
280
281 # stores the number of alignments done for each example (best path, second-best path etc.)
282 num_path = [anzpath]*numExamples
283 # stores the gap for each example
284 gap = [0.0]*numExamples
285 #############################################################################################
286 # Training
287 #############################################################################################
288 self.plog('Starting training...\n')
289
290 currentPhi = zeros((run['numFeatures'],1))
291 totalQualityPenalties = zeros((totalQualSP,1))
292
293 numConstPerRound = run['numConstraintsPerRound']
294 solver_call_ctr = 0
295
296 suboptimal_example = 0
297 iteration_nr = 0
298 param_idx = 0
299 const_added_ctr = 0
300
301 featureVectors = zeros((run['numFeatures'],numExamples))
302
303 # the main training loop
304 while True:
305 if iteration_nr == iteration_steps:
306 break
307
308 for exampleIdx,example_key in enumerate(training_set.keys()):
309 print 'Current example %d' % example_key
310 try:
311 dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
312 getData(training_set,example_key,run)
313 except SpliceSiteException:
314 continue
315
316 dna_len = len(dna)
317
318 if run['mode'] == 'normal':
319 quality = [40]*len(est)
320
321 if run['mode'] == 'using_quality_scores':
322 quality = currentQualities[0]
323
324 if not run['enable_quality_scores']:
325 quality = [40]*len(est)
326
327 if not run['enable_splice_signals']:
328 for idx,elem in enumerate(don_supp):
329 if elem != -inf:
330 don_supp[idx] = 0.0
331
332 for idx,elem in enumerate(acc_supp):
333 if elem != -inf:
334 acc_supp[idx] = 0.0
335
336 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
337 if run['mode'] == 'using_quality_scores':
338 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
339 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
340 quality, qualityPlifs,run)
341 else:
342 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
343
344 dna_calc = dna_calc.replace('-','')
345
346 #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
347 # Calculate the weights
348 trueWeightDon, trueWeightAcc, trueWeightIntron =\
349 computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
350 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
351
352 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
353 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
354 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
355 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
356
357 if run['mode'] == 'using_quality_scores':
358 totalQualityPenalties = param[-totalQualSP:]
359 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
360
361 # Calculate w'phi(x,y) the total score of the alignment
362 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
363
364 # The allWeights vector is supposed to store the weight parameter
365 # of the true alignment as well as the weight parameters of the
366 # num_path[exampleIdx] other alignments
367 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
368 allWeights[:,0] = trueWeight[:,0]
369
370 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
371 AlignmentScores[0] = trueAlignmentScore
372
373 ################## Calculate wrong alignment(s) ######################
374 # Compute donor, acceptor with penalty_lookup_new
375 # returns two double lists
376 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
377
378 #myalign wants the acceptor site on the g of the ag
379 #acceptor = acceptor[1:]
380 #acceptor.append(-inf)
381
382 #donor = [-inf] + donor[:-1]
383
384 ps = h.convert2SWIG()
385
386 _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
387 _newQualityPlifsFeatures, unneeded1, unneeded2 =\
388 self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
389 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
390
391 newSpliceAlign = _newSpliceAlign
392 newEstAlign = _newEstAlign
393 newWeightMatch = _newWeightMatch
394 newDPScores = _newDPScores
395 newQualityPlifsFeatures = _newQualityPlifsFeatures
396
397 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
398 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
399
400 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
401 # Calculate weights of the respective alignments. Note that we are
402 # calculating n-best alignments without hamming loss, so we
403 # have to keep track which of the n-best alignments correspond to
404 # the true one in order not to incorporate a true alignment in the
405 # constraints. To keep track of the true and false alignments we
406 # define an array true_map with a boolean indicating the
407 # equivalence to the true alignment for each decoded alignment.
408 true_map = [0]*(num_path[exampleIdx]+1)
409 true_map[0] = 1
410
411 for pathNr in range(num_path[exampleIdx]):
412 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
413 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
414 acc_supp)
415
416 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
417 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
418 # Gewichte in restliche Zeilen der Matrix speichern
419 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
420
421 hpen = mat(h.penalties).reshape(len(h.penalties),1)
422 dpen = mat(d.penalties).reshape(len(d.penalties),1)
423 apen = mat(a.penalties).reshape(len(a.penalties),1)
424 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
425
426 featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
427
428 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
429
430 distinct_scores = False
431 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
432 distinct_scores = True
433
434 # Check wether scalar product + loss equals viterbi score
435 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
436 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
437 pdb.set_trace()
438
439 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
440 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
441
442 # if the pathNr-best alignment is very close to the true alignment consider it as true
443 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
444 true_map[pathNr+1] = 1
445
446 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
447 print "suboptimal_example %d\n" %exampleIdx
448 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
449 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
450
451 #pdb.set_trace()
452 suboptimal_example += 1
453 self.plog("suboptimal_example %d\n" %exampleIdx)
454
455 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
456 # this means that all n-best paths are to close to each other
457 # we have to extend the n-best search to a (n+1)-best
458 if len([elem for elem in true_map if elem == 1]) == len(true_map):
459 num_path[exampleIdx] = num_path[exampleIdx]+1
460
461 # Choose true and first false alignment for extending
462 firstFalseIdx = -1
463 for map_idx,elem in enumerate(true_map):
464 if elem == 0:
465 firstFalseIdx = map_idx
466 break
467
468 if False:
469 self.plog("Is considered as: %d\n" % true_map[1])
470
471 result_len = currentAlignment.getResultLength()
472 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
473 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
474
475 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
476
477 dna_array = [0.0]*result_len
478 est_array = [0.0]*result_len
479
480 for r_idx in range(result_len):
481 dna_array[r_idx] = c_dna_array[r_idx]
482 est_array[r_idx] = c_est_array[r_idx]
483
484 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
485 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
486
487 #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
488 #self.plog(line1+'\n')
489 #self.plog(line2+'\n')
490 #self.plog(line3+'\n')
491
492 # if there is at least one useful false alignment add the
493 # corresponding constraints to the optimization problem
494 if firstFalseIdx != -1:
495 firstFalseWeights = allWeights[:,firstFalseIdx]
496 differenceVector = trueWeight - firstFalseWeights
497 #pdb.set_trace()
498
499 #print 'NOT ADDING ANY CONSTRAINTS'
500 const_added = solver.addConstraint(differenceVector, exampleIdx)
501
502 const_added_ctr += 1
503 #
504 # end of one example processing
505 #
506
507 # call solver every nth example //added constraint
508 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
509 objValue,w,self.slacks = solver.solve()
510 solver_call_ctr += 1
511
512 if solver_call_ctr == 5:
513 numConstPerRound = 200
514 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
515
516 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
517 self.noImprovementCtr += 1
518
519 if self.noImprovementCtr == numExamples+1:
520 break
521
522 self.oldObjValue = objValue
523 print "objValue is %f" % objValue
524
525 sum_xis = 0
526 for elem in self.slacks:
527 sum_xis += elem
528
529 print 'sum of slacks is %f'% sum_xis
530 self.plog('sum of slacks is %f\n'% sum_xis)
531
532 for i in range(len(param)):
533 param[i] = w[i]
534
535 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
536 param_idx += 1
537 [h,d,a,mmatrix,qualityPlifs] =\
538 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
539
540 #
541 # end of one iteration through all examples
542 #
543
544 self.plog("suboptimal rounds %d\n" %suboptimal_example)
545
546 if self.noImprovementCtr == numExamples*2:
547 break
548
549 iteration_nr += 1
550
551 #
552 # end of optimization
553 #
554 print 'Training completed'
555
556 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
557 self.logfh.close()
558
559
560 ###############################################################################
561 #
562 # End of the code needed for training
563 #
564 # Begin of code for prediction
565 #
566 ###############################################################################
567
568 def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
569 """
570 Performing a prediction takes...
571 """
572 self.run = run
573 self.set_name = set_name
574
575 #full_working_path = os.path.join(run['alignment_dir'],run['name'])
576 full_working_path = run['result_dir']
577
578 print 'full_working_path is %s' % full_working_path
579
580 #assert not os.path.exists(full_working_path)
581 if not os.path.exists(full_working_path):
582 os.mkdir(full_working_path)
583
584 assert os.path.exists(full_working_path)
585
586 # ATTENTION: Changing working directory
587 os.chdir(full_working_path)
588
589 self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
590
591
592 # number of prediction instances
593 self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
594
595 # load dataset and fetch instances that shall be predicted
596 dataset = cPickle.load(open(dataset_fn))
597
598 prediction_set = {}
599 for key in prediction_keys:
600 prediction_set[key] = dataset[key]
601
602 # we do not need the full dataset anymore
603 del dataset
604
605 # Load parameter vector to predict with
606 param = cPickle.load(open(param_fn))
607
608 self.predict(run,prediction_set,param)
609
610
611 def predict(self,run,prediction_set,param):
612 """
613 This method...
614 """
615
616 self.run = run
617
618 if self.run['mode'] == 'normal':
619 self.use_quality_scores = False
620
621 elif self.run['mode'] == 'using_quality_scores':
622 self.use_quality_scores = True
623 else:
624 assert(False)
625
626 # Set the parameters such as limits/penalties for the Plifs
627 [h,d,a,mmatrix,qualityPlifs] =\
628 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
629
630 #############################################################################################
631 # Prediction
632 #############################################################################################
633
634 if not self.qpalma_debug_mode:
635 self.plog('Starting prediction...\n')
636
637 donSP = self.run['numDonSuppPoints']
638 accSP = self.run['numAccSuppPoints']
639 lengthSP = self.run['numLengthSuppPoints']
640 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
641 numq = self.run['numQualSuppPoints']
642 totalQualSP = self.run['totalQualSuppPoints']
643
644 totalQualityPenalties = zeros((totalQualSP,1))
645
646 problem_ctr = 0
647
648 # where we store the predictions
649 allPredictions = []
650
651 # we take the first quality vector of the tuple of quality vectors
652 quality_index = 0
653
654 # fetch the data needed
655 g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
656 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
657 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
658
659 g_fmt = 'chr%d.dna.flat'
660 s_fmt = 'contig_%d%s'
661
662 num_chromo = 6
663
664 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
665 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
666
667 # beginning of the prediction loop
668 for example_key in prediction_set.keys():
669 print 'Current example %d' % example_key
670
671 for example in prediction_set[example_key]:
672
673 currentSeqInfo,read,currentQualities = example
674
675 id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
676 currentSeqInfo
677
678 if not self.qpalma_debug_mode:
679 self.plog('Loading example id: %d...\n'% int(id))
680
681 if run['mode'] == 'normal':
682 quality = [40]*len(read)
683
684 if run['mode'] == 'using_quality_scores':
685 quality = currentQualities[quality_index]
686
687 if not run['enable_quality_scores']:
688 quality = [40]*len(read)
689
690 try:
691 currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
692 except:
693 problem_ctr += 1
694 continue
695
696 if not run['enable_splice_signals']:
697 for idx,elem in enumerate(currentDon):
698 if elem != -inf:
699 currentDon[idx] = 0.0
700
701 for idx,elem in enumerate(currentAcc):
702 if elem != -inf:
703 currentAcc[idx] = 0.0
704
705 current_prediction = self.calc_alignment(currentDNASeq, read,\
706 quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
707
708 current_prediction['id'] = id
709 current_prediction['chr'] = chromo
710 current_prediction['strand'] = strand
711 current_prediction['start_pos'] = genomicSeq_start
712
713 allPredictions.append(current_prediction)
714
715 if not self.qpalma_debug_mode:
716 self.finalizePrediction(allPredictions,problem_ctr)
717 else:
718 return allPredictions
719
720
721 def finalizePrediction(self,allPredictions,problem_ctr):
722 # end of the prediction loop we save all predictions in a pickle file and exit
723 cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
724 print 'Prediction completed'
725 self.plog('Prediction completed\n')
726 mes = 'Problem ctr %d' % problem_ctr
727 print mes
728 self.plog(mes+'\n')
729 self.logfh.close()
730
731
732 def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
733 """
734 Given two sequences and the parameters we calculate on alignment
735 """
736
737 run = self.run
738 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
739
740 dna = str(dna)
741 read = str(read)
742
743 if '-' in read:
744 self.plog('found gap\n')
745 read = read.replace('-','')
746 assert len(read) == Conf.read_size
747
748 dna_len = len(dna)
749 read_len = len(read)
750
751 ps = h.convert2SWIG()
752
753 newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
754 newQualityPlifsFeatures, dna_array, read_array =\
755 self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
756
757 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
758
759 # old code removed
760 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
761 newWeightMatch = newWeightMatch.reshape(1,mm_len)
762 true_map = [0]*2
763 true_map[0] = 1
764 pathNr = 0
765
766 _newSpliceAlign = array.array('B',newSpliceAlign.flatten().tolist()[0])
767 _newEstAlign = array.array('B',newEstAlign.flatten().tolist()[0])
768
769 alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
770
771 dna_array = array.array('B',dna_array)
772 read_array = array.array('B',read_array)
773
774 newExons = self.calculatePredictedExons(newSpliceAlign)
775
776 current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
777 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
778 'dna_array':dna_array, 'read_array':read_array }
779
780 return current_prediction
781
782
783 def calculatePredictedExons(self,SpliceAlign):
784 newExons = []
785 oldElem = -1
786 SpliceAlign = SpliceAlign.flatten().tolist()[0]
787 SpliceAlign.append(-1)
788 for pos,elem in enumerate(SpliceAlign):
789 if pos == 0:
790 oldElem = -1
791 else:
792 oldElem = SpliceAlign[pos-1]
793
794 if oldElem != 0 and elem == 0: # start of exon
795 newExons.append(pos)
796
797 if oldElem == 0 and elem != 0: # end of exon
798 newExons.append(pos)
799
800 return newExons