+ added code to allow for configuration
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import cPickle
19 import subprocess
20 import scipy.io
21 import pdb
22 import os.path
23
24 import numpy
25 from numpy.matlib import mat,zeros,ones,inf
26 from numpy.linalg import norm
27
28 import QPalmaDP
29 import qpalma
30 from qpalma.SIQP_CPX import SIQPSolver
31 from qpalma.DataProc import *
32 from qpalma.generateEvaluationData import *
33 from qpalma.computeSpliceWeights import *
34 from qpalma.set_param_palma import *
35 from qpalma.computeSpliceAlignWithQuality import *
36 from qpalma.penalty_lookup_new import *
37 from qpalma.compute_donacc import *
38 from qpalma.export_param import *
39 from qpalma.TrainingParam import Param
40 from qpalma.Plif import Plf
41
42 from qpalma.tools.splicesites import getDonAccScores
43 from qpalma.Configuration import *
44
45 from compile_dataset import compile_d
46
47 import palma.palma_utils as pu
48 from palma.output_formating import print_results
49
50 class para:
51 pass
52
53 class QPalma:
54 """
55 A training method for the QPalma project
56 """
57
58 def __init__(self,run):
59 self.ARGS = Param()
60
61 self.run = run
62
63 #data_filename = self.run['dataset_filename']
64 #Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
65
66 # Load the whole dataset
67 if self.run['mode'] == 'normal':
68 self.use_quality_scores = False
69
70 elif self.run['mode'] == 'using_quality_scores':
71 self.use_quality_scores = True
72 else:
73 assert(False)
74
75 #self.Sequences = Sequences
76 #self.Exons = Exons
77 #self.Ests = Ests
78 #self.OriginalEsts= OriginalEsts
79 #self.Qualities = Qualities
80 #self.SplitPos = SplitPos
81 #self.Donors = Donors
82 #self.Acceptors = Acceptors
83
84 #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
85 #print 'leaving constructor...'
86
87 def plog(self,string):
88 self.logfh.write(string)
89 self.logfh.flush()
90
91 def train(self):
92 run = self.run
93
94 full_working_path = os.path.join(run['experiment_path'],run['name'])
95 assert not os.path.exists(full_working_path)
96 os.mkdir(full_working_path)
97 assert os.path.exists(full_working_path)
98
99 # ATTENTION: Changing working directory
100 os.chdir(full_working_path)
101
102 self.logfh = open('_qpalma_train.log','w+')
103
104 self.plog("Settings are:\n")
105 self.plog("%s\n"%str(run))
106
107 data_filename = self.run['dataset_filename']
108 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
109
110 # Load the whole dataset
111 if self.run['mode'] == 'normal':
112 self.use_quality_scores = False
113
114 elif self.run['mode'] == 'using_quality_scores':
115 self.use_quality_scores = True
116 else:
117 assert(False)
118
119 self.Sequences = Sequences
120 self.Exons = Exons
121 self.Ests = Ests
122 self.OriginalEsts= OriginalEsts
123 self.Qualities = Qualities
124 self.SplitPos = SplitPos
125 self.Donors = Donors
126 self.Acceptors = Acceptors
127
128 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
129
130 beg = run['training_begin']
131 end = run['training_end']
132
133 Sequences = Sequences[beg:end]
134 Exons = Exons[beg:end]
135 Ests = Ests[beg:end]
136 OriginalEsts= OriginalEsts[beg:end]
137 Qualities = Qualities[beg:end]
138 Acceptors = Acceptors[beg:end]
139 Donors = Donors[beg:end]
140
141 # number of training instances
142 N = numExamples = len(Sequences)
143 assert len(Exons) == N and len(Ests) == N\
144 and len(Qualities) == N and len(Acceptors) == N\
145 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
146 self.plog('Number of training examples: %d\n'% numExamples)
147
148 self.noImprovementCtr = 0
149 self.oldObjValue = 1e8
150
151 iteration_steps = run['iter_steps']
152 remove_duplicate_scores = run['remove_duplicate_scores']
153 print_matrix = run['print_matrix']
154 anzpath = run['anzpath']
155
156 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
157 param = Conf.fixedParam[:run['numFeatures']]
158
159 lengthSP = run['numLengthSuppPoints']
160 donSP = run['numDonSuppPoints']
161 accSP = run['numAccSuppPoints']
162 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
163 numq = run['numQualSuppPoints']
164 totalQualSP = run['totalQualSuppPoints']
165
166 # no intron length model
167 if not run['enable_intron_length']:
168 param[:lengthSP] *= 0.0
169
170 # Set the parameters such as limits penalties for the Plifs
171 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
172
173 # Initialize solver
174 self.plog('Initializing problem...\n')
175 solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
176 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
177 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
178
179 # stores the number of alignments done for each example (best path, second-best path etc.)
180 num_path = [anzpath]*numExamples
181 # stores the gap for each example
182 gap = [0.0]*numExamples
183 #############################################################################################
184 # Training
185 #############################################################################################
186 self.plog('Starting training...\n')
187
188 currentPhi = zeros((run['numFeatures'],1))
189 totalQualityPenalties = zeros((totalQualSP,1))
190
191 numConstPerRound = run['numConstraintsPerRound']
192 solver_call_ctr = 0
193
194 suboptimal_example = 0
195 iteration_nr = 0
196 param_idx = 0
197 const_added_ctr = 0
198
199 # the main training loop
200 while True:
201 if iteration_nr == iteration_steps:
202 break
203
204 for exampleIdx in range(numExamples):
205 if (exampleIdx%100) == 0:
206 print 'Current example nr %d' % exampleIdx
207
208 dna = Sequences[exampleIdx]
209 est = Ests[exampleIdx]
210 est = "".join(est)
211 est = est.lower()
212 est = est.replace('-','')
213 original_est = OriginalEsts[exampleIdx]
214 original_est = "".join(original_est)
215 original_est = original_est.lower()
216
217 dna_len = len(dna)
218 est_len = len(est)
219
220 assert len(est) == run['read_size'], pdb.set_trace()
221
222 if run['mode'] == 'normal':
223 quality = [40]*len(est)
224
225 if run['mode'] == 'using_quality_scores':
226 quality = Qualities[exampleIdx]
227
228 if not run['enable_quality_scores']:
229 quality = [40]*len(est)
230
231 exons = Exons[exampleIdx]
232 don_supp = Donors[exampleIdx]
233 acc_supp = Acceptors[exampleIdx]
234
235 if not run['enable_splice_signals']:
236 for idx,elem in enumerate(don_supp):
237 if elem != -inf:
238 don_supp[idx] = 0.0
239
240 for idx,elem in enumerate(acc_supp):
241 if elem != -inf:
242 acc_supp[idx] = 0.0
243
244 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
245 if run['mode'] == 'using_quality_scores':
246 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
247 computeSpliceAlignWithQuality(dna, exons, est, original_est,\
248 quality, qualityPlifs,run)
249 else:
250 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
251
252 dna_calc = dna_calc.replace('-','')
253
254 # Calculate the weights
255 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
256 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
257
258 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
259 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
260 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
261 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
262
263 if run['mode'] == 'using_quality_scores':
264 totalQualityPenalties = param[-totalQualSP:]
265 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
266
267 # Calculate w'phi(x,y) the total score of the alignment
268 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
269
270 # The allWeights vector is supposed to store the weight parameter
271 # of the true alignment as well as the weight parameters of the
272 # num_path[exampleIdx] other alignments
273 allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
274 allWeights[:,0] = trueWeight[:,0]
275
276 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
277 AlignmentScores[0] = trueAlignmentScore
278 ################## Calculate wrong alignment(s) ######################
279 # Compute donor, acceptor with penalty_lookup_new
280 # returns two double lists
281 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
282
283 #myalign wants the acceptor site on the g of the ag
284 acceptor = acceptor[1:]
285 acceptor.append(-inf)
286
287 # check that splice site scores are at dna positions as expected by
288 # the dynamic programming component
289
290 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
291 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
292 # or dna[d_pos+1] == 't'), pdb.set_trace()
293 #
294 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
295 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
296
297 ps = h.convert2SWIG()
298
299 prb = QPalmaDP.createDoubleArrayFromList(quality)
300 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
301
302 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
303 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
304
305 d_len = len(donor)
306 donor = QPalmaDP.createDoubleArrayFromList(donor)
307 a_len = len(acceptor)
308 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
309
310 # Create the alignment object representing the interface to the C/C++ code.
311 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
312 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
313 #print 'Calling myalign...'
314 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
315 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
316 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
317 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
318 print_matrix)
319
320 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
321 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
322 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
323 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
324
325 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*num_path[exampleIdx]))
326
327 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
328 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
329
330 #print 'After calling getAlignmentResults...'
331
332 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
333 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
334 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
335 newDPScores = zeros((num_path[exampleIdx],1))
336 newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*num_path[exampleIdx],1))
337
338 #print 'newSpliceAlign'
339 for i in range(dna_len*num_path[exampleIdx]):
340 newSpliceAlign[i] = c_SpliceAlign[i]
341 # print '%f' % (spliceAlign[i])
342
343 #print 'newEstAlign'
344 for i in range(est_len*num_path[exampleIdx]):
345 newEstAlign[i] = c_EstAlign[i]
346 # print '%f' % (spliceAlign[i])
347
348 #print 'weightMatch'
349 for i in range(mm_len*num_path[exampleIdx]):
350 newWeightMatch[i] = c_WeightMatch[i]
351 # print '%f' % (weightMatch[i])
352
353 #print 'ViterbiScores'
354 for i in range(num_path[exampleIdx]):
355 newDPScores[i] = c_DPScores[i]
356
357 if self.use_quality_scores:
358 for i in range(run['totalQualSuppPoints']*num_path[exampleIdx]):
359 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
360
361 # equals palma up to here
362
363 #print "Calling destructors"
364 del c_SpliceAlign
365 del c_EstAlign
366 del c_WeightMatch
367 del c_DPScores
368 del c_qualityPlifsFeatures
369 #del currentAlignment
370
371 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
372 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
373
374 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
375 # Calculate weights of the respective alignments Note that we are
376 # calculating n-best alignments without hamming loss, so we
377 # have to keep track which of the n-best alignments correspond to
378 # the true one in order not to incorporate a true alignment in the
379 # constraints. To keep track of the true and false alignments we
380 # define an array true_map with a boolean indicating the
381 # equivalence to the true alignment for each decoded alignment.
382 true_map = [0]*(num_path[exampleIdx]+1)
383 true_map[0] = 1
384
385 for pathNr in range(num_path[exampleIdx]):
386 #print 'decodedWeights'
387 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
388 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
389 acc_supp)
390
391 decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
392 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
393 # Gewichte in restliche Zeilen der Matrix speichern
394 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
395
396 hpen = mat(h.penalties).reshape(len(h.penalties),1)
397 dpen = mat(d.penalties).reshape(len(d.penalties),1)
398 apen = mat(a.penalties).reshape(len(a.penalties),1)
399 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
400
401 #pdb.set_trace()
402
403 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
404
405 distinct_scores = False
406 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
407 distinct_scores = True
408
409 #pdb.set_trace()
410
411 # Check wether scalar product + loss equals viterbi score
412 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
413 self.plog("Scalar prod. + loss not equals Viterbi output!\n")
414 pdb.set_trace()
415
416 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
417 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
418
419 #pdb.set_trace()
420
421 # if the pathNr-best alignment is very close to the true alignment consider it as true
422 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
423 true_map[pathNr+1] = 1
424
425
426 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
427 print "suboptimal_example %d\n" %exampleIdx
428 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
429 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
430
431 #pdb.set_trace()
432 suboptimal_example += 1
433 self.plog("suboptimal_example %d\n" %exampleIdx)
434
435
436 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
437 # this means that all n-best paths are to close to each other
438 # we have to extend the n-best search to a (n+1)-best
439 if len([elem for elem in true_map if elem == 1]) == len(true_map):
440 num_path[exampleIdx] = num_path[exampleIdx]+1
441
442 # Choose true and first false alignment for extending
443 firstFalseIdx = -1
444 for map_idx,elem in enumerate(true_map):
445 if elem == 0:
446 firstFalseIdx = map_idx
447 break
448
449 if False:
450 self.plog("Is considered as: %d\n" % true_map[1])
451
452 result_len = currentAlignment.getResultLength()
453 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
454 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
455
456 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
457
458 dna_array = [0.0]*result_len
459 est_array = [0.0]*result_len
460
461 for r_idx in range(result_len):
462 dna_array[r_idx] = c_dna_array[r_idx]
463 est_array[r_idx] = c_est_array[r_idx]
464
465 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
466 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
467 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
468 self.plog(line1+'\n')
469 self.plog(line2+'\n')
470 self.plog(line3+'\n')
471
472 # if there is at least one useful false alignment add the
473 # corresponding constraints to the optimization problem
474 if firstFalseIdx != -1:
475 firstFalseWeights = allWeights[:,firstFalseIdx]
476 differenceVector = trueWeight - firstFalseWeights
477 #pdb.set_trace()
478
479 const_added = solver.addConstraint(differenceVector, exampleIdx)
480 const_added_ctr += 1
481 #
482 # end of one example processing
483 #
484
485 # call solver every nth example //added constraint
486 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
487 objValue,w,self.slacks = solver.solve()
488 solver_call_ctr += 1
489
490 if solver_call_ctr == 5:
491 numConstPerRound = 200
492 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
493
494 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
495 self.noImprovementCtr += 1
496
497 if self.noImprovementCtr == numExamples+1:
498 break
499
500 self.oldObjValue = objValue
501 print "objValue is %f" % objValue
502
503 sum_xis = 0
504 for elem in self.slacks:
505 sum_xis += elem
506
507 print 'sum of slacks is %f'% sum_xis
508 self.plog('sum of slacks is %f\n'% sum_xis)
509
510 for i in range(len(param)):
511 param[i] = w[i]
512
513 #pdb.set_trace()
514
515 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
516 param_idx += 1
517 [h,d,a,mmatrix,qualityPlifs] =\
518 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
519
520 #
521 # end of one iteration through all examples
522 #
523
524 self.plog("suboptimal rounds %d\n" %suboptimal_example)
525
526 if self.noImprovementCtr == numExamples*2:
527 break
528
529 iteration_nr += 1
530
531 #
532 # end of optimization
533 #
534 print 'Training completed'
535
536 pa = para()
537 pa.h = h
538 pa.d = d
539 pa.a = a
540 pa.mmatrix = mmatrix
541 pa.qualityPlifs = qualityPlifs
542
543 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
544 #cPickle.dump(pa,open('elegans.param','w+'))
545 self.logfh.close()
546
547 ###############################################################################
548 #
549 # End of the code needed for training
550 #
551 #
552 #
553 #
554 #
555 # Begin of code for prediction
556 #
557 #
558 ###############################################################################
559
560
561 def evaluate(self,param_filename,dataset):
562 run = self.run
563
564 data = dataset
565 self.Sequences = data['Sequences']
566 self.Acceptors = data['Acceptors']
567 self.Donors = data['Donors']
568 self.Exons = data['Exons']
569 self.Ests = data['Ests']
570 self.OriginalEsts= data['OriginalEsts']
571 self.Qualities = data['Qualities']
572 self.SplitPositions = data['SplitPositions']
573
574 self.FReads = data['FReads']
575
576 beg = run['prediction_begin']
577 end = run['prediction_end']
578
579 self.logfh = open('_qpalma_predict.log','w+')
580
581 # predict on training set
582 print '##### Prediction on the training set #####'
583 #self.predict(param_filename,0,beg)
584
585 # predict on test set
586 print '##### Prediction on the test set #####'
587 self.predict(param_filename,beg,end)
588
589 self.logfh.close()
590 #pdb.set_trace()
591
592 def predict(self,param_filename,beg,end):
593 run = self.run
594
595 if self.run['mode'] == 'normal':
596 self.use_quality_scores = False
597
598 elif self.run['mode'] == 'using_quality_scores':
599 self.use_quality_scores = True
600 else:
601 assert(False)
602
603 Sequences = self.Sequences[beg:end]
604 Exons = self.Exons[beg:end]
605 Ests = self.Ests[beg:end]
606 OriginalEsts = self.OriginalEsts[beg:end]
607 Qualities = self.Qualities[beg:end]
608 Acceptors = self.Acceptors[beg:end]
609 Donors = self.Donors[beg:end]
610 SplitPos = self.SplitPositions[beg:end]
611
612 FReads = self.FReads[beg:end]
613
614 print 'STARTING PREDICTION'
615
616 # number of training instances
617 N = numExamples = len(Sequences)
618 assert len(Exons) == N and len(Ests) == N\
619 and len(Qualities) == N and len(Acceptors) == N\
620 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
621 self.plog('Number of training examples: %d\n'% numExamples)
622
623 self.noImprovementCtr = 0
624 self.oldObjValue = 1e8
625
626 remove_duplicate_scores = self.run['remove_duplicate_scores']
627 print_matrix = self.run['print_matrix']
628 anzpath = self.run['anzpath']
629
630 param = cPickle.load(open(param_filename))
631
632 # Set the parameters such as limits penalties for the Plifs
633 #pdb.set_trace()
634 [h,d,a,mmatrix,qualityPlifs] =\
635 set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
636
637 #############################################################################################
638 # Prediction
639 #############################################################################################
640 self.plog('Starting prediction...\n')
641
642 donSP = self.run['numDonSuppPoints']
643 accSP = self.run['numAccSuppPoints']
644 lengthSP = self.run['numLengthSuppPoints']
645 mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
646 numq = self.run['numQualSuppPoints']
647 totalQualSP = self.run['totalQualSuppPoints']
648
649 currentPhi = zeros((self.run['numFeatures'],1))
650 totalQualityPenalties = zeros((totalQualSP,1))
651
652 exon1Begin = []
653 exon1End = []
654 exon2Begin = []
655 exon2End = []
656 allWrongExons = []
657
658 allRealCorrectSplitPos = []
659 allRealWrongSplitPos = []
660
661 allPredictions = []
662
663 for exampleIdx in range(numExamples):
664 dna = Sequences[exampleIdx]
665 est = Ests[exampleIdx]
666
667 fRead = FReads[exampleIdx]
668 est = fRead['seq']
669
670 new_est = ''
671 e = 0
672 while True:
673 if not e < len(est):
674 break
675
676 if est[e] == '[':
677 new_est += est[e+2]
678 e += 4
679 else:
680 new_est += est[e]
681 e += 1
682
683 #pdb.set_trace()
684 est = new_est
685 #pdb.set_trace()
686
687 print exampleIdx
688
689 est = "".join(est)
690 est = est.lower()
691 est = est.replace('-','')
692 #original_est = OriginalEsts[exampleIdx]
693 #original_est = "".join(original_est)
694 #original_est = original_est.lower()
695
696 original_est = est
697
698 currentSplitPos = SplitPos[exampleIdx]
699
700
701 #import re
702 #num_hits = len(re.findall('\[',original_est[:currentSplitPos]))
703
704 #if num_hits == 0:
705 # currentRealSplitPos = currentSplitPos
706 #else:
707 # currentRealSplitPos = currentSplitPos - (num_hits*3)
708 currentRealSplitPos = 0
709
710 #assert 0 < currentRealSplitPos < 36, pdb.set_trace()
711 currentRealRestPos = 36 - currentRealSplitPos
712 #assert 0 < currentRealRestPos < 36, pdb.set_trace()
713
714 if self.run['mode'] == 'normal':
715 quality = [40]*len(est)
716
717 if self.run['mode'] == 'using_quality_scores':
718 quality = Qualities[exampleIdx]
719
720 if not run['enable_quality_scores']:
721 quality = [40]*len(est)
722
723 quality = fRead['prb']
724
725 exons = zeros((2,2))
726
727 exons[0,0] = fRead['p_start']-fRead['p_start']
728 exons[0,1] = fRead['exon_stop']-fRead['p_start']
729 exons[1,0] = fRead['exon_start']-fRead['p_start']
730 exons[1,1] = fRead['p_stop']-fRead['p_start']
731
732 don_supp = Donors[exampleIdx]
733 acc_supp = Acceptors[exampleIdx]
734
735 if not run['enable_splice_signals']:
736
737 for idx,elem in enumerate(don_supp):
738 if elem != -inf:
739 don_supp[idx] = 0.0
740
741 for idx,elem in enumerate(acc_supp):
742 if elem != -inf:
743 acc_supp[idx] = 0.0
744
745 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
746 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
747 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
748 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
749
750 ################## Calculate wrong alignment(s) ######################
751
752 # Compute donor, acceptor with penalty_lookup_new
753 # returns two double lists
754 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
755
756 #myalign wants the acceptor site on the g of the ag
757 acceptor = acceptor[1:]
758 acceptor.append(-inf)
759
760 dna = str(dna)
761 est = str(est)
762 dna_len = len(dna)
763 est_len = len(est)
764
765 ps = h.convert2SWIG()
766
767 prb = QPalmaDP.createDoubleArrayFromList(quality)
768 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
769
770 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
771 mm_len = run['matchmatrixRows']*run['matchmatrixCols']
772
773 d_len = len(donor)
774 donor = QPalmaDP.createDoubleArrayFromList(donor)
775 a_len = len(acceptor)
776 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
777
778 # Create the alignment object representing the interface to the C/C++ code.
779 currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
780
781 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
782
783 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
784 currentAlignment.myalign(1, dna, dna_len,\
785 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
786 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
787 print_matrix)
788
789 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
790 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
791 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
792 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
793
794 result_len = currentAlignment.getResultLength()
795 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
796 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
797
798 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
799
800 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(self.run['totalQualSuppPoints']))
801
802 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
803 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
804
805 newSpliceAlign = zeros((dna_len,1))
806 newEstAlign = zeros((est_len,1))
807 newWeightMatch = zeros((mm_len,1))
808 newQualityPlifsFeatures = zeros((self.run['totalQualSuppPoints'],1))
809
810 for i in range(dna_len):
811 newSpliceAlign[i] = c_SpliceAlign[i]
812
813 for i in range(est_len):
814 newEstAlign[i] = c_EstAlign[i]
815
816 for i in range(mm_len):
817 newWeightMatch[i] = c_WeightMatch[i]
818
819 newDPScores = c_DPScores[0]
820
821 for i in range(run['totalQualSuppPoints']):
822 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
823
824 del c_SpliceAlign
825 del c_EstAlign
826 del c_WeightMatch
827 del c_DPScores
828 del c_qualityPlifsFeatures
829 del currentAlignment
830
831 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
832 newWeightMatch = newWeightMatch.reshape(1,mm_len)
833 true_map = [0]*2
834 true_map[0] = 1
835 pathNr = 0
836
837 dna_array = [0.0]*result_len
838 est_array = [0.0]*result_len
839
840 for r_idx in range(result_len):
841 dna_array[r_idx] = c_dna_array[r_idx]
842 est_array[r_idx] = c_est_array[r_idx]
843
844 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
845 _newEstAlign = newEstAlign.flatten().tolist()[0]
846
847 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
848 self.plog(line1+'\n')
849 self.plog(line2+'\n')
850 self.plog(line3+'\n')
851
852 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
853
854 one_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,
855 'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
856 'predExons':newExons, 'trueExons':exons,\
857 'dna':dna, 'est':est, 'exampleIdx':exampleIdx,\
858 'realRestPos':currentRealRestPos,\
859 'DPScores':newDPScores,
860 'fRead':fRead
861 }
862
863 allPredictions.append(one_prediction)
864
865 if e1_b_off != None:
866 exon1Begin.append(e1_b_off)
867 exon1End.append(e1_e_off)
868 exon2Begin.append(e2_b_off)
869 exon2End.append(e2_e_off)
870 allRealCorrectSplitPos.append(currentRealRestPos)
871 else:
872 allWrongExons.append((newExons,exons))
873 allRealWrongSplitPos.append(currentRealRestPos)
874
875 logfile = self.logfh
876
877 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
878 print >> logfile, 'example %d correct' % exampleIdx
879 else:
880 print >> logfile, 'example %d wrong' % exampleIdx
881
882 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
883 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
884
885 correct_rest2error = {}
886 off_rest2error = {}
887 wrong_rest2error = {}
888
889 for e_idx in range(len(exon1Begin)):
890 e1b = exon1Begin[e_idx]
891 e1e = exon1End[e_idx]
892
893 e2b = exon2Begin[e_idx]
894 e2e = exon2End[e_idx]
895
896 current_elem = allRealCorrectSplitPos[e_idx]
897
898 if e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0:
899 try:
900 correct_rest2error[current_elem] += 1
901 except:
902 correct_rest2error[current_elem] = 1
903
904 if not (e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0):
905 try:
906 off_rest2error[current_elem] += 1
907 except:
908 off_rest2error[current_elem] = 1
909
910
911 for e_idx in range(len(allWrongExons)):
912
913 current_elem = allRealWrongSplitPos[e_idx]
914
915 try:
916 wrong_rest2error[current_elem] += 1
917 except:
918 wrong_rest2error[current_elem] = 1
919
920 all_pos_correct = 0
921 for idx in range(len(exon1Begin)):
922 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
923 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
924 all_pos_correct += 1
925
926 logfile = self.logfh
927 print >> logfile, 'Total num. of examples: %d' % numExamples
928 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
929 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
930 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
931 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
932
933 print 'Total num. of examples: %d' % numExamples
934 print 'Number of total correct examples: %d' % all_pos_correct
935 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
936 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
937 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
938
939 #print 'total correct'
940 #print correct_rest2error
941
942 #import pylab
943 #
944 #y_values = []
945 #for overlap in range(36):
946 # result = 0
947 # try:
948 # cv = off_rest2error[overlap]
949 # except:
950 # cv = 0
951
952 # result += cv
953
954 # try:
955 # cv = wrong_rest2error[overlap]
956 # except:
957 # cv = 0
958
959 # result += cv
960
961 # y_values.append(result/(1.0*numExamples))
962
963 #pylab.bar(range(36),y_values,width=0.2,align='center')
964 #pylab.xticks( numpy.arange(0,36,1) )
965 #pylab.savefig('%s_error_vs_overlap.eps'%run['name'])
966 #pylab.clf()
967 #raw_input()
968 cPickle.dump(allPredictions,open('%s_allPredictions'%run['name'],'w+'))
969
970 #print 'some positions off'
971 #print off_rest2error
972 #import pylab
973 #pylab.bar(off_rest2error.keys(),off_rest2error.values(),width=0.2,align='center')
974 ##pylab.show()
975 #pylab.xticks( numpy.arange(0,max(off_rest2error.keys())+1,1) )
976 #pylab.savefig('%s_offset.eps'%run['name'])
977 #pylab.clf()
978 ##raw_input()
979
980 #print 'total wrong'
981 #print wrong_rest2error
982 #import pylab
983 #pylab.bar(wrong_rest2error.keys(),wrong_rest2error.values(),width=0.2,align='center')
984 #pylab.xticks( numpy.arange(0,max(wrong_rest2error.keys())+1,1) )
985 #pylab.savefig('%s_wrong.eps'%run['name'])
986 #pylab.clf()
987
988 print 'Prediction completed'
989
990 def evaluatePositions(self,eBegin,eEnd):
991
992 eBegin_pos = [elem for elem in eBegin if elem == 0]
993 eBegin_neg = [elem for elem in eBegin if elem != 0]
994 eEnd_pos = [elem for elem in eEnd if elem == 0]
995 eEnd_neg = [elem for elem in eEnd if elem != 0]
996
997 mean_eBegin_neg = 0
998 for idx in range(len(eBegin_neg)):
999 mean_eBegin_neg += eBegin_neg[idx]
1000
1001 try:
1002 mean_eBegin_neg /= 1.0*len(eBegin_neg)
1003 except:
1004 mean_eBegin_neg = -1
1005
1006 mean_eEnd_neg = 0
1007 for idx in range(len(eEnd_neg)):
1008 mean_eEnd_neg += eEnd_neg[idx]
1009
1010 try:
1011 mean_eEnd_neg /= 1.0*len(eEnd_neg)
1012 except:
1013 mean_eEnd_neg = -1
1014
1015 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
1016
1017 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
1018 newExons = []
1019 oldElem = -1
1020 SpliceAlign = SpliceAlign.flatten().tolist()[0]
1021 SpliceAlign.append(-1)
1022 for pos,elem in enumerate(SpliceAlign):
1023 if pos == 0:
1024 oldElem = -1
1025 else:
1026 oldElem = SpliceAlign[pos-1]
1027
1028 if oldElem != 0 and elem == 0: # start of exon
1029 newExons.append(pos)
1030
1031 if oldElem == 0 and elem != 0: # end of exon
1032 newExons.append(pos)
1033
1034 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
1035
1036 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
1037 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
1038 self.plog(("read: " + str(est)+ "\n"))
1039
1040 if len(newExons) == 4:
1041 e1_begin,e1_end = newExons[0],newExons[1]
1042 e2_begin,e2_end = newExons[2],newExons[3]
1043 else:
1044 return None,None,None,None,newExons
1045
1046 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
1047 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
1048
1049 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
1050 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
1051
1052 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
1053
1054 def fetch_genome_info(ginfo_filename):
1055 if not os.path.exists(ginfo_filename):
1056 cmd = ['']*4
1057 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
1058 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
1059 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
1060 cmd[3] = 'save genome_info.mat genome_info'
1061 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
1062
1063 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1064 out,err = obj.communicate()
1065 assert err == '', 'An error occured!\n%s'%err
1066
1067 ginfo = scipy.io.loadmat('genome_info.mat')
1068 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
1069 return ginfo['genome_info']
1070
1071 else:
1072 return cPickle.load(open(ginfo_filename))
1073
1074
1075
1076 def calcStat(Acceptor,Donor,Exons):
1077 maxAcc = -100
1078 minAcc = 100
1079 maxDon = -100
1080 minDon = 100
1081
1082 acc_vec_pos = []
1083 acc_vec_neg = []
1084 don_vec_pos = []
1085 don_vec_neg = []
1086
1087 for jdx in range(len(Acceptors)):
1088 currentExons = Exons[jdx]
1089 currentAcceptor = Acceptors[jdx]
1090 currentAcceptor = currentAcceptor[1:]
1091 currentAcceptor.append(-inf)
1092 currentDonor = Donors[jdx]
1093
1094 for idx,elem in enumerate(currentAcceptor):
1095 if idx == (int(currentExons[1,0])-1): # acceptor site
1096 acc_vec_pos.append(elem)
1097 else:
1098 acc_vec_neg.append(elem)
1099
1100 for idx,elem in enumerate(currentDonor):
1101 if idx == (int(currentExons[0,1])): # donor site
1102 don_vec_pos.append(elem)
1103 else:
1104 don_vec_neg.append(elem)
1105
1106 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
1107 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
1108 don_pos = [elem for elem in don_vec_pos if elem != -inf]
1109 don_neg = [elem for elem in don_vec_neg if elem != -inf]
1110
1111 #pdb.set_trace()
1112
1113 for idx in range(len(Sequences)):
1114 acc = [elem for elem in Acceptors[idx] if elem != -inf]
1115 maxAcc = max(max(acc),maxAcc)
1116 minAcc = min(min(acc),minAcc)
1117
1118 don = [elem for elem in Donors[idx] if elem != -inf]
1119 maxDon = max(max(don),maxDon)
1120 minDon = min(min(don),minDon)
1121
1122 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
1123 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
1124 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
1125
1126 t_strand = '+'
1127 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
1128 #(gap_char, 5 nucleotides, intron_char)
1129 comparison_char = '| ' #first: match_char, second: intron_char
1130
1131 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
1132 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
1133
1134 first_identity = None
1135 last_identity = None
1136 for i in range(len(comparison)):
1137 if comparison[i] == '|' and first_identity is None:
1138 first_identity = i
1139 if comparison[-i] == '|' and last_identity is None:
1140 last_identity = len(comparison) - i - 1
1141
1142 try:
1143 for idx in range(len(dna_array)):
1144 dna_array[idx] = translation[int(dna_array[idx])]
1145 est_array[idx] = translation[int(est_array[idx])]
1146 except:
1147 pdb.set_trace()
1148
1149 line1 = "".join(dna_array)
1150 line2 = comparison
1151 line3 = "".join(est_array)
1152
1153 return line1,line2,line3
1154
1155 def calc_info(acc,don,exons,qualities):
1156 min_score = 100
1157 max_score = -100
1158
1159 for elem in acc:
1160 for score in elem:
1161 if score != -inf:
1162 min_score = min(min_score,score)
1163 max_score = max(max_score,score)
1164
1165 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
1166
1167 min_score = 100
1168 max_score = -100
1169
1170 for elem in don:
1171 for score in elem:
1172 if score != -inf:
1173 min_score = min(min_score,score)
1174 max_score = max(max_score,score)
1175
1176 print 'Donor max/min: %f / %f' % (max_score,min_score)
1177
1178 min_score = 10000
1179 max_score = 0
1180 mean_intron_len = 0
1181
1182 for elem in exons:
1183 intron_len = int(elem[1,0] - elem[0,1])
1184 mean_intron_len += intron_len
1185 min_score = min(min_score,intron_len)
1186 max_score = max(max_score,intron_len)
1187
1188 mean_intron_len /= 1.0*len(exons)
1189 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
1190
1191 min_score = 10000
1192 max_score = 0
1193 mean_quality = 0
1194
1195 for qVec in qualities:
1196 for elem in qVec:
1197 min_score = min(min_score,elem)
1198 max_score = max(max_score,elem)
1199
1200 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
1201
1202
1203 ###########################
1204 # A simple command line
1205 # interface
1206 ###########################
1207
1208 if __name__ == '__main__':
1209 qpalma = QPalma()
1210
1211 mode = sys.argv[1]
1212
1213 if len(sys.argv) == 2 and mode == 'train':
1214 qpalma.train()
1215
1216 elif len(sys.argv) == 3 and mode == 'predict':
1217 param_filename = sys.argv[2]
1218 assert os.path.exists(param_filename)
1219 qpalma.evaluate(param_filename)
1220 else:
1221 print 'You have to choose between training or prediction mode:'
1222 print 'python qpalma. py (train|predict) <param_file>'