+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 from compile_dataset import compile_d
44
45 import palma.palma_utils as pu
46 from palma.output_formating import print_results
47
48 class para:
49 pass
50
51 class QPalma:
52 """
53 A training method for the QPalma project
54 """
55
56 def __init__(self):
57 self.ARGS = Param()
58
59 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
60 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
61
62 data_filename = Conf.dataset_fn
63 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
64
65 # Load the whole dataset
66 if Conf.mode == 'normal':
67 self.use_quality_scores = False
68
69 elif Conf.mode == 'using_quality_scores':
70 self.use_quality_scores = True
71 else:
72 assert(False)
73
74 self.Sequences = Sequences
75 self.Exons = Exons
76 self.Ests = Ests
77 self.OriginalEsts= OriginalEsts
78 self.Qualities = Qualities
79 self.SplitPos = SplitPos
80 self.Donors = Donors
81 self.Acceptors = Acceptors
82
83 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
84 pdb.set_trace()
85 print 'leaving constructor...'
86
87 def plog(self,string):
88 self.logfh.write(string)
89 self.logfh.flush()
90
91 def train(self):
92 self.logfh = open('_qpalma_train.log','w+')
93
94 beg = Conf.training_begin
95 end = Conf.training_end
96
97 Sequences = self.Sequences[beg:end]
98 Exons = self.Exons[beg:end]
99 Ests = self.Ests[beg:end]
100 OriginalEsts= self.OriginalEsts[beg:end]
101 Qualities = self.Qualities[beg:end]
102 Acceptors = self.Acceptors[beg:end]
103 Donors = self.Donors[beg:end]
104
105 # number of training instances
106 N = numExamples = len(Sequences)
107 assert len(Exons) == N and len(Ests) == N\
108 and len(Qualities) == N and len(Acceptors) == N\
109 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
110 self.plog('Number of training examples: %d\n'% numExamples)
111
112 self.noImprovementCtr = 0
113 self.oldObjValue = 1e8
114
115 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
116 remove_duplicate_scores = Conf.remove_duplicate_scores
117 print_matrix = Conf.print_matrix
118 anzpath = Conf.anzpath
119
120 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
121 param = Conf.fixedParam
122
123 lengthSP = Conf.numLengthSuppPoints
124 donSP = Conf.numDonSuppPoints
125 accSP = Conf.numAccSuppPoints
126 mmatrixSP = Conf.sizeMatchmatrix[0]\
127 *Conf.sizeMatchmatrix[1]
128 numq = Conf.numQualSuppPoints
129 totalQualSP = Conf.totalQualSuppPoints
130
131 #i1 = range(0,lengthSP)
132 #i2 = range(lengthSP,lengthSP+donSP)
133 #i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
134 #i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
135 #i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
136
137 #offset =lengthSP+donSP+accSP+mmatrixSP
138 #intervals = [[offset+2,offset+3]] #i5,\ #i2,#i3,#i4,#i5]
139 #intervals = []
140
141 #param[offset+2] = 10.0
142 #param[offset+3] = 10.0
143
144 #zero_out(param,intervals)
145
146 #pdb.set_trace()
147
148 # Set the parameters such as limits penalties for the Plifs
149 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
150
151 # Initialize solver
152 if Conf.USE_OPT:
153 self.plog('Initializing problem...\n')
154 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
155 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
156 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
157
158 # stores the number of alignments done for each example (best path, second-best path etc.)
159 num_path = [anzpath]*numExamples
160 # stores the gap for each example
161 gap = [0.0]*numExamples
162 #############################################################################################
163 # Training
164 #############################################################################################
165 self.plog('Starting training...\n')
166
167 currentPhi = zeros((Conf.numFeatures,1))
168 totalQualityPenalties = zeros((totalQualSP,1))
169
170 numConstPerRound = Conf.numConstraintsPerRound
171 solver_call_ctr = 0
172
173 suboptimal_example = 0
174 iteration_nr = 0
175 param_idx = 0
176 const_added_ctr = 0
177
178 # the main training loop
179 while True:
180 if iteration_nr == iteration_steps:
181 break
182
183 for exampleIdx in range(numExamples):
184 if (exampleIdx%100) == 0:
185 print 'Current example nr %d' % exampleIdx
186
187 dna = Sequences[exampleIdx]
188 est = Ests[exampleIdx]
189 est = "".join(est)
190 est = est.lower()
191 est = est.replace('-','')
192 original_est = OriginalEsts[exampleIdx]
193 original_est = "".join(original_est)
194 original_est = original_est.lower()
195
196 dna_len = len(dna)
197 est_len = len(est)
198
199 assert len(est) == Conf.read_size, pdb.set_trace()
200
201 if Conf.mode == 'normal':
202 quality = [40]*len(est)
203
204 if Conf.mode == 'using_quality_scores':
205 quality = Qualities[exampleIdx]
206
207 # if Conf.disable_quality_scores
208 quality = [40]*len(est)
209
210 exons = Exons[exampleIdx]
211 don_supp = Donors[exampleIdx]
212 acc_supp = Acceptors[exampleIdx]
213
214 # if Conf.disable_splice_signals
215 #don_supp = [0.0]*len(don_supp)
216 #acc_supp = [0.0]*len(acc_supp)
217
218 for idx,elem in enumerate(don_supp):
219 if elem != -inf:
220 don_supp[idx] = 0.0
221
222 for idx,elem in enumerate(acc_supp):
223 if elem != -inf:
224 acc_supp[idx] = 0.0
225
226 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
227 if Conf.mode == 'using_quality_scores':
228 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
229 computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
230 else:
231 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
232
233 dna_calc = dna_calc.replace('-','')
234
235 # Calculate the weights
236 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
237 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
238
239 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
240 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
241 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
242 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
243
244 if Conf.mode == 'using_quality_scores':
245 totalQualityPenalties = param[-totalQualSP:]
246 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
247
248 # Calculate w'phi(x,y) the total score of the alignment
249 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
250
251 # The allWeights vector is supposed to store the weight parameter
252 # of the true alignment as well as the weight parameters of the
253 # num_path[exampleIdx] other alignments
254 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
255 allWeights[:,0] = trueWeight[:,0]
256
257 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
258 AlignmentScores[0] = trueAlignmentScore
259 ################## Calculate wrong alignment(s) ######################
260 # Compute donor, acceptor with penalty_lookup_new
261 # returns two double lists
262 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
263
264 #myalign wants the acceptor site on the g of the ag
265 acceptor = acceptor[1:]
266 acceptor.append(-inf)
267
268 # check that splice site scores are at dna positions as expected by
269 # the dynamic programming component
270
271 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
272 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
273 # or dna[d_pos+1] == 't'), pdb.set_trace()
274 #
275 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
276 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
277
278 ps = h.convert2SWIG()
279
280 prb = QPalmaDP.createDoubleArrayFromList(quality)
281 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
282
283 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
284 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
285
286 d_len = len(donor)
287 donor = QPalmaDP.createDoubleArrayFromList(donor)
288 a_len = len(acceptor)
289 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
290
291 # Create the alignment object representing the interface to the C/C++ code.
292 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
293 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
294 #print 'Calling myalign...'
295 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
296 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
297 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
298 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
299 print_matrix)
300
301 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
302 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
303 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
304 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
305
306 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
307
308 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
309 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
310
311 #print 'After calling getAlignmentResults...'
312
313 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
314 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
315 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
316 newDPScores = zeros((num_path[exampleIdx],1))
317 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
318
319 #print 'newSpliceAlign'
320 for i in range(dna_len*num_path[exampleIdx]):
321 newSpliceAlign[i] = c_SpliceAlign[i]
322 # print '%f' % (spliceAlign[i])
323
324 #print 'newEstAlign'
325 for i in range(est_len*num_path[exampleIdx]):
326 newEstAlign[i] = c_EstAlign[i]
327 # print '%f' % (spliceAlign[i])
328
329 #print 'weightMatch'
330 for i in range(mm_len*num_path[exampleIdx]):
331 newWeightMatch[i] = c_WeightMatch[i]
332 # print '%f' % (weightMatch[i])
333
334 #print 'ViterbiScores'
335 for i in range(num_path[exampleIdx]):
336 newDPScores[i] = c_DPScores[i]
337
338 if self.use_quality_scores:
339 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
340 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
341
342 # equals palma up to here
343
344 #print "Calling destructors"
345 del c_SpliceAlign
346 del c_EstAlign
347 del c_WeightMatch
348 del c_DPScores
349 del c_qualityPlifsFeatures
350 #del currentAlignment
351
352 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
353 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
354
355 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],Conf.totalQualSuppPoints)
356 # Calculate weights of the respective alignments Note that we are
357 # calculating n-best alignments without hamming loss, so we
358 # have to keep track which of the n-best alignments correspond to
359 # the true one in order not to incorporate a true alignment in the
360 # constraints. To keep track of the true and false alignments we
361 # define an array true_map with a boolean indicating the
362 # equivalence to the true alignment for each decoded alignment.
363 true_map = [0]*(num_path[exampleIdx]+1)
364 true_map[0] = 1
365
366 for pathNr in range(num_path[exampleIdx]):
367 #print 'decodedWeights'
368 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
369 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
370 acc_supp)
371
372 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
373 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
374 # Gewichte in restliche Zeilen der Matrix speichern
375 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
376
377 hpen = mat(h.penalties).reshape(len(h.penalties),1)
378 dpen = mat(d.penalties).reshape(len(d.penalties),1)
379 apen = mat(a.penalties).reshape(len(a.penalties),1)
380 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
381
382 #zero_out(features,intervals)
383 #zero_out(allWeights[:,pathNr+1],intervals)
384
385 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
386
387 #pdb.set_trace()
388
389 distinct_scores = False
390 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
391 distinct_scores = True
392
393 # Check wether scalar product + loss equals viterbi score
394 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
395 pdb.set_trace()
396
397 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
398 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
399
400 #pdb.set_trace()
401
402 # if the pathNr-best alignment is very close to the true alignment consider it as true
403 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
404 true_map[pathNr+1] = 1
405
406 #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
407 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
408 print "suboptimal_example %d\n" %exampleIdx
409 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
410 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
411
412 pdb.set_trace()
413 suboptimal_example += 1
414 self.plog("suboptimal_example %d\n" %exampleIdx)
415
416 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
417 # this means that all n-best paths are to close to each other
418 # we have to extend the n-best search to a (n+1)-best
419 if len([elem for elem in true_map if elem == 1]) == len(true_map):
420 num_path[exampleIdx] = num_path[exampleIdx]+1
421
422 # Choose true and first false alignment for extending
423 firstFalseIdx = -1
424 for map_idx,elem in enumerate(true_map):
425 if elem == 0:
426 firstFalseIdx = map_idx
427 break
428
429 if False:
430 self.plog("Is considered as: %d\n" % true_map[1])
431
432 result_len = currentAlignment.getResultLength()
433 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
434 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
435
436 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
437
438 dna_array = [0.0]*result_len
439 est_array = [0.0]*result_len
440
441 for r_idx in range(result_len):
442 dna_array[r_idx] = c_dna_array[r_idx]
443 est_array[r_idx] = c_est_array[r_idx]
444
445 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
446 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
447 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
448 self.plog(line1+'\n')
449 self.plog(line2+'\n')
450 self.plog(line3+'\n')
451
452 # if there is at least one useful false alignment add the
453 # corresponding constraints to the optimization problem
454 if firstFalseIdx != -1:
455 firstFalseWeights = allWeights[:,firstFalseIdx]
456 differenceVector = trueWeight - firstFalseWeights
457 #pdb.set_trace()
458
459 if Conf.USE_OPT:
460 const_added = solver.addConstraint(differenceVector, exampleIdx)
461 const_added_ctr += 1
462 #
463 # end of one example processing
464 #
465
466 # call solver every nth example //added constraint
467 if exampleIdx != 0 and exampleIdx % numConstPerRound == 0 and Conf.USE_OPT:
468 objValue,w,self.slacks = solver.solve()
469 solver_call_ctr += 1
470
471 if solver_call_ctr == 5:
472 self.plog('numConstPerRound is now %d\n'% numConstPerRound)
473 numConstPerRound = 200
474
475 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
476 self.noImprovementCtr += 1
477
478 if self.noImprovementCtr == numExamples+1:
479 break
480
481 self.oldObjValue = objValue
482 print "objValue is %f" % objValue
483
484 sum_xis = 0
485 for elem in self.slacks:
486 sum_xis += elem
487
488 print 'sum of slacks is %f'% sum_xis
489 self.plog('sum of slacks is %f\n'% sum_xis)
490
491 for i in range(len(param)):
492 param[i] = w[i]
493
494 #pdb.set_trace()
495 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
496 param_idx += 1
497 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
498
499 #
500 # end of one iteration through all examples
501 #
502
503 self.plog("suboptimal rounds %d\n" %suboptimal_example)
504
505 if self.noImprovementCtr == numExamples*2:
506 break
507
508 iteration_nr += 1
509
510 #
511 # end of optimization
512 #
513 print 'Training completed'
514
515 pa = para()
516 pa.h = h
517 pa.d = d
518 pa.a = a
519 pa.mmatrix = mmatrix
520 pa.qualityPlifs = qualityPlifs
521
522 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
523 #cPickle.dump(pa,open('elegans.param','w+'))
524 self.logfh.close()
525
526 def evaluate(self,param_filename):
527 beg = Conf.prediction_begin
528 end = Conf.prediction_end
529 self.logfh = open('_qpalma_predict.log','w+')
530
531 # predict on training set
532 print '##### Prediction on the training set #####'
533 self.predict(param_filename,0,beg)
534
535 # predict on test set
536 print '##### Prediction on the test set #####'
537 self.predict(param_filename,beg,end)
538
539 self.logfh.close()
540 #pdb.set_trace()
541
542 def predict(self,param_filename,beg,end):
543
544 Sequences = self.Sequences[beg:end]
545 Exons = self.Exons[beg:end]
546 Ests = self.Ests[beg:end]
547 OriginalEsts = self.OriginalEsts[beg:end]
548 Qualities = self.Qualities[beg:end]
549 Acceptors = self.Acceptors[beg:end]
550 Donors = self.Donors[beg:end]
551 SplitPos = self.SplitPos[beg:end]
552
553 # number of training instances
554 N = numExamples = len(Sequences)
555 assert len(Exons) == N and len(Ests) == N\
556 and len(Qualities) == N and len(Acceptors) == N\
557 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
558 self.plog('Number of training examples: %d\n'% numExamples)
559
560 self.noImprovementCtr = 0
561 self.oldObjValue = 1e8
562
563 remove_duplicate_scores = Conf.remove_duplicate_scores
564 print_matrix = Conf.print_matrix
565 anzpath = Conf.anzpath
566
567 param = cPickle.load(open(param_filename))
568
569 # Set the parameters such as limits penalties for the Plifs
570 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
571
572 #############################################################################################
573 # Prediction
574 #############################################################################################
575 self.plog('Starting prediction...\n')
576
577 donSP = Conf.numDonSuppPoints
578 accSP = Conf.numAccSuppPoints
579 lengthSP = Conf.numLengthSuppPoints
580 mmatrixSP = Conf.sizeMatchmatrix[0]\
581 *Conf.sizeMatchmatrix[1]
582 numq = Conf.numQualSuppPoints
583 totalQualSP = Conf.totalQualSuppPoints
584
585 currentPhi = zeros((Conf.numFeatures,1))
586 totalQualityPenalties = zeros((totalQualSP,1))
587
588 exon1Begin = []
589 exon1End = []
590 exon2Begin = []
591 exon2End = []
592 allWrongExons = []
593
594 for exampleIdx in range(numExamples):
595 dna = Sequences[exampleIdx]
596 est = Ests[exampleIdx]
597 est = "".join(est)
598 est = est.lower()
599 est = est.replace('-','')
600 original_est = OriginalEsts[exampleIdx]
601 original_est = "".join(original_est)
602 original_est = original_est.lower()
603 currentSplitPos = SplitPos[exampleIdx]
604
605 if Conf.mode == 'normal':
606 quality = [40]*len(est)
607
608 if Conf.mode == 'using_quality_scores':
609 quality = Qualities[exampleIdx]
610
611 quality = [40]*len(est)
612
613 exons = Exons[exampleIdx]
614 # NoiseMatrix = Noises[exampleIdx]
615 don_supp = Donors[exampleIdx]
616 acc_supp = Acceptors[exampleIdx]
617
618 #don_supp = [0.0]*len(don_supp)
619 #acc_supp = [0.0]*len(acc_supp)
620
621 # for idx,elem in enumerate(don_supp):
622 # if elem != -inf:
623 # don_supp[idx] = 0.0
624
625 # for idx,elem in enumerate(acc_supp):
626 # if elem != -inf:
627 # acc_supp[idx] = 0.0
628
629 for idx,elem in enumerate(don_supp):
630 if elem != -inf:
631 don_supp[idx] = 0.0
632
633 for idx,elem in enumerate(acc_supp):
634 if elem != -inf:
635 acc_supp[idx] = 0.0
636
637 ## Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
638 #if Conf.mode == 'using_quality_scores':
639 # trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
640 # computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
641 #else:
642 # trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
643 #
644 ## Calculate the weights
645 #trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
646 #trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
647
648 #if Conf.mode == 'using_quality_scores':
649 # totalQualityPenalties = param[-totalQualSP:]
650 # currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
651
652 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
653 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
654 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
655 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
656
657 # Calculate w'phi(x,y) the total score of the alignment
658 #trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
659
660 # The allWeights vector is supposed to store the weight parameter
661 # of the true alignment as well as the weight parameters of the
662 # 1 other alignments
663 #allWeights = zeros((Conf.numFeatures,1+1))
664 #allWeights[:,0] = trueWeight[:,0]
665
666 #AlignmentScores = [0.0]*(1+1)
667 #AlignmentScores[0] = trueAlignmentScore
668
669 ################## Calculate wrong alignment(s) ######################
670
671 # Compute donor, acceptor with penalty_lookup_new
672 # returns two double lists
673 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
674
675 #myalign wants the acceptor site on the g of the ag
676 acceptor = acceptor[1:]
677 acceptor.append(-inf)
678
679 dna = str(dna)
680 est = str(est)
681 dna_len = len(dna)
682 est_len = len(est)
683
684
685 ps = h.convert2SWIG()
686
687 prb = QPalmaDP.createDoubleArrayFromList(quality)
688 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
689
690 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
691 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
692
693 d_len = len(donor)
694 donor = QPalmaDP.createDoubleArrayFromList(donor)
695 a_len = len(acceptor)
696 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
697
698 # Create the alignment object representing the interface to the C/C++ code.
699 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
700
701 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
702
703 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
704 currentAlignment.myalign(1, dna, dna_len,\
705 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
706 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
707 print_matrix)
708
709 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
710 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
711 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
712 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
713
714 result_len = currentAlignment.getResultLength()
715 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
716 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
717
718 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
719
720 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
721
722 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
723 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
724
725 newSpliceAlign = zeros((dna_len,1))
726 newEstAlign = zeros((est_len,1))
727 newWeightMatch = zeros((mm_len,1))
728 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
729
730 for i in range(dna_len):
731 newSpliceAlign[i] = c_SpliceAlign[i]
732
733 for i in range(est_len):
734 newEstAlign[i] = c_EstAlign[i]
735
736 for i in range(mm_len):
737 newWeightMatch[i] = c_WeightMatch[i]
738
739 newDPScores = c_DPScores[0]
740
741 for i in range(Conf.totalQualSuppPoints):
742 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
743
744 del c_SpliceAlign
745 del c_EstAlign
746 del c_WeightMatch
747 del c_DPScores
748 del c_qualityPlifsFeatures
749 del currentAlignment
750
751 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
752 newWeightMatch = newWeightMatch.reshape(1,mm_len)
753 true_map = [0]*2
754 true_map[0] = 1
755 pathNr = 0
756
757 dna_array = [0.0]*result_len
758 est_array = [0.0]*result_len
759
760 for r_idx in range(result_len):
761 dna_array[r_idx] = c_dna_array[r_idx]
762 est_array[r_idx] = c_est_array[r_idx]
763
764 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
765 _newEstAlign = newEstAlign.flatten().tolist()[0]
766
767 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
768 self.plog(line1+'\n')
769 self.plog(line2+'\n')
770 self.plog(line3+'\n')
771
772 #weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
773
774 #decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
775 #for qidx in range(Conf.totalQualSuppPoints):
776 # decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
777
778 # Gewichte in restliche Zeilen der Matrix speichern
779 #wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
780 #allWeights[:,pathNr+1] = wp
781
782 #hpen = mat(h.penalties).reshape(len(h.penalties),1)
783 #dpen = mat(d.penalties).reshape(len(d.penalties),1)
784 #apen = mat(a.penalties).reshape(len(a.penalties),1)
785 #features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
786 #AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
787 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
788
789 if e1_b_off != None:
790 exon1Begin.append(e1_b_off)
791 exon1End.append(e1_e_off)
792 exon2Begin.append(e2_b_off)
793 exon2End.append(e2_e_off)
794 else:
795 allWrongExons.append((newExons,exons))
796
797 logfile = self.logfh
798
799 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
800 print >> logfile, 'example %d correct' % exampleIdx
801 else:
802 print >> logfile, 'example %d wrong' % exampleIdx
803
804 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
805 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
806
807 all_pos_correct = 0
808 for idx in range(len(exon1Begin)):
809 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
810 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
811 all_pos_correct += 1
812
813 logfile = self.logfh
814 print >> logfile, 'Total num. of examples: %d' % numExamples
815 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
816 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
817 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
818 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
819
820 print 'Total num. of examples: %d' % numExamples
821 print 'Number of total correct examples: %d' % all_pos_correct
822 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
823 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
824 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
825
826 print 'Prediction completed'
827
828 def evaluatePositions(self,eBegin,eEnd):
829
830 eBegin_pos = [elem for elem in eBegin if elem == 0]
831 eBegin_neg = [elem for elem in eBegin if elem != 0]
832 eEnd_pos = [elem for elem in eEnd if elem == 0]
833 eEnd_neg = [elem for elem in eEnd if elem != 0]
834
835 mean_eBegin_neg = 0
836 for idx in range(len(eBegin_neg)):
837 mean_eBegin_neg += eBegin_neg[idx]
838
839 try:
840 mean_eBegin_neg /= 1.0*len(eBegin_neg)
841 except:
842 mean_eBegin_neg = -1
843
844 mean_eEnd_neg = 0
845 for idx in range(len(eEnd_neg)):
846 mean_eEnd_neg += eEnd_neg[idx]
847
848 try:
849 mean_eEnd_neg /= 1.0*len(eEnd_neg)
850 except:
851 mean_eEnd_neg = -1
852
853 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
854
855 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
856 newExons = []
857 oldElem = -1
858 SpliceAlign = SpliceAlign.flatten().tolist()[0]
859 SpliceAlign.append(-1)
860 for pos,elem in enumerate(SpliceAlign):
861 if pos == 0:
862 oldElem = -1
863 else:
864 oldElem = SpliceAlign[pos-1]
865
866 if oldElem != 0 and elem == 0: # start of exon
867 newExons.append(pos)
868
869 if oldElem == 0 and elem != 0: # end of exon
870 newExons.append(pos)
871
872 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
873
874 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
875 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
876 self.plog(("read: " + str(est)+ "\n"))
877
878 if len(newExons) == 4:
879 e1_begin,e1_end = newExons[0],newExons[1]
880 e2_begin,e2_end = newExons[2],newExons[3]
881 else:
882 return None,None,None,None,newExons
883
884 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
885 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
886
887 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
888 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
889
890 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
891
892 def fetch_genome_info(ginfo_filename):
893 if not os.path.exists(ginfo_filename):
894 cmd = ['']*4
895 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
896 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
897 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
898 cmd[3] = 'save genome_info.mat genome_info'
899 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
900
901 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
902 out,err = obj.communicate()
903 assert err == '', 'An error occured!\n%s'%err
904
905 ginfo = scipy.io.loadmat('genome_info.mat')
906 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
907 return ginfo['genome_info']
908
909 else:
910 return cPickle.load(open(ginfo_filename))
911
912
913
914 def calcStat(Acceptor,Donor,Exons):
915 maxAcc = -100
916 minAcc = 100
917 maxDon = -100
918 minDon = 100
919
920 acc_vec_pos = []
921 acc_vec_neg = []
922 don_vec_pos = []
923 don_vec_neg = []
924
925 for jdx in range(len(Acceptors)):
926 currentExons = Exons[jdx]
927 currentAcceptor = Acceptors[jdx]
928 currentAcceptor = currentAcceptor[1:]
929 currentAcceptor.append(-inf)
930 currentDonor = Donors[jdx]
931
932 for idx,elem in enumerate(currentAcceptor):
933 if idx == (int(currentExons[1,0])-1): # acceptor site
934 acc_vec_pos.append(elem)
935 else:
936 acc_vec_neg.append(elem)
937
938 for idx,elem in enumerate(currentDonor):
939 if idx == (int(currentExons[0,1])): # donor site
940 don_vec_pos.append(elem)
941 else:
942 don_vec_neg.append(elem)
943
944 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
945 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
946 don_pos = [elem for elem in don_vec_pos if elem != -inf]
947 don_neg = [elem for elem in don_vec_neg if elem != -inf]
948
949 pdb.set_trace()
950
951 for idx in range(len(Sequences)):
952 acc = [elem for elem in Acceptors[idx] if elem != -inf]
953 maxAcc = max(max(acc),maxAcc)
954 minAcc = min(min(acc),minAcc)
955
956 don = [elem for elem in Donors[idx] if elem != -inf]
957 maxDon = max(max(don),maxDon)
958 minDon = min(min(don),minDon)
959
960 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
961 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
962 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
963
964 t_strand = '+'
965 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
966 #(gap_char, 5 nucleotides, intron_char)
967 comparison_char = '| ' #first: match_char, second: intron_char
968
969 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
970 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
971
972 first_identity = None
973 last_identity = None
974 for i in range(len(comparison)):
975 if comparison[i] == '|' and first_identity is None:
976 first_identity = i
977 if comparison[-i] == '|' and last_identity is None:
978 last_identity = len(comparison) - i - 1
979
980 try:
981 for idx in range(len(dna_array)):
982 dna_array[idx] = translation[int(dna_array[idx])]
983 est_array[idx] = translation[int(est_array[idx])]
984 except:
985 pdb.set_trace()
986
987 line1 = "".join(dna_array)
988 line2 = comparison
989 line3 = "".join(est_array)
990
991 return line1,line2,line3
992
993 def calc_info(acc,don,exons,qualities):
994 min_score = 100
995 max_score = -100
996
997 for elem in acc:
998 for score in elem:
999 if score != -inf:
1000 min_score = min(min_score,score)
1001 max_score = max(max_score,score)
1002
1003 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
1004
1005 min_score = 100
1006 max_score = -100
1007
1008 for elem in don:
1009 for score in elem:
1010 if score != -inf:
1011 min_score = min(min_score,score)
1012 max_score = max(max_score,score)
1013
1014 print 'Donor max/min: %f / %f' % (max_score,min_score)
1015
1016 min_score = 10000
1017 max_score = 0
1018 mean_intron_len = 0
1019
1020 for elem in exons:
1021 intron_len = int(elem[1,0] - elem[0,1])
1022 mean_intron_len += intron_len
1023 min_score = min(min_score,intron_len)
1024 max_score = max(max_score,intron_len)
1025
1026 mean_intron_len /= 1.0*len(exons)
1027 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
1028
1029 min_score = 10000
1030 max_score = 0
1031 mean_quality = 0
1032
1033 for qVec in qualities:
1034 for elem in qVec:
1035 min_score = min(min_score,elem)
1036 max_score = max(max_score,elem)
1037
1038 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
1039
1040
1041 def zero_out(vec,intervals):
1042 for interval in intervals:
1043 for pos in interval:
1044 vec[pos] = 0.0
1045
1046 if __name__ == '__main__':
1047 qpalma = QPalma()
1048
1049 mode = sys.argv[1]
1050
1051 if len(sys.argv) == 2 and mode == 'train':
1052 qpalma.train()
1053
1054 elif len(sys.argv) == 3 and mode == 'predict':
1055 param_filename = sys.argv[2]
1056 assert os.path.exists(param_filename)
1057 qpalma.evaluate(param_filename)
1058 else:
1059 print 'You have to choose between training or prediction mode:'
1060 print 'python qpalma. py (train|predict) <param_file>'