+ changed filter behavior (tendency towards smaller read fragments)
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 from compile_dataset import compile_d
44
45 import palma.palma_utils as pu
46 from palma.output_formating import print_results
47
48 class para:
49 pass
50
51 class QPalma:
52 """
53 A training method for the QPalma project
54 """
55
56 def __init__(self):
57 self.ARGS = Param()
58
59 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
60 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
61
62 data_filename = Conf.dataset_fn
63 Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
64
65 # Load the whole dataset
66 if Conf.mode == 'normal':
67 self.use_quality_scores = False
68
69 elif Conf.mode == 'using_quality_scores':
70 self.use_quality_scores = True
71 else:
72 assert(False)
73
74 self.Sequences = Sequences
75 self.Exons = Exons
76 self.Ests = Ests
77 self.OriginalEsts= OriginalEsts
78 self.Qualities = Qualities
79 self.SplitPos = SplitPos
80 self.Donors = Donors
81 self.Acceptors = Acceptors
82
83 calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
84 pdb.set_trace()
85 print 'leaving constructor...'
86
87 def plog(self,string):
88 self.logfh.write(string)
89 self.logfh.flush()
90
91 def train(self):
92 self.logfh = open('_qpalma_train.log','w+')
93
94 beg = Conf.training_begin
95 end = Conf.training_end
96
97 Sequences = self.Sequences[beg:end]
98 Exons = self.Exons[beg:end]
99 Ests = self.Ests[beg:end]
100 OriginalEsts= self.OriginalEsts[beg:end]
101 Qualities = self.Qualities[beg:end]
102 Acceptors = self.Acceptors[beg:end]
103 Donors = self.Donors[beg:end]
104
105 # number of training instances
106 N = numExamples = len(Sequences)
107 assert len(Exons) == N and len(Ests) == N\
108 and len(Qualities) == N and len(Acceptors) == N\
109 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
110 self.plog('Number of training examples: %d\n'% numExamples)
111
112 self.noImprovementCtr = 0
113 self.oldObjValue = 1e8
114
115 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
116 remove_duplicate_scores = Conf.remove_duplicate_scores
117 print_matrix = Conf.print_matrix
118 anzpath = Conf.anzpath
119
120 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
121 param = Conf.fixedParam
122
123 lengthSP = Conf.numLengthSuppPoints
124 donSP = Conf.numDonSuppPoints
125 accSP = Conf.numAccSuppPoints
126 mmatrixSP = Conf.sizeMatchmatrix[0]\
127 *Conf.sizeMatchmatrix[1]
128 numq = Conf.numQualSuppPoints
129 totalQualSP = Conf.totalQualSuppPoints
130
131 #i1 = range(0,lengthSP)
132 #i2 = range(lengthSP,lengthSP+donSP)
133 #i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
134 #i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
135 #i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
136
137 #offset =lengthSP+donSP+accSP+mmatrixSP
138 #intervals = [[offset+2,offset+3]] #i5,\ #i2,#i3,#i4,#i5]
139 #intervals = []
140
141 #param[offset+2] = 10.0
142 #param[offset+3] = 10.0
143
144 #zero_out(param,intervals)
145
146 #pdb.set_trace()
147
148 # Set the parameters such as limits penalties for the Plifs
149 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
150
151 # Initialize solver
152 if Conf.USE_OPT:
153 self.plog('Initializing problem...\n')
154 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
155 #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
156 #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
157
158 # stores the number of alignments done for each example (best path, second-best path etc.)
159 num_path = [anzpath]*numExamples
160 # stores the gap for each example
161 gap = [0.0]*numExamples
162 #############################################################################################
163 # Training
164 #############################################################################################
165 self.plog('Starting training...\n')
166
167 currentPhi = zeros((Conf.numFeatures,1))
168 totalQualityPenalties = zeros((totalQualSP,1))
169
170 suboptimal_example = 0
171 iteration_nr = 0
172 param_idx = 0
173 const_added_ctr = 0
174
175 # the main training loop
176 while True:
177 if iteration_nr == iteration_steps:
178 break
179
180 for exampleIdx in range(numExamples):
181 if (exampleIdx%100) == 0:
182 print 'Current example nr %d' % exampleIdx
183
184 dna = Sequences[exampleIdx]
185 est = Ests[exampleIdx]
186 est = "".join(est)
187 est = est.lower()
188 est = est.replace('-','')
189 original_est = OriginalEsts[exampleIdx]
190 original_est = "".join(original_est)
191 original_est = original_est.lower()
192
193 dna_len = len(dna)
194 est_len = len(est)
195
196 assert len(est) == Conf.read_size, pdb.set_trace()
197
198 if Conf.mode == 'normal':
199 quality = [40]*len(est)
200
201 if Conf.mode == 'using_quality_scores':
202 quality = Qualities[exampleIdx]
203
204 # if Conf.disable_quality_scores
205 #quality = [40]*len(est)
206
207 exons = Exons[exampleIdx]
208 don_supp = Donors[exampleIdx]
209 acc_supp = Acceptors[exampleIdx]
210
211 # if Conf.disable_splice_signals
212 #don_supp = [0.0]*len(don_supp)
213 #acc_supp = [0.0]*len(acc_supp)
214
215 for idx,elem in enumerate(don_supp):
216 if elem != -inf:
217 don_supp[idx] = 0.0
218
219 for idx,elem in enumerate(acc_supp):
220 if elem != -inf:
221 acc_supp[idx] = 0.0
222
223 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
224 if Conf.mode == 'using_quality_scores':
225 trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
226 computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
227 else:
228 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
229
230 dna_calc = dna_calc.replace('-','')
231
232 # Calculate the weights
233 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
234 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
235
236 currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
237 currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
238 currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
239 currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
240
241 if Conf.mode == 'using_quality_scores':
242 totalQualityPenalties = param[-totalQualSP:]
243 currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
244
245 # Calculate w'phi(x,y) the total score of the alignment
246 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
247
248 # The allWeights vector is supposed to store the weight parameter
249 # of the true alignment as well as the weight parameters of the
250 # num_path[exampleIdx] other alignments
251 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
252 allWeights[:,0] = trueWeight[:,0]
253
254 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
255 AlignmentScores[0] = trueAlignmentScore
256 ################## Calculate wrong alignment(s) ######################
257 # Compute donor, acceptor with penalty_lookup_new
258 # returns two double lists
259 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
260
261 #myalign wants the acceptor site on the g of the ag
262 acceptor = acceptor[1:]
263 acceptor.append(-inf)
264
265 # check that splice site scores are at dna positions as expected by
266 # the dynamic programming component
267
268 #for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
269 # assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
270 # or dna[d_pos+1] == 't'), pdb.set_trace()
271 #
272 #for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
273 # assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
274
275 ps = h.convert2SWIG()
276
277 prb = QPalmaDP.createDoubleArrayFromList(quality)
278 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
279
280 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
281 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
282
283 d_len = len(donor)
284 donor = QPalmaDP.createDoubleArrayFromList(donor)
285 a_len = len(acceptor)
286 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
287
288 # Create the alignment object representing the interface to the C/C++ code.
289 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
290 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
291 #print 'Calling myalign...'
292 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
293 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
294 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
295 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
296 print_matrix)
297
298 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
299 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
300 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
301 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
302
303 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
304
305 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
306 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
307
308 #print 'After calling getAlignmentResults...'
309
310 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
311 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
312 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
313 newDPScores = zeros((num_path[exampleIdx],1))
314 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
315
316 #print 'newSpliceAlign'
317 for i in range(dna_len*num_path[exampleIdx]):
318 newSpliceAlign[i] = c_SpliceAlign[i]
319 # print '%f' % (spliceAlign[i])
320
321 #print 'newEstAlign'
322 for i in range(est_len*num_path[exampleIdx]):
323 newEstAlign[i] = c_EstAlign[i]
324 # print '%f' % (spliceAlign[i])
325
326 #print 'weightMatch'
327 for i in range(mm_len*num_path[exampleIdx]):
328 newWeightMatch[i] = c_WeightMatch[i]
329 # print '%f' % (weightMatch[i])
330
331 #print 'ViterbiScores'
332 for i in range(num_path[exampleIdx]):
333 newDPScores[i] = c_DPScores[i]
334
335 if self.use_quality_scores:
336 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
337 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
338
339 # equals palma up to here
340
341 #print "Calling destructors"
342 del c_SpliceAlign
343 del c_EstAlign
344 del c_WeightMatch
345 del c_DPScores
346 del c_qualityPlifsFeatures
347 #del currentAlignment
348
349 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
350 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
351
352 newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],Conf.totalQualSuppPoints)
353 # Calculate weights of the respective alignments Note that we are
354 # calculating n-best alignments without hamming loss, so we
355 # have to keep track which of the n-best alignments correspond to
356 # the true one in order not to incorporate a true alignment in the
357 # constraints. To keep track of the true and false alignments we
358 # define an array true_map with a boolean indicating the
359 # equivalence to the true alignment for each decoded alignment.
360 true_map = [0]*(num_path[exampleIdx]+1)
361 true_map[0] = 1
362
363 for pathNr in range(num_path[exampleIdx]):
364 #print 'decodedWeights'
365 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
366 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
367 acc_supp)
368
369 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
370 decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
371 # Gewichte in restliche Zeilen der Matrix speichern
372 allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
373
374 hpen = mat(h.penalties).reshape(len(h.penalties),1)
375 dpen = mat(d.penalties).reshape(len(d.penalties),1)
376 apen = mat(a.penalties).reshape(len(a.penalties),1)
377 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
378
379 #zero_out(features,intervals)
380 #zero_out(allWeights[:,pathNr+1],intervals)
381
382 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
383
384 #pdb.set_trace()
385
386 distinct_scores = False
387 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
388 distinct_scores = True
389
390 # Check wether scalar product + loss equals viterbi score
391 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
392 pdb.set_trace()
393
394 self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
395 self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
396
397 #pdb.set_trace()
398
399 # if the pathNr-best alignment is very close to the true alignment consider it as true
400 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
401 true_map[pathNr+1] = 1
402
403 #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
404 if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
405 print "suboptimal_example %d\n" %exampleIdx
406 #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
407 #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
408
409 pdb.set_trace()
410 suboptimal_example += 1
411 self.plog("suboptimal_example %d\n" %exampleIdx)
412
413 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
414 # this means that all n-best paths are to close to each other
415 # we have to extend the n-best search to a (n+1)-best
416 if len([elem for elem in true_map if elem == 1]) == len(true_map):
417 num_path[exampleIdx] = num_path[exampleIdx]+1
418
419 # Choose true and first false alignment for extending
420 firstFalseIdx = -1
421 for map_idx,elem in enumerate(true_map):
422 if elem == 0:
423 firstFalseIdx = map_idx
424 break
425
426 if False:
427 self.plog("Is considered as: %d\n" % true_map[1])
428
429 result_len = currentAlignment.getResultLength()
430 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
431 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
432
433 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
434
435 dna_array = [0.0]*result_len
436 est_array = [0.0]*result_len
437
438 for r_idx in range(result_len):
439 dna_array[r_idx] = c_dna_array[r_idx]
440 est_array[r_idx] = c_est_array[r_idx]
441
442 _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
443 _newEstAlign = newEstAlign[0].flatten().tolist()[0]
444 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
445 self.plog(line1+'\n')
446 self.plog(line2+'\n')
447 self.plog(line3+'\n')
448
449 # if there is at least one useful false alignment add the
450 # corresponding constraints to the optimization problem
451 if firstFalseIdx != -1:
452 firstFalseWeights = allWeights[:,firstFalseIdx]
453 differenceVector = trueWeight - firstFalseWeights
454 #pdb.set_trace()
455
456 if Conf.USE_OPT:
457 const_added = solver.addConstraint(differenceVector, exampleIdx)
458 const_added_ctr += 1
459 #
460 # end of one example processing
461 #
462
463 # call solver every nth example //added constraint
464 if exampleIdx != 0 and exampleIdx % Conf.numConstraintsPerRound == 0 and Conf.USE_OPT:
465 objValue,w,self.slacks = solver.solve()
466
467 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
468 self.noImprovementCtr += 1
469
470 if self.noImprovementCtr == numExamples+1:
471 break
472
473 self.oldObjValue = objValue
474 print "objValue is %f" % objValue
475
476 sum_xis = 0
477 for elem in self.slacks:
478 sum_xis += elem
479
480 print 'sum of slacks is %f'% sum_xis
481 self.plog('sum of slacks is %f\n'% sum_xis)
482
483 for i in range(len(param)):
484 param[i] = w[i]
485
486 #pdb.set_trace()
487 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
488 param_idx += 1
489 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
490
491 #
492 # end of one iteration through all examples
493 #
494
495 self.plog("suboptimal rounds %d\n" %suboptimal_example)
496
497 if self.noImprovementCtr == numExamples*2:
498 break
499
500 iteration_nr += 1
501
502 #
503 # end of optimization
504 #
505 print 'Training completed'
506
507 pa = para()
508 pa.h = h
509 pa.d = d
510 pa.a = a
511 pa.mmatrix = mmatrix
512 pa.qualityPlifs = qualityPlifs
513
514 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
515 #cPickle.dump(pa,open('elegans.param','w+'))
516 self.logfh.close()
517
518 def evaluate(self,param_filename):
519 beg = Conf.prediction_begin
520 end = Conf.prediction_end
521 self.logfh = open('_qpalma_predict.log','w+')
522
523 # predict on training set
524 print '##### Prediction on the training set #####'
525 self.predict(param_filename,0,beg)
526
527 # predict on test set
528 print '##### Prediction on the test set #####'
529 self.predict(param_filename,beg,end)
530
531 self.logfh.close()
532 #pdb.set_trace()
533
534 def predict(self,param_filename,beg,end):
535
536 Sequences = self.Sequences[beg:end]
537 Exons = self.Exons[beg:end]
538 Ests = self.Ests[beg:end]
539 OriginalEsts = self.OriginalEsts[beg:end]
540 Qualities = self.Qualities[beg:end]
541 Acceptors = self.Acceptors[beg:end]
542 Donors = self.Donors[beg:end]
543 SplitPos = self.SplitPos[beg:end]
544
545 # number of training instances
546 N = numExamples = len(Sequences)
547 assert len(Exons) == N and len(Ests) == N\
548 and len(Qualities) == N and len(Acceptors) == N\
549 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
550 self.plog('Number of training examples: %d\n'% numExamples)
551
552 self.noImprovementCtr = 0
553 self.oldObjValue = 1e8
554
555 remove_duplicate_scores = Conf.remove_duplicate_scores
556 print_matrix = Conf.print_matrix
557 anzpath = Conf.anzpath
558
559 param = cPickle.load(open(param_filename))
560
561 # Set the parameters such as limits penalties for the Plifs
562 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
563
564 #############################################################################################
565 # Prediction
566 #############################################################################################
567 self.plog('Starting prediction...\n')
568
569 donSP = Conf.numDonSuppPoints
570 accSP = Conf.numAccSuppPoints
571 lengthSP = Conf.numLengthSuppPoints
572 mmatrixSP = Conf.sizeMatchmatrix[0]\
573 *Conf.sizeMatchmatrix[1]
574 numq = Conf.numQualSuppPoints
575 totalQualSP = Conf.totalQualSuppPoints
576
577 currentPhi = zeros((Conf.numFeatures,1))
578 totalQualityPenalties = zeros((totalQualSP,1))
579
580 exon1Begin = []
581 exon1End = []
582 exon2Begin = []
583 exon2End = []
584 allWrongExons = []
585
586 for exampleIdx in range(numExamples):
587 dna = Sequences[exampleIdx]
588 est = Ests[exampleIdx]
589 est = "".join(est)
590 est = est.lower()
591 est = est.replace('-','')
592 original_est = OriginalEsts[exampleIdx]
593 original_est = "".join(original_est)
594 original_est = original_est.lower()
595 currentSplitPos = SplitPos[exampleIdx]
596
597 if Conf.mode == 'normal':
598 quality = [40]*len(est)
599
600 if Conf.mode == 'using_quality_scores':
601 quality = Qualities[exampleIdx]
602
603 #quality = [40]*len(est)
604
605 exons = Exons[exampleIdx]
606 # NoiseMatrix = Noises[exampleIdx]
607 don_supp = Donors[exampleIdx]
608 acc_supp = Acceptors[exampleIdx]
609
610 #don_supp = [0.0]*len(don_supp)
611 #acc_supp = [0.0]*len(acc_supp)
612
613 # for idx,elem in enumerate(don_supp):
614 # if elem != -inf:
615 # don_supp[idx] = 0.0
616
617 # for idx,elem in enumerate(acc_supp):
618 # if elem != -inf:
619 # acc_supp[idx] = 0.0
620
621 for idx,elem in enumerate(don_supp):
622 if elem != -inf:
623 don_supp[idx] = 0.0
624
625 for idx,elem in enumerate(acc_supp):
626 if elem != -inf:
627 acc_supp[idx] = 0.0
628
629 ## Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
630 #if Conf.mode == 'using_quality_scores':
631 # trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
632 # computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
633 #else:
634 # trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
635 #
636 ## Calculate the weights
637 #trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
638 #trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
639
640 #if Conf.mode == 'using_quality_scores':
641 # totalQualityPenalties = param[-totalQualSP:]
642 # currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
643
644 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
645 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
646 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
647 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
648
649 # Calculate w'phi(x,y) the total score of the alignment
650 #trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
651
652 # The allWeights vector is supposed to store the weight parameter
653 # of the true alignment as well as the weight parameters of the
654 # 1 other alignments
655 #allWeights = zeros((Conf.numFeatures,1+1))
656 #allWeights[:,0] = trueWeight[:,0]
657
658 #AlignmentScores = [0.0]*(1+1)
659 #AlignmentScores[0] = trueAlignmentScore
660
661 ################## Calculate wrong alignment(s) ######################
662
663 # Compute donor, acceptor with penalty_lookup_new
664 # returns two double lists
665 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
666
667 #myalign wants the acceptor site on the g of the ag
668 acceptor = acceptor[1:]
669 acceptor.append(-inf)
670
671 dna = str(dna)
672 est = str(est)
673 dna_len = len(dna)
674 est_len = len(est)
675
676
677 ps = h.convert2SWIG()
678
679 prb = QPalmaDP.createDoubleArrayFromList(quality)
680 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
681
682 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
683 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
684
685 d_len = len(donor)
686 donor = QPalmaDP.createDoubleArrayFromList(donor)
687 a_len = len(acceptor)
688 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
689
690 # Create the alignment object representing the interface to the C/C++ code.
691 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
692
693 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
694
695 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
696 currentAlignment.myalign(1, dna, dna_len,\
697 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
698 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
699 print_matrix)
700
701 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
702 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
703 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
704 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
705
706 result_len = currentAlignment.getResultLength()
707 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
708 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
709
710 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
711
712 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
713
714 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
715 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
716
717 newSpliceAlign = zeros((dna_len,1))
718 newEstAlign = zeros((est_len,1))
719 newWeightMatch = zeros((mm_len,1))
720 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
721
722 for i in range(dna_len):
723 newSpliceAlign[i] = c_SpliceAlign[i]
724
725 for i in range(est_len):
726 newEstAlign[i] = c_EstAlign[i]
727
728 for i in range(mm_len):
729 newWeightMatch[i] = c_WeightMatch[i]
730
731 newDPScores = c_DPScores[0]
732
733 for i in range(Conf.totalQualSuppPoints):
734 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
735
736 del c_SpliceAlign
737 del c_EstAlign
738 del c_WeightMatch
739 del c_DPScores
740 del c_qualityPlifsFeatures
741 del currentAlignment
742
743 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
744 newWeightMatch = newWeightMatch.reshape(1,mm_len)
745 true_map = [0]*2
746 true_map[0] = 1
747 pathNr = 0
748
749 dna_array = [0.0]*result_len
750 est_array = [0.0]*result_len
751
752 for r_idx in range(result_len):
753 dna_array[r_idx] = c_dna_array[r_idx]
754 est_array[r_idx] = c_est_array[r_idx]
755
756 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
757 _newEstAlign = newEstAlign.flatten().tolist()[0]
758
759 line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
760 self.plog(line1+'\n')
761 self.plog(line2+'\n')
762 self.plog(line3+'\n')
763
764 #weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
765
766 #decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
767 #for qidx in range(Conf.totalQualSuppPoints):
768 # decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
769
770 # Gewichte in restliche Zeilen der Matrix speichern
771 #wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
772 #allWeights[:,pathNr+1] = wp
773
774 #hpen = mat(h.penalties).reshape(len(h.penalties),1)
775 #dpen = mat(d.penalties).reshape(len(d.penalties),1)
776 #apen = mat(a.penalties).reshape(len(a.penalties),1)
777 #features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
778 #AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
779 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
780
781 if e1_b_off != None:
782 exon1Begin.append(e1_b_off)
783 exon1End.append(e1_e_off)
784 exon2Begin.append(e2_b_off)
785 exon2End.append(e2_e_off)
786 else:
787 allWrongExons.append((newExons,exons))
788
789 logfile = self.logfh
790
791 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
792 print >> logfile, 'example %d correct' % exampleIdx
793 else:
794 print >> logfile, 'example %d wrong' % exampleIdx
795
796 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
797 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
798
799 all_pos_correct = 0
800 for idx in range(len(exon1Begin)):
801 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
802 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
803 all_pos_correct += 1
804
805 logfile = self.logfh
806 print >> logfile, 'Total num. of examples: %d' % numExamples
807 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
808 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
809 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
810 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
811
812 print 'Total num. of examples: %d' % numExamples
813 print 'Number of total correct examples: %d' % all_pos_correct
814 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
815 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
816 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
817
818 print 'Prediction completed'
819
820 def evaluatePositions(self,eBegin,eEnd):
821
822 eBegin_pos = [elem for elem in eBegin if elem == 0]
823 eBegin_neg = [elem for elem in eBegin if elem != 0]
824 eEnd_pos = [elem for elem in eEnd if elem == 0]
825 eEnd_neg = [elem for elem in eEnd if elem != 0]
826
827 mean_eBegin_neg = 0
828 for idx in range(len(eBegin_neg)):
829 mean_eBegin_neg += eBegin_neg[idx]
830
831 try:
832 mean_eBegin_neg /= 1.0*len(eBegin_neg)
833 except:
834 mean_eBegin_neg = -1
835
836 mean_eEnd_neg = 0
837 for idx in range(len(eEnd_neg)):
838 mean_eEnd_neg += eEnd_neg[idx]
839
840 try:
841 mean_eEnd_neg /= 1.0*len(eEnd_neg)
842 except:
843 mean_eEnd_neg = -1
844
845 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
846
847 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
848 newExons = []
849 oldElem = -1
850 SpliceAlign = SpliceAlign.flatten().tolist()[0]
851 SpliceAlign.append(-1)
852 for pos,elem in enumerate(SpliceAlign):
853 if pos == 0:
854 oldElem = -1
855 else:
856 oldElem = SpliceAlign[pos-1]
857
858 if oldElem != 0 and elem == 0: # start of exon
859 newExons.append(pos)
860
861 if oldElem == 0 and elem != 0: # end of exon
862 newExons.append(pos)
863
864 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
865
866 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
867 #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
868 self.plog(("read: " + str(est)+ "\n"))
869
870 if len(newExons) == 4:
871 e1_begin,e1_end = newExons[0],newExons[1]
872 e2_begin,e2_end = newExons[2],newExons[3]
873 else:
874 return None,None,None,None,newExons
875
876 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
877 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
878
879 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
880 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
881
882 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
883
884 def fetch_genome_info(ginfo_filename):
885 if not os.path.exists(ginfo_filename):
886 cmd = ['']*4
887 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
888 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
889 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
890 cmd[3] = 'save genome_info.mat genome_info'
891 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
892
893 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
894 out,err = obj.communicate()
895 assert err == '', 'An error occured!\n%s'%err
896
897 ginfo = scipy.io.loadmat('genome_info.mat')
898 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
899 return ginfo['genome_info']
900
901 else:
902 return cPickle.load(open(ginfo_filename))
903
904
905
906 def calcStat(Acceptor,Donor,Exons):
907 maxAcc = -100
908 minAcc = 100
909 maxDon = -100
910 minDon = 100
911
912 acc_vec_pos = []
913 acc_vec_neg = []
914 don_vec_pos = []
915 don_vec_neg = []
916
917 for jdx in range(len(Acceptors)):
918 currentExons = Exons[jdx]
919 currentAcceptor = Acceptors[jdx]
920 currentAcceptor = currentAcceptor[1:]
921 currentAcceptor.append(-inf)
922 currentDonor = Donors[jdx]
923
924 for idx,elem in enumerate(currentAcceptor):
925 if idx == (int(currentExons[1,0])-1): # acceptor site
926 acc_vec_pos.append(elem)
927 else:
928 acc_vec_neg.append(elem)
929
930 for idx,elem in enumerate(currentDonor):
931 if idx == (int(currentExons[0,1])): # donor site
932 don_vec_pos.append(elem)
933 else:
934 don_vec_neg.append(elem)
935
936 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
937 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
938 don_pos = [elem for elem in don_vec_pos if elem != -inf]
939 don_neg = [elem for elem in don_vec_neg if elem != -inf]
940
941 pdb.set_trace()
942
943 for idx in range(len(Sequences)):
944 acc = [elem for elem in Acceptors[idx] if elem != -inf]
945 maxAcc = max(max(acc),maxAcc)
946 minAcc = min(min(acc),minAcc)
947
948 don = [elem for elem in Donors[idx] if elem != -inf]
949 maxDon = max(max(don),maxDon)
950 minDon = min(min(don),minDon)
951
952 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
953 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
954 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
955
956 t_strand = '+'
957 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
958 #(gap_char, 5 nucleotides, intron_char)
959 comparison_char = '| ' #first: match_char, second: intron_char
960
961 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
962 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
963
964 first_identity = None
965 last_identity = None
966 for i in range(len(comparison)):
967 if comparison[i] == '|' and first_identity is None:
968 first_identity = i
969 if comparison[-i] == '|' and last_identity is None:
970 last_identity = len(comparison) - i - 1
971
972 try:
973 for idx in range(len(dna_array)):
974 dna_array[idx] = translation[int(dna_array[idx])]
975 est_array[idx] = translation[int(est_array[idx])]
976 except:
977 pdb.set_trace()
978
979 line1 = "".join(dna_array)
980 line2 = comparison
981 line3 = "".join(est_array)
982
983 return line1,line2,line3
984
985 def calc_info(acc,don,exons,qualities):
986 min_score = 100
987 max_score = -100
988
989 for elem in acc:
990 for score in elem:
991 if score != -inf:
992 min_score = min(min_score,score)
993 max_score = max(max_score,score)
994
995 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
996
997 min_score = 100
998 max_score = -100
999
1000 for elem in don:
1001 for score in elem:
1002 if score != -inf:
1003 min_score = min(min_score,score)
1004 max_score = max(max_score,score)
1005
1006 print 'Donor max/min: %f / %f' % (max_score,min_score)
1007
1008 min_score = 10000
1009 max_score = 0
1010 mean_intron_len = 0
1011
1012 for elem in exons:
1013 intron_len = int(elem[1,0] - elem[0,1])
1014 mean_intron_len += intron_len
1015 min_score = min(min_score,intron_len)
1016 max_score = max(max_score,intron_len)
1017
1018 mean_intron_len /= 1.0*len(exons)
1019 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
1020
1021 min_score = 10000
1022 max_score = 0
1023 mean_quality = 0
1024
1025 for qVec in qualities:
1026 for elem in qVec:
1027 min_score = min(min_score,elem)
1028 max_score = max(max_score,elem)
1029
1030 print 'Quality values max/min: %d / %d mean' % (max_score,min_score)
1031
1032
1033 def zero_out(vec,intervals):
1034 for interval in intervals:
1035 for pos in interval:
1036 vec[pos] = 0.0
1037
1038 if __name__ == '__main__':
1039 qpalma = QPalma()
1040
1041 mode = sys.argv[1]
1042
1043 if len(sys.argv) == 2 and mode == 'train':
1044 qpalma.train()
1045
1046 elif len(sys.argv) == 3 and mode == 'predict':
1047 param_filename = sys.argv[2]
1048 assert os.path.exists(param_filename)
1049 qpalma.evaluate(param_filename)
1050 else:
1051 print 'You have to choose between training or prediction mode:'
1052 print 'python qpalma. py (train|predict) <param_file>'