+ rearranged code
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 from compile_dataset import compile_d
44
45 import palma.palma_utils as pu
46 from palma.output_formating import print_results
47
48 class para:
49 pass
50
51 class QPalma:
52 """
53 A training method for the QPalma project
54 """
55
56 def __init__(self):
57 self.ARGS = Param()
58
59 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
60 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
61
62 #gen_file= '%s/genome.config' % self.ARGS.basedir
63 #ginfo_filename = 'genome_info.pickle'
64 #self.genome_info = fetch_genome_info(ginfo_filename)
65 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
66 #self.ARGS.train_with_splicesitescoreinformation = False
67
68 # Load the whole dataset
69 if Conf.mode == 'normal':
70 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
71 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
72 self.use_quality_scores = False
73
74 elif Conf.mode == 'using_quality_scores':
75
76 #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
77 data_filename = Conf.dataset_fn
78
79 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
80
81 #end = 1000
82 self.Sequences = Sequences
83 self.Exons = Exons
84 self.Ests = Ests
85 self.Qualities = Qualities
86 self.SplitPos = SplitPos
87 self.Donors = Donors
88 self.Acceptors = Acceptors
89
90 min_score = 100
91 max_score = -100
92
93 for elem in self.Acceptors:
94 for score in elem:
95 if score != -inf:
96 min_score = min(min_score,score)
97 max_score = max(max_score,score)
98
99 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
100
101 min_score = 100
102 max_score = -100
103
104 for elem in self.Donors:
105 for score in elem:
106 if score != -inf:
107 min_score = min(min_score,score)
108 max_score = max(max_score,score)
109
110 print 'Donor max/min: %f / %f' % (max_score,min_score)
111
112 min_score = 10000
113 max_score = 0
114 mean_intron_len = 0
115
116 for elem in self.Exons:
117 intron_len = int(elem[1,0] - elem[0,1])
118 mean_intron_len += intron_len
119 min_score = min(min_score,intron_len)
120 max_score = max(max_score,intron_len)
121
122 mean_intron_len /= 1.0*len(self.Exons)
123 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
124
125
126 self.use_quality_scores = True
127 else:
128 assert(False)
129
130 print 'leaving constructor...'
131
132 def plog(self,string):
133 self.logfh.write(string)
134 self.logfh.flush()
135
136 def train(self):
137 self.logfh = open('_qpalma_train.log','w+')
138
139 beg = Conf.training_begin
140 end = Conf.training_end
141
142 Sequences = self.Sequences[beg:end]
143 Exons = self.Exons[beg:end]
144 Ests = self.Ests[beg:end]
145 Qualities = self.Qualities[beg:end]
146 Acceptors = self.Acceptors[beg:end]
147 Donors = self.Donors[beg:end]
148
149 # number of training instances
150 N = numExamples = len(Sequences)
151 assert len(Exons) == N and len(Ests) == N\
152 and len(Qualities) == N and len(Acceptors) == N\
153 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
154 self.plog('Number of training examples: %d\n'% numExamples)
155
156 self.noImprovementCtr = 0
157 self.oldObjValue = 1e8
158
159 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
160 remove_duplicate_scores = Conf.remove_duplicate_scores
161 print_matrix = Conf.print_matrix
162 anzpath = Conf.anzpath
163
164 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
165 param = Conf.fixedParam
166
167 # Set the parameters such as limits penalties for the Plifs
168 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
169
170 #pdb.set_trace()
171
172 lengthSP = Conf.numLengthSuppPoints
173 donSP = Conf.numDonSuppPoints
174 accSP = Conf.numAccSuppPoints
175 mmatrixSP = Conf.sizeMatchmatrix[0]\
176 *Conf.sizeMatchmatrix[1]
177 numq = Conf.numQualSuppPoints
178 totalQualSP = Conf.totalQualSuppPoints
179
180 # Initialize solver
181 if Conf.USE_OPT:
182 self.plog('Initializing problem...\n')
183 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
184 solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
185 solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
186
187 # stores the number of alignments done for each example (best path, second-best path etc.)
188 num_path = [anzpath]*numExamples
189 # stores the gap for each example
190 gap = [0.0]*numExamples
191
192 #############################################################################################
193 # Training
194 #############################################################################################
195 self.plog('Starting training...\n')
196
197 currentPhi = zeros((Conf.numFeatures,1))
198 totalQualityPenalties = zeros((totalQualSP,1))
199
200 iteration_nr = 0
201 param_idx = 0
202 const_added_ctr = 0
203 while True:
204 if iteration_nr == iteration_steps:
205 break
206
207 for exampleIdx in range(numExamples):
208 if (exampleIdx%100) == 0:
209 print 'Current example nr %d' % exampleIdx
210
211 dna = Sequences[exampleIdx]
212 est = Ests[exampleIdx]
213
214 if Conf.mode == 'normal':
215 quality = [40]*len(est)
216
217 if Conf.mode == 'using_quality_scores':
218 quality = Qualities[exampleIdx]
219
220 exons = Exons[exampleIdx]
221 # NoiseMatrix = Noises[exampleIdx]
222 don_supp = Donors[exampleIdx]
223 acc_supp = Acceptors[exampleIdx]
224
225 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
226 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
227
228 # Calculate the weights
229 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
230 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
231
232 #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
233 #print idx_lst
234 #print [h.limits[p] for p in idx_lst]
235
236 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
237 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
238 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
239 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
240
241 if Conf.mode == 'using_quality_scores':
242 totalQualityPenalties = param[-totalQualSP:]
243 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
244
245 # Calculate w'phi(x,y) the total score of the alignment
246 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
247
248 # The allWeights vector is supposed to store the weight parameter
249 # of the true alignment as well as the weight parameters of the
250 # num_path[exampleIdx] other alignments
251 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
252 allWeights[:,0] = trueWeight[:,0]
253
254 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
255 AlignmentScores[0] = trueAlignmentScore
256
257 ################## Calculate wrong alignment(s) ######################
258
259 # Compute donor, acceptor with penalty_lookup_new
260 # returns two double lists
261 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
262
263 #myalign wants the acceptor site on the g of the ag
264 acceptor = acceptor[1:]
265 acceptor.append(-inf)
266
267 dna = str(dna)
268 est = str(est)
269 dna_len = len(dna)
270 est_len = len(est)
271
272 # check that splice site scores are at dna positions as expected by
273 # the dynamic programming component
274 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
275 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
276 or dna[d_pos+1] == 't'), pdb.set_trace()
277
278 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
279 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
280
281 ps = h.convert2SWIG()
282
283 prb = QPalmaDP.createDoubleArrayFromList(quality)
284 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
285
286 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
287 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
288
289 d_len = len(donor)
290 donor = QPalmaDP.createDoubleArrayFromList(donor)
291 a_len = len(acceptor)
292 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
293
294 # Create the alignment object representing the interface to the C/C++ code.
295 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
296
297 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
298 #print 'Calling myalign...'
299 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
300 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
301 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
302 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
303 print_matrix)
304
305 #print 'After calling myalign...'
306 #print 'Calling getAlignmentResults...'
307
308 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
309 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
310 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
311 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
312
313 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
314
315 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
316 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
317
318 #print 'After calling getAlignmentResults...'
319
320 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
321 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
322 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
323 newDPScores = zeros((num_path[exampleIdx],1))
324 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
325
326 #print 'newSpliceAlign'
327 for i in range(dna_len*num_path[exampleIdx]):
328 newSpliceAlign[i] = c_SpliceAlign[i]
329 # print '%f' % (spliceAlign[i])
330
331 #print 'newEstAlign'
332 for i in range(est_len*num_path[exampleIdx]):
333 newEstAlign[i] = c_EstAlign[i]
334 # print '%f' % (spliceAlign[i])
335
336 #print 'weightMatch'
337 for i in range(mm_len*num_path[exampleIdx]):
338 newWeightMatch[i] = c_WeightMatch[i]
339 # print '%f' % (weightMatch[i])
340
341 #print 'ViterbiScores'
342 for i in range(num_path[exampleIdx]):
343 newDPScores[i] = c_DPScores[i]
344
345 if self.use_quality_scores:
346 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
347 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
348
349 # equals palma up to here
350
351 #print "Calling destructors"
352 del c_SpliceAlign
353 del c_EstAlign
354 del c_WeightMatch
355 del c_DPScores
356 del c_qualityPlifsFeatures
357 del currentAlignment
358
359 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
360 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
361 # Calculate weights of the respective alignments Note that we are
362 # calculating n-best alignments without hamming loss, so we
363 # have to keep track which of the n-best alignments correspond to
364 # the true one in order not to incorporate a true alignment in the
365 # constraints. To keep track of the true and false alignments we
366 # define an array true_map with a boolean indicating the
367 # equivalence to the true alignment for each decoded alignment.
368 true_map = [0]*(num_path[exampleIdx]+1)
369 true_map[0] = 1
370 path_loss = [0]*(num_path[exampleIdx])
371
372 for pathNr in range(num_path[exampleIdx]):
373 #print 'decodedWeights'
374 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
375 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
376 acc_supp)
377
378 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
379 for qidx in range(Conf.totalQualSuppPoints):
380 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
381
382 #pdb.set_trace()
383
384 path_loss[pathNr] = 0
385 # sum up positionwise loss between alignments
386 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
387 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
388 path_loss[pathNr] += 1
389
390 #pdb.set_trace()
391
392 # Gewichte in restliche Zeilen der Matrix speichern
393 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
394 allWeights[:,pathNr+1] = wp
395
396 #pdb.set_trace()
397
398 hpen = mat(h.penalties).reshape(len(h.penalties),1)
399 dpen = mat(d.penalties).reshape(len(d.penalties),1)
400 apen = mat(a.penalties).reshape(len(a.penalties),1)
401 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
402
403 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
404
405 # Check wether scalar product + loss equals viterbi score
406
407 distinct_scores = False
408 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
409 distinct_scores = True
410
411 # Check wether scalar product + loss equals viterbi score
412 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
413 pdb.set_trace()
414
415 # if the pathNr-best alignment is very close to the true alignment consider it as true
416 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
417 true_map[pathNr+1] = 1
418
419 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
420
421 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
422 # this means that all n-best paths are to close to each other
423 # we have to extend the n-best search to a (n+1)-best
424 if len([elem for elem in true_map if elem == 1]) == len(true_map):
425 num_path[exampleIdx] = num_path[exampleIdx]+1
426
427 # Choose true and first false alignment for extending
428 firstFalseIdx = -1
429 for map_idx,elem in enumerate(true_map):
430 if elem == 0:
431 firstFalseIdx = map_idx
432 break
433
434 # if there is at least one useful false alignment add the
435 # corresponding constraints to the optimization problem
436 if firstFalseIdx != -1:
437 trueWeights = allWeights[:,0]
438 firstFalseWeights = allWeights[:,firstFalseIdx]
439 differenceVector = trueWeights - firstFalseWeights
440 #pdb.set_trace()
441
442 if Conf.USE_OPT:
443 const_added = solver.addConstraint(differenceVector, exampleIdx)
444 const_added_ctr += 1
445 #
446 # end of one example processing
447 #
448
449 # call solver every nth example //added constraint
450 if exampleIdx != 0 and exampleIdx % Conf.numConstraintsPerRound == 0 and Conf.USE_OPT:
451 objValue,w,self.slacks = solver.solve()
452
453 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
454 self.noImprovementCtr += 1
455
456 if self.noImprovementCtr == numExamples+1:
457 break
458
459 self.oldObjValue = objValue
460 print "objValue is %f" % objValue
461
462 sum_xis = 0
463 for elem in self.slacks:
464 sum_xis += elem
465
466 print 'sum of slacks is %f'% sum_xis
467
468 for i in range(len(param)):
469 param[i] = w[i]
470
471 #pdb.set_trace()
472 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
473 param_idx += 1
474 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
475
476 #
477 # end of one iteration through all examples
478 #
479 if self.noImprovementCtr == numExamples+1:
480 break
481
482 iteration_nr += 1
483
484 #
485 # end of optimization
486 #
487 print 'Training completed'
488
489 pa = para()
490 pa.h = h
491 pa.d = d
492 pa.a = a
493 pa.mmatrix = mmatrix
494 pa.qualityPlifs = qualityPlifs
495
496 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
497 #cPickle.dump(pa,open('elegans.param','w+'))
498 self.logfh.close()
499
500 def evaluate(self,param_filename):
501 beg = Conf.prediction_begin
502 end = Conf.prediction_end
503
504 # predict on training set
505 print '##### Prediction on the training set #####'
506 self.predict(param_filename,0,beg)
507
508 # predict on test set
509 print '##### Prediction on the test set #####'
510 self.predict(param_filename,beg,end)
511
512 pdb.set_trace()
513
514 def predict(self,param_filename,beg,end):
515 self.logfh = open('_qpalma_predict.log','w+')
516
517 Sequences = self.Sequences[beg:end]
518 Exons = self.Exons[beg:end]
519 Ests = self.Ests[beg:end]
520 Qualities = self.Qualities[beg:end]
521 Acceptors = self.Acceptors[beg:end]
522 Donors = self.Donors[beg:end]
523 SplitPos = self.SplitPos[beg:end]
524
525 # number of training instances
526 N = numExamples = len(Sequences)
527 assert len(Exons) == N and len(Ests) == N\
528 and len(Qualities) == N and len(Acceptors) == N\
529 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
530 self.plog('Number of training examples: %d\n'% numExamples)
531
532 self.noImprovementCtr = 0
533 self.oldObjValue = 1e8
534
535 remove_duplicate_scores = Conf.remove_duplicate_scores
536 print_matrix = Conf.print_matrix
537 anzpath = Conf.anzpath
538
539 param = cPickle.load(open(param_filename))
540
541 # Set the parameters such as limits penalties for the Plifs
542 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
543
544 #############################################################################################
545 # Prediction
546 #############################################################################################
547 self.plog('Starting prediction...\n')
548
549 donSP = Conf.numDonSuppPoints
550 accSP = Conf.numAccSuppPoints
551 lengthSP = Conf.numLengthSuppPoints
552 mmatrixSP = Conf.sizeMatchmatrix[0]\
553 *Conf.sizeMatchmatrix[1]
554 numq = Conf.numQualSuppPoints
555 totalQualSP = Conf.totalQualSuppPoints
556
557 currentPhi = zeros((Conf.numFeatures,1))
558 totalQualityPenalties = zeros((totalQualSP,1))
559
560 exon1Begin = []
561 exon1End = []
562 exon2Begin = []
563 exon2End = []
564 allWrongExons = []
565
566 for exampleIdx in range(numExamples):
567 dna = Sequences[exampleIdx]
568 est = Ests[exampleIdx]
569 currentSplitPos = SplitPos[exampleIdx]
570
571 if Conf.mode == 'normal':
572 quality = [40]*len(est)
573
574 if Conf.mode == 'using_quality_scores':
575 quality = Qualities[exampleIdx]
576
577 exons = Exons[exampleIdx]
578 # NoiseMatrix = Noises[exampleIdx]
579 don_supp = Donors[exampleIdx]
580 acc_supp = Acceptors[exampleIdx]
581
582 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
583 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
584
585 # Calculate the weights
586 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
587 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
588
589 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
590 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
591 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
592 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
593
594 if Conf.mode == 'using_quality_scores':
595 totalQualityPenalties = param[-totalQualSP:]
596 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
597
598 # Calculate w'phi(x,y) the total score of the alignment
599 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
600
601 # The allWeights vector is supposed to store the weight parameter
602 # of the true alignment as well as the weight parameters of the
603 # 1 other alignments
604 allWeights = zeros((Conf.numFeatures,1+1))
605 allWeights[:,0] = trueWeight[:,0]
606
607 AlignmentScores = [0.0]*(1+1)
608 AlignmentScores[0] = trueAlignmentScore
609
610 ################## Calculate wrong alignment(s) ######################
611
612 # Compute donor, acceptor with penalty_lookup_new
613 # returns two double lists
614 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
615
616 #myalign wants the acceptor site on the g of the ag
617 acceptor = acceptor[1:]
618 acceptor.append(-inf)
619
620 dna = str(dna)
621 est = str(est)
622 dna_len = len(dna)
623 est_len = len(est)
624
625 ps = h.convert2SWIG()
626
627 prb = QPalmaDP.createDoubleArrayFromList(quality)
628 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
629
630 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
631 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
632
633 d_len = len(donor)
634 donor = QPalmaDP.createDoubleArrayFromList(donor)
635 a_len = len(acceptor)
636 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
637
638 # Create the alignment object representing the interface to the C/C++ code.
639 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
640
641 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
642
643 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
644 currentAlignment.myalign(1, dna, dna_len,\
645 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
646 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
647 print_matrix)
648
649 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
650 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
651 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
652 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
653
654 result_len = currentAlignment.getResultLength()
655 c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
656 c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
657
658 currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
659
660 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
661
662 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
663 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
664
665 newSpliceAlign = zeros((dna_len,1))
666 newEstAlign = zeros((est_len,1))
667 newWeightMatch = zeros((mm_len,1))
668 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
669
670 for i in range(dna_len):
671 newSpliceAlign[i] = c_SpliceAlign[i]
672
673 for i in range(est_len):
674 newEstAlign[i] = c_EstAlign[i]
675
676 for i in range(mm_len):
677 newWeightMatch[i] = c_WeightMatch[i]
678
679 newDPScores = c_DPScores[0]
680
681 for i in range(Conf.totalQualSuppPoints):
682 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
683
684 del c_SpliceAlign
685 del c_EstAlign
686 del c_WeightMatch
687 del c_DPScores
688 del c_qualityPlifsFeatures
689 del currentAlignment
690
691 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
692 newWeightMatch = newWeightMatch.reshape(1,mm_len)
693 true_map = [0]*2
694 true_map[0] = 1
695 pathNr = 0
696
697 dna_array = [0.0]*result_len
698 est_array = [0.0]*result_len
699
700 for r_idx in range(result_len):
701 dna_array[r_idx] = c_dna_array[r_idx]
702 est_array[r_idx] = c_est_array[r_idx]
703
704 _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
705 _newEstAlign = newEstAlign.flatten().tolist()[0]
706
707 #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
708 #self.plog(line1+'\n')
709 #self.plog(line2+'\n')
710 #self.plog(line3+'\n')
711
712 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
713
714 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
715 for qidx in range(Conf.totalQualSuppPoints):
716 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
717
718 # Gewichte in restliche Zeilen der Matrix speichern
719 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
720 allWeights[:,pathNr+1] = wp
721
722 hpen = mat(h.penalties).reshape(len(h.penalties),1)
723 dpen = mat(d.penalties).reshape(len(d.penalties),1)
724 apen = mat(a.penalties).reshape(len(a.penalties),1)
725 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
726 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
727
728 # if the pathNr-best alignment is very close to the true alignment consider it as true
729 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
730 true_map[pathNr+1] = 1
731
732 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
733
734 if e1_b_off != None:
735 exon1Begin.append(e1_b_off)
736 exon1End.append(e1_e_off)
737 exon2Begin.append(e2_b_off)
738 exon2End.append(e2_e_off)
739 else:
740 allWrongExons.append((newExons,exons))
741
742
743 logfile = self.logfh
744
745 if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
746 print >> logfile, 'example %d correct' % exampleIdx
747 else:
748 print >> logfile, 'example %d wrong' % exampleIdx
749
750 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
751 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
752
753 all_pos_correct = 0
754 for idx in range(len(exon1Begin)):
755 if exon1Begin[idx] == 0 and exon1End[idx] == 0\
756 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
757 all_pos_correct += 1
758
759 logfile = self.logfh
760 print >> logfile, 'Total num. of examples: %d' % numExamples
761 print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
762 print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
763 print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
764 print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
765
766 print 'Total num. of examples: %d' % numExamples
767 print 'Number of total correct examples: %d' % all_pos_correct
768 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
769 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
770 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
771
772 print 'Prediction completed'
773 self.logfh.close()
774
775 def evaluatePositions(self,eBegin,eEnd):
776
777 eBegin_pos = [elem for elem in eBegin if elem == 0]
778 eBegin_neg = [elem for elem in eBegin if elem != 0]
779 eEnd_pos = [elem for elem in eEnd if elem == 0]
780 eEnd_neg = [elem for elem in eEnd if elem != 0]
781
782 mean_eBegin_neg = 0
783 for idx in range(len(eBegin_neg)):
784 mean_eBegin_neg += eBegin_neg[idx]
785
786 try:
787 mean_eBegin_neg /= 1.0*len(eBegin_neg)
788 except:
789 mean_eBegin_neg = -1
790
791 mean_eEnd_neg = 0
792 for idx in range(len(eEnd_neg)):
793 mean_eEnd_neg += eEnd_neg[idx]
794
795 try:
796 mean_eEnd_neg /= 1.0*len(eEnd_neg)
797 except:
798 mean_eEnd_neg = -1
799
800 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
801
802 def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
803 newExons = []
804 oldElem = -1
805 SpliceAlign = SpliceAlign.flatten().tolist()[0]
806 SpliceAlign.append(-1)
807 for pos,elem in enumerate(SpliceAlign):
808 if pos == 0:
809 oldElem = -1
810 else:
811 oldElem = SpliceAlign[pos-1]
812
813 if oldElem != 0 and elem == 0: # start of exon
814 newExons.append(pos)
815
816 if oldElem == 0 and elem != 0: # end of exon
817 newExons.append(pos)
818
819 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
820
821 self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
822 self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
823
824 if len(newExons) == 4:
825 e1_begin,e1_end = newExons[0],newExons[1]
826 e2_begin,e2_end = newExons[2],newExons[3]
827 else:
828 return None,None,None,None,newExons
829
830 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
831 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
832
833 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
834 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
835
836 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
837
838 def fetch_genome_info(ginfo_filename):
839 if not os.path.exists(ginfo_filename):
840 cmd = ['']*4
841 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
842 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
843 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
844 cmd[3] = 'save genome_info.mat genome_info'
845 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
846
847 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
848 out,err = obj.communicate()
849 assert err == '', 'An error occured!\n%s'%err
850
851 ginfo = scipy.io.loadmat('genome_info.mat')
852 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
853 return ginfo['genome_info']
854
855 else:
856 return cPickle.load(open(ginfo_filename))
857
858
859
860 def calcStat(Acceptor,Donor,Exons):
861 maxAcc = -100
862 minAcc = 100
863 maxDon = -100
864 minDon = 100
865
866 acc_vec_pos = []
867 acc_vec_neg = []
868 don_vec_pos = []
869 don_vec_neg = []
870
871 for jdx in range(len(Acceptors)):
872 currentExons = Exons[jdx]
873 currentAcceptor = Acceptors[jdx]
874 currentAcceptor = currentAcceptor[1:]
875 currentAcceptor.append(-inf)
876 currentDonor = Donors[jdx]
877
878 for idx,elem in enumerate(currentAcceptor):
879 if idx == (int(currentExons[1,0])-1): # acceptor site
880 acc_vec_pos.append(elem)
881 else:
882 acc_vec_neg.append(elem)
883
884 for idx,elem in enumerate(currentDonor):
885 if idx == (int(currentExons[0,1])): # donor site
886 don_vec_pos.append(elem)
887 else:
888 don_vec_neg.append(elem)
889
890 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
891 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
892 don_pos = [elem for elem in don_vec_pos if elem != -inf]
893 don_neg = [elem for elem in don_vec_neg if elem != -inf]
894
895 pdb.set_trace()
896
897 for idx in range(len(Sequences)):
898 acc = [elem for elem in Acceptors[idx] if elem != -inf]
899 maxAcc = max(max(acc),maxAcc)
900 minAcc = min(min(acc),minAcc)
901
902 don = [elem for elem in Donors[idx] if elem != -inf]
903 maxDon = max(max(don),maxDon)
904 minDon = min(min(don),minDon)
905
906 def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
907 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
908 pu.get_splice_info(_newSpliceAlign,_newEstAlign)
909
910 t_strand = '+'
911 translation = '-ACGTN_' #how aligned est and dna sequence is displayed
912 #(gap_char, 5 nucleotides, intron_char)
913 comparison_char = '| ' #first: match_char, second: intron_char
914
915 (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
916 pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
917
918 first_identity = None
919 last_identity = None
920 for i in range(len(comparison)):
921 if comparison[i] == '|' and first_identity is None:
922 first_identity = i
923 if comparison[-i] == '|' and last_identity is None:
924 last_identity = len(comparison) - i - 1
925
926 try:
927 for idx in range(len(dna_array)):
928 dna_array[idx] = translation[int(dna_array[idx])]
929 est_array[idx] = translation[int(est_array[idx])]
930 except:
931 pdb.set_trace()
932
933 line1 = "".join(dna_array)
934 line2 = comparison
935 line3 = "".join(est_array)
936
937 return line1,line2,line3
938
939 #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
940
941 #tStart = tIDX[first_identity]
942 #tStarts[0] = tIDX[first_identity]
943 #tEnd = tIDX[last_identity]
944 #tEnds[-1] = tIDX[last_identity]
945 #qStart = qIDX[first_identity]
946 #qStarts[0] = qIDX[first_identity]
947 #qEnd = qIDX[last_identity]
948 #qEnds[-1] = qIDX[last_identity]
949
950 #align_info.qStart = qStart
951 #align_info.qEnd = qEnd
952 #align_info.tStart = tStart
953 #align_info.tEnd = tEnd
954 #align_info.num_exons = num_exons
955 #align_info.qExonSizes = qExonSizes
956 #align_info.qStarts = qStarts
957 #align_info.qEnds = qEnds
958 #align_info.tExonSizes =tExonSizes
959 #align_info.tStarts = tStarts
960 #align_info.tEnds = tEnds
961 #align_info.exon_length = exon_length
962 #align_info.identity = identity
963 #align_info.ssQuality = ssQuality
964 #align_info.exonQuality = exonQuality
965 #align_info.comparison = comparison
966 #align_info.qIDX = qIDX
967 #align_info.tIDX = tIDX
968 #align_info.dna_letters = dna_letters
969 #align_info.est_letters = est_letters
970
971
972 if __name__ == '__main__':
973 qpalma = QPalma()
974
975 if len(sys.argv) == 2:
976 mode = sys.argv[1]
977 assert mode == 'train'
978 qpalma.train()
979
980 elif len(sys.argv) == 3:
981 mode = sys.argv[1]
982 param_filename = sys.argv[2]
983 assert mode == 'predict'
984 assert os.path.exists(param_filename)
985 qpalma.evaluate(param_filename)
986 else:
987 print 'You have to choose between training or prediction mode:'
988 print 'python qpalma. py (train|predict) <param_file>'