+ modified filtering of reads
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 class para:
44 pass
45
46 class QPalma:
47 """
48 A training method for the QPalma project
49 """
50
51 def __init__(self):
52 self.ARGS = Param()
53
54 #gen_file= '%s/genome.config' % self.ARGS.basedir
55 #ginfo_filename = 'genome_info.pickle'
56 #self.genome_info = fetch_genome_info(ginfo_filename)
57 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
58 #self.ARGS.train_with_splicesitescoreinformation = False
59
60 # Load the whole dataset
61 if Conf.mode == 'normal':
62 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
63 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
64 self.use_quality_scores = False
65
66 elif Conf.mode == 'using_quality_scores':
67
68 #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
69 data_filename = Conf.data_filename
70
71 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
72
73 #end = 1000
74 self.Sequences = Sequences
75 self.Exons = Exons
76 self.Ests = Ests
77 self.Qualities = Qualities
78 self.SplitPos = SplitPos
79 self.Donors = Donors
80 self.Acceptors = Acceptors
81
82 #self.Acceptors = self.Acceptors[:end]
83 #self.Donors = self.Donors[:end]
84
85 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
86 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
87 #Qualities = []
88 #for i in range(len(Ests)):
89 # Qualities.append([40]*len(Ests[i]))
90 self.use_quality_scores = True
91 else:
92 assert(False)
93
94 def plog(self,string):
95 self.logfh.write(string)
96 self.logfh.flush()
97
98 def train(self):
99 self.logfh = open('_qpalma_train.log','w+')
100
101 beg = Conf.training_begin
102 end = Conf.training_end
103
104 Sequences = self.Sequences[beg:end]
105 Exons = self.Exons[beg:end]
106 Ests = self.Ests[beg:end]
107 Qualities = self.Qualities[beg:end]
108 Acceptors = self.Acceptors[beg:end]
109 Donors = self.Donors[beg:end]
110
111 # number of training instances
112 N = numExamples = len(Sequences)
113 assert len(Exons) == N and len(Ests) == N\
114 and len(Qualities) == N and len(Acceptors) == N\
115 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
116 self.plog('Number of training examples: %d\n'% numExamples)
117
118 self.noImprovementCtr = 0
119 self.oldObjValue = 1e8
120
121 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
122 remove_duplicate_scores = Conf.remove_duplicate_scores
123 print_matrix = Conf.print_matrix
124 anzpath = Conf.anzpath
125
126 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
127 param = Conf.fixedParam
128
129 # Set the parameters such as limits penalties for the Plifs
130 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
131
132 # delete splicesite-score-information
133 #if not self.ARGS.train_with_splicesitescoreinformation:
134 # for i in range(len(Acceptors)):
135 # if Acceptors[i] > -20:
136 # Acceptors[i] = 1
137 # if Donors[i] >-20:
138 # Donors[i] = 1
139
140 # Initialize solver
141 if Conf.USE_OPT:
142 self.plog('Initializing problem...\n')
143 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
144
145 # stores the number of alignments done for each example (best path, second-best path etc.)
146 num_path = [anzpath]*numExamples
147 # stores the gap for each example
148 gap = [0.0]*numExamples
149
150 #############################################################################################
151 # Training
152 #############################################################################################
153 self.plog('Starting training...\n')
154
155 donSP = Conf.numDonSuppPoints
156 accSP = Conf.numAccSuppPoints
157 lengthSP = Conf.numLengthSuppPoints
158 mmatrixSP = Conf.sizeMatchmatrix[0]\
159 *Conf.sizeMatchmatrix[1]
160 numq = Conf.numQualSuppPoints
161 totalQualSP = Conf.totalQualSuppPoints
162
163 currentPhi = zeros((Conf.numFeatures,1))
164 totalQualityPenalties = zeros((totalQualSP,1))
165
166 iteration_nr = 0
167 param_idx = 0
168 const_added_ctr = 0
169 while True:
170 if iteration_nr == iteration_steps:
171 break
172
173 for exampleIdx in range(numExamples):
174 if (exampleIdx%100) == 0:
175 print 'Current example nr %d' % exampleIdx
176
177 dna = Sequences[exampleIdx]
178 est = Ests[exampleIdx]
179
180 if Conf.mode == 'normal':
181 quality = [40]*len(est)
182
183 if Conf.mode == 'using_quality_scores':
184 quality = Qualities[exampleIdx]
185
186 exons = Exons[exampleIdx]
187 # NoiseMatrix = Noises[exampleIdx]
188 don_supp = Donors[exampleIdx]
189 acc_supp = Acceptors[exampleIdx]
190
191 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
192 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
193
194 #print 'trueWeights'
195 # Calculate the weights
196 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
197 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
198
199 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
200 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
201 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
202 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
203
204 #pdb.set_trace()
205 if Conf.mode == 'using_quality_scores':
206 totalQualityPenalties = param[-totalQualSP:]
207 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
208
209 # Calculate w'phi(x,y) the total score of the alignment
210 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
211
212 # The allWeights vector is supposed to store the weight parameter
213 # of the true alignment as well as the weight parameters of the
214 # num_path[exampleIdx] other alignments
215 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
216 allWeights[:,0] = trueWeight[:,0]
217
218 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
219 AlignmentScores[0] = trueAlignmentScore
220
221 ################## Calculate wrong alignment(s) ######################
222
223 # Compute donor, acceptor with penalty_lookup_new
224 # returns two double lists
225 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
226
227 #myalign wants the acceptor site on the g of the ag
228 acceptor = acceptor[1:]
229 acceptor.append(-inf)
230
231 dna = str(dna)
232 est = str(est)
233 dna_len = len(dna)
234 est_len = len(est)
235
236 # check that splice site scores are at dna positions as expected by
237 # the dynamic programming component
238 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
239 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
240 or dna[d_pos+1] == 't'), pdb.set_trace()
241
242 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
243 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
244
245 ps = h.convert2SWIG()
246
247 prb = QPalmaDP.createDoubleArrayFromList(quality)
248 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
249
250 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
251 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
252
253 d_len = len(donor)
254 donor = QPalmaDP.createDoubleArrayFromList(donor)
255 a_len = len(acceptor)
256 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
257
258 # Create the alignment object representing the interface to the C/C++ code.
259 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
260
261 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
262 #print 'Calling myalign...'
263 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
264 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
265 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
266 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
267 print_matrix)
268
269 #print 'After calling myalign...'
270 #print 'Calling getAlignmentResults...'
271
272 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
273 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
274 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
275 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
276
277 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
278
279 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
280 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
281
282 #print 'After calling getAlignmentResults...'
283
284 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
285 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
286 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
287 newDPScores = zeros((num_path[exampleIdx],1))
288 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
289
290 #print 'newSpliceAlign'
291 for i in range(dna_len*num_path[exampleIdx]):
292 newSpliceAlign[i] = c_SpliceAlign[i]
293 # print '%f' % (spliceAlign[i])
294
295 #print 'newEstAlign'
296 for i in range(est_len*num_path[exampleIdx]):
297 newEstAlign[i] = c_EstAlign[i]
298 # print '%f' % (spliceAlign[i])
299
300 #print 'weightMatch'
301 for i in range(mm_len*num_path[exampleIdx]):
302 newWeightMatch[i] = c_WeightMatch[i]
303 # print '%f' % (weightMatch[i])
304
305 #print 'ViterbiScores'
306 for i in range(num_path[exampleIdx]):
307 newDPScores[i] = c_DPScores[i]
308
309
310 if self.use_quality_scores:
311 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
312 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
313
314 # equals palma up to here
315
316 #print "Calling destructors"
317 del c_SpliceAlign
318 del c_EstAlign
319 del c_WeightMatch
320 del c_DPScores
321 del c_qualityPlifsFeatures
322 del currentAlignment
323
324 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
325 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
326 # Calculate weights of the respective alignments Note that we are
327 # calculating n-best alignments without hamming loss, so we
328 # have to keep track which of the n-best alignments correspond to
329 # the true one in order not to incorporate a true alignment in the
330 # constraints. To keep track of the true and false alignments we
331 # define an array true_map with a boolean indicating the
332 # equivalence to the true alignment for each decoded alignment.
333 true_map = [0]*(num_path[exampleIdx]+1)
334 true_map[0] = 1
335 path_loss = [0]*(num_path[exampleIdx])
336
337 for pathNr in range(num_path[exampleIdx]):
338 #print 'decodedWeights'
339 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
340 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
341 acc_supp)
342
343 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
344 for qidx in range(Conf.totalQualSuppPoints):
345 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
346
347 #pdb.set_trace()
348
349 path_loss[pathNr] = 0
350 # sum up positionwise loss between alignments
351 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
352 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
353 path_loss[pathNr] += 1
354
355 #pdb.set_trace()
356
357 # Gewichte in restliche Zeilen der Matrix speichern
358 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
359 allWeights[:,pathNr+1] = wp
360
361 hpen = mat(h.penalties).reshape(len(h.penalties),1)
362 dpen = mat(d.penalties).reshape(len(d.penalties),1)
363 apen = mat(a.penalties).reshape(len(a.penalties),1)
364 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
365
366 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
367
368 # Check wether scalar product + loss equals viterbi score
369 #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
370
371 distinct_scores = False
372 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
373 distinct_scores = True
374
375 #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
376 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
377 pdb.set_trace()
378
379 # # if the pathNr-best alignment is very close to the true alignment consider it as true
380 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
381 true_map[pathNr+1] = 1
382
383 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
384
385 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
386 # this means that all n-best paths are to close to each other
387 # we have to extend the n-best search to a (n+1)-best
388 if len([elem for elem in true_map if elem == 1]) == len(true_map):
389 num_path[exampleIdx] = num_path[exampleIdx]+1
390
391 # Choose true and first false alignment for extending A
392 firstFalseIdx = -1
393 for map_idx,elem in enumerate(true_map):
394 if elem == 0:
395 firstFalseIdx = map_idx
396 break
397
398 # if there is at least one useful false alignment add the
399 # corresponding constraints to the optimization problem
400 if firstFalseIdx != -1:
401 trueWeights = allWeights[:,0]
402 firstFalseWeights = allWeights[:,firstFalseIdx]
403 differenceVector = trueWeights - firstFalseWeights
404 #pdb.set_trace()
405
406 if Conf.USE_OPT:
407 const_added = solver.addConstraint(differenceVector, exampleIdx)
408 const_added_ctr += 1
409 #
410 # end of one example processing
411 #
412
413 # call solver every nth example //added constraint
414 if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
415 objValue,w,self.slacks = solver.solve()
416
417 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
418 self.noImprovementCtr += 1
419
420 if self.noImprovementCtr == numExamples+1:
421 break
422
423 self.oldObjValue = objValue
424 print "objValue is %f" % objValue
425
426 sum_xis = 0
427 for elem in self.slacks:
428 sum_xis += elem
429
430 for i in range(len(param)):
431 param[i] = w[i]
432
433 #pdb.set_trace()
434 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
435 param_idx += 1
436 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
437
438 #
439 # end of one iteration through all examples
440 #
441 if self.noImprovementCtr == numExamples+1:
442 break
443
444 iteration_nr += 1
445
446 #
447 # end of optimization
448 #
449 print 'Training completed'
450
451 pa = para()
452 pa.h = h
453 pa.d = d
454 pa.a = a
455 pa.mmatrix = mmatrix
456 pa.qualityPlifs = qualityPlifs
457
458 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
459 #cPickle.dump(pa,open('elegans.param','w+'))
460 self.logfh.close()
461
462 def evaluate(self,param_filename):
463 beg = Conf.prediction_begin
464 end = Conf.prediction_end
465
466 # predict on training set
467 print '##### Prediction on the training set #####'
468 self.predict(param_filename,0,beg)
469
470 # predict on test set
471 print '##### Prediction on the test set #####'
472 self.predict(param_filename,beg,end)
473
474 pdb.set_trace()
475
476 def predict(self,param_filename,beg,end):
477
478 self.logfh = open('_qpalma_predict.log','w+')
479
480 Sequences = self.Sequences[beg:end]
481 Exons = self.Exons[beg:end]
482 Ests = self.Ests[beg:end]
483 Qualities = self.Qualities[beg:end]
484 Acceptors = self.Acceptors[beg:end]
485 Donors = self.Donors[beg:end]
486 SplitPos = self.SplitPos[beg:end]
487
488 # number of training instances
489 N = numExamples = len(Sequences)
490 assert len(Exons) == N and len(Ests) == N\
491 and len(Qualities) == N and len(Acceptors) == N\
492 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
493 self.plog('Number of training examples: %d\n'% numExamples)
494
495 self.noImprovementCtr = 0
496 self.oldObjValue = 1e8
497
498 remove_duplicate_scores = Conf.remove_duplicate_scores
499 print_matrix = Conf.print_matrix
500 anzpath = Conf.anzpath
501
502 #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
503 param = load_param(param_filename)
504
505 # Set the parameters such as limits penalties for the Plifs
506 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
507
508 #############################################################################################
509 # Prediction
510 #############################################################################################
511 self.plog('Starting prediction...\n')
512
513 donSP = Conf.numDonSuppPoints
514 accSP = Conf.numAccSuppPoints
515 lengthSP = Conf.numLengthSuppPoints
516 mmatrixSP = Conf.sizeMatchmatrix[0]\
517 *Conf.sizeMatchmatrix[1]
518 numq = Conf.numQualSuppPoints
519 totalQualSP = Conf.totalQualSuppPoints
520
521 currentPhi = zeros((Conf.numFeatures,1))
522 totalQualityPenalties = zeros((totalQualSP,1))
523
524 exon1Begin = []
525 exon1End = []
526 exon2Begin = []
527 exon2End = []
528 allWrongExons = []
529
530 for exampleIdx in range(numExamples):
531 dna = Sequences[exampleIdx]
532 est = Ests[exampleIdx]
533 currentSplitPos = SplitPos[exampleIdx]
534
535 if Conf.mode == 'normal':
536 quality = [40]*len(est)
537
538 if Conf.mode == 'using_quality_scores':
539 quality = Qualities[exampleIdx]
540
541 exons = Exons[exampleIdx]
542 # NoiseMatrix = Noises[exampleIdx]
543 don_supp = Donors[exampleIdx]
544 acc_supp = Acceptors[exampleIdx]
545
546 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
547 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
548
549 # Calculate the weights
550 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
551 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
552
553 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
554 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
555 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
556 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
557
558 if Conf.mode == 'using_quality_scores':
559 totalQualityPenalties = param[-totalQualSP:]
560 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
561
562 # Calculate w'phi(x,y) the total score of the alignment
563 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
564
565 # The allWeights vector is supposed to store the weight parameter
566 # of the true alignment as well as the weight parameters of the
567 # 1 other alignments
568 allWeights = zeros((Conf.numFeatures,1+1))
569 allWeights[:,0] = trueWeight[:,0]
570
571 AlignmentScores = [0.0]*(1+1)
572 AlignmentScores[0] = trueAlignmentScore
573
574 ################## Calculate wrong alignment(s) ######################
575
576 # Compute donor, acceptor with penalty_lookup_new
577 # returns two double lists
578 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
579
580 #myalign wants the acceptor site on the g of the ag
581 acceptor = acceptor[1:]
582 acceptor.append(-inf)
583
584 dna = str(dna)
585 est = str(est)
586 dna_len = len(dna)
587 est_len = len(est)
588
589 ps = h.convert2SWIG()
590
591 prb = QPalmaDP.createDoubleArrayFromList(quality)
592 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
593
594 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
595 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
596
597 d_len = len(donor)
598 donor = QPalmaDP.createDoubleArrayFromList(donor)
599 a_len = len(acceptor)
600 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
601
602 # Create the alignment object representing the interface to the C/C++ code.
603 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
604
605 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
606
607 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
608 currentAlignment.myalign(1, dna, dna_len,\
609 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
610 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
611 print_matrix)
612
613 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
614 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
615 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
616 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
617
618 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
619
620 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
621 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
622
623 newSpliceAlign = zeros((dna_len,1))
624 newEstAlign = zeros((est_len,1))
625 newWeightMatch = zeros((mm_len,1))
626 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
627
628 for i in range(dna_len):
629 newSpliceAlign[i] = c_SpliceAlign[i]
630
631 for i in range(est_len):
632 newEstAlign[i] = c_EstAlign[i]
633
634 for i in range(mm_len):
635 newWeightMatch[i] = c_WeightMatch[i]
636
637 newDPScores = c_DPScores[0]
638
639 for i in range(Conf.totalQualSuppPoints):
640 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
641
642 del c_SpliceAlign
643 del c_EstAlign
644 del c_WeightMatch
645 del c_DPScores
646 del c_qualityPlifsFeatures
647 del currentAlignment
648
649 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
650 newWeightMatch = newWeightMatch.reshape(1,mm_len)
651 true_map = [0]*2
652 true_map[0] = 1
653 pathNr = 0
654
655 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
656
657 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
658 for qidx in range(Conf.totalQualSuppPoints):
659 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
660
661 # Gewichte in restliche Zeilen der Matrix speichern
662 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
663 allWeights[:,pathNr+1] = wp
664
665 hpen = mat(h.penalties).reshape(len(h.penalties),1)
666 dpen = mat(d.penalties).reshape(len(d.penalties),1)
667 apen = mat(a.penalties).reshape(len(a.penalties),1)
668 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
669 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
670
671 # if the pathNr-best alignment is very close to the true alignment consider it as true
672 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
673 true_map[pathNr+1] = 1
674
675 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
676
677 if e1_b_off != None:
678 exon1Begin.append(e1_b_off)
679 exon1End.append(e1_e_off)
680 exon2Begin.append(e2_b_off)
681 exon2End.append(e2_e_off)
682 else:
683 allWrongExons.append((newExons,exons))
684
685 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
686 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
687
688 print 'Total num. of Examples: %d' % numExamples
689 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
690 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
691 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
692
693 print 'Prediction completed'
694 self.logfh.close()
695
696 def evaluatePositions(self,eBegin,eEnd):
697 eBegin_pos = [elem for elem in eBegin if elem == 0]
698 eBegin_neg = [elem for elem in eBegin if elem != 0]
699 eEnd_pos = [elem for elem in eEnd if elem == 0]
700 eEnd_neg = [elem for elem in eEnd if elem != 0]
701
702 mean_eBegin_neg = 0
703 for idx in range(len(eBegin_neg)):
704 mean_eBegin_neg += eBegin_neg[idx]
705
706 mean_eBegin_neg /= 1.0*len(eBegin_neg)
707
708 mean_eEnd_neg = 0
709 for idx in range(len(eEnd_neg)):
710 mean_eEnd_neg += eEnd_neg[idx]
711
712 mean_eEnd_neg /= 1.0*len(eEnd_neg)
713
714 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
715
716
717 def fetch_genome_info(ginfo_filename):
718 if not os.path.exists(ginfo_filename):
719 cmd = ['']*4
720 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
721 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
722 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
723 cmd[3] = 'save genome_info.mat genome_info'
724 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
725
726 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
727 out,err = obj.communicate()
728 assert err == '', 'An error occured!\n%s'%err
729
730 ginfo = scipy.io.loadmat('genome_info.mat')
731 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
732 return ginfo['genome_info']
733
734 else:
735 return cPickle.load(open(ginfo_filename))
736
737 def plifs2param(h,d,a,mmatrix,qualityPlifs):
738 donSP = Conf.numDonSuppPoints
739 accSP = Conf.numAccSuppPoints
740 lengthSP = Conf.numLengthSuppPoints
741 mmatrixSP = Conf.sizeMatchmatrix[0]\
742 *Conf.sizeMatchmatrix[1]
743
744
745 param = zeros((Conf.numFeatures,1))
746 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
747 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
748 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
749 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
750
751 for idx in range(len(qualityPlifs)):
752 currentPlif = qualityPlifs[idx]
753 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
754 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
755 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
756
757 return param
758
759 def load_param(filename):
760 param = None
761 #try:
762 # para = cPickle.load(open(filename))
763 #except:
764 # print 'Error: Could not open parameter file!'
765 # sys.exit(1)
766 #
767 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
768
769 param = cPickle.load(open(filename))
770 return param
771
772 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
773 newExons = []
774 oldElem = -1
775 SpliceAlign = SpliceAlign.flatten().tolist()[0]
776 SpliceAlign.append(-1)
777 for pos,elem in enumerate(SpliceAlign):
778 if pos == 0:
779 oldElem = -1
780 else:
781 oldElem = SpliceAlign[pos-1]
782
783 if oldElem != 0 and elem == 0: # start of exon
784 newExons.append(pos)
785
786 if oldElem == 0 and elem != 0: # end of exon
787 newExons.append(pos)
788
789 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
790
791 if len(newExons) == 4:
792 e1_begin,e1_end = newExons[0],newExons[1]
793 e2_begin,e2_end = newExons[2],newExons[3]
794
795 #elif len(newExons) > 4 :
796 # e1_begin,e1_end = newExons[0],newExons[1]
797 # e2_begin,e2_end = newExons[-2],newExons[-1]
798 else:
799 return None,None,None,None,newExons
800
801 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
802 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
803
804 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
805 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
806
807 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
808
809 def calcStat(Acceptor,Donor,Exons):
810 maxAcc = -100
811 minAcc = 100
812 maxDon = -100
813 minDon = 100
814
815 acc_vec_pos = []
816 acc_vec_neg = []
817 don_vec_pos = []
818 don_vec_neg = []
819
820 for jdx in range(len(Acceptors)):
821 currentExons = Exons[jdx]
822 currentAcceptor = Acceptors[jdx]
823 currentAcceptor = currentAcceptor[1:]
824 currentAcceptor.append(-inf)
825 currentDonor = Donors[jdx]
826
827 for idx,elem in enumerate(currentAcceptor):
828 if idx == (int(currentExons[1,0])-1): # acceptor site
829 acc_vec_pos.append(elem)
830 else:
831 acc_vec_neg.append(elem)
832
833 for idx,elem in enumerate(currentDonor):
834 if idx == (int(currentExons[0,1])): # donor site
835 don_vec_pos.append(elem)
836 else:
837 don_vec_neg.append(elem)
838
839 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
840 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
841 don_pos = [elem for elem in don_vec_pos if elem != -inf]
842 don_neg = [elem for elem in don_vec_neg if elem != -inf]
843
844 pdb.set_trace()
845
846 for idx in range(len(Sequences)):
847 acc = [elem for elem in Acceptors[idx] if elem != -inf]
848 maxAcc = max(max(acc),maxAcc)
849 minAcc = min(min(acc),minAcc)
850
851 don = [elem for elem in Donors[idx] if elem != -inf]
852 maxDon = max(max(don),maxDon)
853 minDon = min(min(don),minDon)
854
855 if __name__ == '__main__':
856
857 qpalma = QPalma()
858
859 if len(sys.argv) == 2:
860 mode = sys.argv[1]
861 assert mode == 'train'
862 qpalma.train()
863
864 elif len(sys.argv) == 3:
865 mode = sys.argv[1]
866 param_filename = sys.argv[2]
867 assert mode == 'predict'
868 assert os.path.exists(param_filename)
869 qpalma.evaluate(param_filename)
870 else:
871 print 'You have to choose between training or prediction mode:'
872 print 'python qpalma. py (train|predict) <param_file>'