+ extended evaluation function to return positionwise deviations
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 class para:
44 pass
45
46 class QPalma:
47 """
48 A training method for the QPalma project
49 """
50
51 def __init__(self):
52 self.ARGS = Param()
53
54 #gen_file= '%s/genome.config' % self.ARGS.basedir
55 #ginfo_filename = 'genome_info.pickle'
56 #self.genome_info = fetch_genome_info(ginfo_filename)
57 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
58 #self.ARGS.train_with_splicesitescoreinformation = False
59
60 # Load the whole dataset
61 if Conf.mode == 'normal':
62 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
63 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
64 self.use_quality_scores = False
65
66 elif Conf.mode == 'using_quality_scores':
67
68 filename = 'real_dset_%s'% 'recent'
69 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
70
71 #end = 1000
72 self.Sequences = Sequences
73 self.Exons = Exons
74 self.Ests = Ests
75 self.Qualities = Qualities
76 self.SplitPos = SplitPos
77
78 self.Donors, self.Acceptors = getDonAccScores(Sequences)
79
80 #self.Acceptors = self.Acceptors[:end]
81 #self.Donors = self.Donors[:end]
82
83 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
84 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
85 #Qualities = []
86 #for i in range(len(Ests)):
87 # Qualities.append([40]*len(Ests[i]))
88 self.use_quality_scores = True
89 else:
90 assert(False)
91
92 def plog(self,string):
93 self.logfh.write(string)
94 self.logfh.flush()
95
96 def train(self):
97 self.logfh = open('_qpalma_train.log','w+')
98
99 beg = Conf.training_begin
100 end = Conf.training_end
101
102 Sequences = self.Sequences[beg:end]
103 Exons = self.Exons[beg:end]
104 Ests = self.Ests[beg:end]
105 Qualities = self.Qualities[beg:end]
106 Acceptors = self.Acceptors[beg:end]
107 Donors = self.Donors[beg:end]
108
109 # number of training instances
110 N = numExamples = len(Sequences)
111 assert len(Exons) == N and len(Ests) == N\
112 and len(Qualities) == N and len(Acceptors) == N\
113 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
114 self.plog('Number of training examples: %d\n'% numExamples)
115
116 self.noImprovementCtr = 0
117 self.oldObjValue = 1e8
118
119 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
120 remove_duplicate_scores = Conf.remove_duplicate_scores
121 print_matrix = Conf.print_matrix
122 anzpath = Conf.anzpath
123
124 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
125 param = Conf.fixedParam
126
127 # Set the parameters such as limits penalties for the Plifs
128 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
129
130 # delete splicesite-score-information
131 #if not self.ARGS.train_with_splicesitescoreinformation:
132 # for i in range(len(Acceptors)):
133 # if Acceptors[i] > -20:
134 # Acceptors[i] = 1
135 # if Donors[i] >-20:
136 # Donors[i] = 1
137
138 # Initialize solver
139 if Conf.USE_OPT:
140 self.plog('Initializing problem...\n')
141 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
142
143 # stores the number of alignments done for each example (best path, second-best path etc.)
144 num_path = [anzpath]*numExamples
145 # stores the gap for each example
146 gap = [0.0]*numExamples
147
148 #############################################################################################
149 # Training
150 #############################################################################################
151 self.plog('Starting training...\n')
152
153 donSP = Conf.numDonSuppPoints
154 accSP = Conf.numAccSuppPoints
155 lengthSP = Conf.numLengthSuppPoints
156 mmatrixSP = Conf.sizeMatchmatrix[0]\
157 *Conf.sizeMatchmatrix[1]
158 numq = Conf.numQualSuppPoints
159 totalQualSP = Conf.totalQualSuppPoints
160
161 currentPhi = zeros((Conf.numFeatures,1))
162 totalQualityPenalties = zeros((totalQualSP,1))
163
164 iteration_nr = 0
165 param_idx = 0
166 const_added_ctr = 0
167 while True:
168 if iteration_nr == iteration_steps:
169 break
170
171 for exampleIdx in range(numExamples):
172 if (exampleIdx%100) == 0:
173 print 'Current example nr %d' % exampleIdx
174
175 dna = Sequences[exampleIdx]
176 est = Ests[exampleIdx]
177
178 if Conf.mode == 'normal':
179 quality = [40]*len(est)
180
181 if Conf.mode == 'using_quality_scores':
182 quality = Qualities[exampleIdx]
183
184 exons = Exons[exampleIdx]
185 # NoiseMatrix = Noises[exampleIdx]
186 don_supp = Donors[exampleIdx]
187 acc_supp = Acceptors[exampleIdx]
188
189 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
190 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
191
192 #print 'trueWeights'
193 # Calculate the weights
194 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
195 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
196
197 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
198 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
199 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
200 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
201
202 #pdb.set_trace()
203 if Conf.mode == 'using_quality_scores':
204 totalQualityPenalties = param[-totalQualSP:]
205 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
206
207 # Calculate w'phi(x,y) the total score of the alignment
208 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
209
210 # The allWeights vector is supposed to store the weight parameter
211 # of the true alignment as well as the weight parameters of the
212 # num_path[exampleIdx] other alignments
213 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
214 allWeights[:,0] = trueWeight[:,0]
215
216 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
217 AlignmentScores[0] = trueAlignmentScore
218
219 ################## Calculate wrong alignment(s) ######################
220
221 # Compute donor, acceptor with penalty_lookup_new
222 # returns two double lists
223 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
224
225 #myalign wants the acceptor site on the g of the ag
226 acceptor = acceptor[1:]
227 acceptor.append(-inf)
228
229 dna = str(dna)
230 est = str(est)
231 dna_len = len(dna)
232 est_len = len(est)
233
234 # check that splice site scores are at dna positions as expected by
235 # the dynamic programming component
236 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
237 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
238 or dna[d_pos+1] == 't'), pdb.set_trace()
239
240 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
241 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
242
243 ps = h.convert2SWIG()
244
245 prb = QPalmaDP.createDoubleArrayFromList(quality)
246 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
247
248 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
249 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
250
251 d_len = len(donor)
252 donor = QPalmaDP.createDoubleArrayFromList(donor)
253 a_len = len(acceptor)
254 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
255
256 # Create the alignment object representing the interface to the C/C++ code.
257 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
258
259 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
260 #print 'Calling myalign...'
261 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
262 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
263 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
264 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
265 print_matrix)
266
267 #print 'After calling myalign...'
268 #print 'Calling getAlignmentResults...'
269
270 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
271 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
272 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
273 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
274
275 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
276
277 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
278 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
279
280 #print 'After calling getAlignmentResults...'
281
282 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
283 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
284 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
285 newDPScores = zeros((num_path[exampleIdx],1))
286 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
287
288 #print 'newSpliceAlign'
289 for i in range(dna_len*num_path[exampleIdx]):
290 newSpliceAlign[i] = c_SpliceAlign[i]
291 # print '%f' % (spliceAlign[i])
292
293 #print 'newEstAlign'
294 for i in range(est_len*num_path[exampleIdx]):
295 newEstAlign[i] = c_EstAlign[i]
296 # print '%f' % (spliceAlign[i])
297
298 #print 'weightMatch'
299 for i in range(mm_len*num_path[exampleIdx]):
300 newWeightMatch[i] = c_WeightMatch[i]
301 # print '%f' % (weightMatch[i])
302
303 #print 'ViterbiScores'
304 for i in range(num_path[exampleIdx]):
305 newDPScores[i] = c_DPScores[i]
306
307
308 if self.use_quality_scores:
309 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
310 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
311
312 # equals palma up to here
313
314 #print "Calling destructors"
315 del c_SpliceAlign
316 del c_EstAlign
317 del c_WeightMatch
318 del c_DPScores
319 del c_qualityPlifsFeatures
320 del currentAlignment
321
322 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
323 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
324 # Calculate weights of the respective alignments Note that we are
325 # calculating n-best alignments without hamming loss, so we
326 # have to keep track which of the n-best alignments correspond to
327 # the true one in order not to incorporate a true alignment in the
328 # constraints. To keep track of the true and false alignments we
329 # define an array true_map with a boolean indicating the
330 # equivalence to the true alignment for each decoded alignment.
331 true_map = [0]*(num_path[exampleIdx]+1)
332 true_map[0] = 1
333 path_loss = [0]*(num_path[exampleIdx])
334
335 for pathNr in range(num_path[exampleIdx]):
336 #print 'decodedWeights'
337 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
338 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
339 acc_supp)
340
341 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
342 for qidx in range(Conf.totalQualSuppPoints):
343 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
344
345 #pdb.set_trace()
346
347 path_loss[pathNr] = 0
348 # sum up positionwise loss between alignments
349 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
350 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
351 path_loss[pathNr] += 1
352
353 #pdb.set_trace()
354
355 # Gewichte in restliche Zeilen der Matrix speichern
356 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
357 allWeights[:,pathNr+1] = wp
358
359 hpen = mat(h.penalties).reshape(len(h.penalties),1)
360 dpen = mat(d.penalties).reshape(len(d.penalties),1)
361 apen = mat(a.penalties).reshape(len(a.penalties),1)
362 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
363
364 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
365
366 # Check wether scalar product + loss equals viterbi score
367 #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
368
369 distinct_scores = False
370 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
371 distinct_scores = True
372
373 #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
374 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
375 pdb.set_trace()
376
377 # # if the pathNr-best alignment is very close to the true alignment consider it as true
378 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
379 true_map[pathNr+1] = 1
380
381 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
382
383 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
384 # this means that all n-best paths are to close to each other
385 # we have to extend the n-best search to a (n+1)-best
386 if len([elem for elem in true_map if elem == 1]) == len(true_map):
387 num_path[exampleIdx] = num_path[exampleIdx]+1
388
389 # Choose true and first false alignment for extending A
390 firstFalseIdx = -1
391 for map_idx,elem in enumerate(true_map):
392 if elem == 0:
393 firstFalseIdx = map_idx
394 break
395
396 # if there is at least one useful false alignment add the
397 # corresponding constraints to the optimization problem
398 if firstFalseIdx != -1:
399 trueWeights = allWeights[:,0]
400 firstFalseWeights = allWeights[:,firstFalseIdx]
401 differenceVector = trueWeights - firstFalseWeights
402 #pdb.set_trace()
403
404 if Conf.USE_OPT:
405 const_added = solver.addConstraint(differenceVector, exampleIdx)
406 const_added_ctr += 1
407 #
408 # end of one example processing
409 #
410
411 # call solver every nth example //added constraint
412 if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
413 objValue,w,self.slacks = solver.solve()
414
415 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
416 self.noImprovementCtr += 1
417
418 if self.noImprovementCtr == numExamples+1:
419 break
420
421 self.oldObjValue = objValue
422 print "objValue is %f" % objValue
423
424 sum_xis = 0
425 for elem in self.slacks:
426 sum_xis += elem
427
428 for i in range(len(param)):
429 param[i] = w[i]
430
431 #pdb.set_trace()
432 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
433 param_idx += 1
434 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
435
436 #
437 # end of one iteration through all examples
438 #
439 if self.noImprovementCtr == numExamples+1:
440 break
441
442 iteration_nr += 1
443
444 #
445 # end of optimization
446 #
447 print 'Training completed'
448
449 pa = para()
450 pa.h = h
451 pa.d = d
452 pa.a = a
453 pa.mmatrix = mmatrix
454 pa.qualityPlifs = qualityPlifs
455
456 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
457 #cPickle.dump(pa,open('elegans.param','w+'))
458 self.logfh.close()
459
460 def evaluate(self,param_filename):
461 beg = Conf.prediction_begin
462 end = Conf.prediction_end
463
464 # predict on training set
465 print '##### Prediction on the training set #####'
466 self.predict(param_filename,0,beg)
467
468 # predict on test set
469 print '##### Prediction on the test set #####'
470 self.predict(param_filename,beg,end)
471
472 pdb.set_trace()
473
474 def predict(self,param_filename,beg,end):
475
476 self.logfh = open('_qpalma_predict.log','w+')
477
478 Sequences = self.Sequences[beg:end]
479 Exons = self.Exons[beg:end]
480 Ests = self.Ests[beg:end]
481 Qualities = self.Qualities[beg:end]
482 Acceptors = self.Acceptors[beg:end]
483 Donors = self.Donors[beg:end]
484 SplitPos = self.SplitPos[beg:end]
485
486 # number of training instances
487 N = numExamples = len(Sequences)
488 assert len(Exons) == N and len(Ests) == N\
489 and len(Qualities) == N and len(Acceptors) == N\
490 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
491 self.plog('Number of training examples: %d\n'% numExamples)
492
493 self.noImprovementCtr = 0
494 self.oldObjValue = 1e8
495
496 remove_duplicate_scores = Conf.remove_duplicate_scores
497 print_matrix = Conf.print_matrix
498 anzpath = Conf.anzpath
499
500 #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
501 param = load_param(param_filename)
502
503 # Set the parameters such as limits penalties for the Plifs
504 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
505
506 #############################################################################################
507 # Prediction
508 #############################################################################################
509 self.plog('Starting prediction...\n')
510
511 donSP = Conf.numDonSuppPoints
512 accSP = Conf.numAccSuppPoints
513 lengthSP = Conf.numLengthSuppPoints
514 mmatrixSP = Conf.sizeMatchmatrix[0]\
515 *Conf.sizeMatchmatrix[1]
516 numq = Conf.numQualSuppPoints
517 totalQualSP = Conf.totalQualSuppPoints
518
519 currentPhi = zeros((Conf.numFeatures,1))
520 totalQualityPenalties = zeros((totalQualSP,1))
521
522 exon1Begin = []
523 exon1End = []
524 exon2Begin = []
525 exon2End = []
526 allWrongExons = []
527
528 for exampleIdx in range(numExamples):
529 dna = Sequences[exampleIdx]
530 est = Ests[exampleIdx]
531 currentSplitPos = SplitPos[exampleIdx]
532
533 if Conf.mode == 'normal':
534 quality = [40]*len(est)
535
536 if Conf.mode == 'using_quality_scores':
537 quality = Qualities[exampleIdx]
538
539 exons = Exons[exampleIdx]
540 # NoiseMatrix = Noises[exampleIdx]
541 don_supp = Donors[exampleIdx]
542 acc_supp = Acceptors[exampleIdx]
543
544 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
545 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
546
547 # Calculate the weights
548 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
549 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
550
551 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
552 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
553 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
554 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
555
556 if Conf.mode == 'using_quality_scores':
557 totalQualityPenalties = param[-totalQualSP:]
558 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
559
560 # Calculate w'phi(x,y) the total score of the alignment
561 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
562
563 # The allWeights vector is supposed to store the weight parameter
564 # of the true alignment as well as the weight parameters of the
565 # 1 other alignments
566 allWeights = zeros((Conf.numFeatures,1+1))
567 allWeights[:,0] = trueWeight[:,0]
568
569 AlignmentScores = [0.0]*(1+1)
570 AlignmentScores[0] = trueAlignmentScore
571
572 ################## Calculate wrong alignment(s) ######################
573
574 # Compute donor, acceptor with penalty_lookup_new
575 # returns two double lists
576 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
577
578 #myalign wants the acceptor site on the g of the ag
579 acceptor = acceptor[1:]
580 acceptor.append(-inf)
581
582 dna = str(dna)
583 est = str(est)
584 dna_len = len(dna)
585 est_len = len(est)
586
587 ps = h.convert2SWIG()
588
589 prb = QPalmaDP.createDoubleArrayFromList(quality)
590 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
591
592 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
593 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
594
595 d_len = len(donor)
596 donor = QPalmaDP.createDoubleArrayFromList(donor)
597 a_len = len(acceptor)
598 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
599
600 # Create the alignment object representing the interface to the C/C++ code.
601 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
602
603 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
604
605 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
606 currentAlignment.myalign(1, dna, dna_len,\
607 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
608 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
609 print_matrix)
610
611 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
612 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
613 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
614 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
615
616 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
617
618 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
619 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
620
621 newSpliceAlign = zeros((dna_len,1))
622 newEstAlign = zeros((est_len,1))
623 newWeightMatch = zeros((mm_len,1))
624 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
625
626 for i in range(dna_len):
627 newSpliceAlign[i] = c_SpliceAlign[i]
628
629 for i in range(est_len):
630 newEstAlign[i] = c_EstAlign[i]
631
632 for i in range(mm_len):
633 newWeightMatch[i] = c_WeightMatch[i]
634
635 newDPScores = c_DPScores[0]
636
637 for i in range(Conf.totalQualSuppPoints):
638 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
639
640 del c_SpliceAlign
641 del c_EstAlign
642 del c_WeightMatch
643 del c_DPScores
644 del c_qualityPlifsFeatures
645 del currentAlignment
646
647 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
648 newWeightMatch = newWeightMatch.reshape(1,mm_len)
649 true_map = [0]*2
650 true_map[0] = 1
651 pathNr = 0
652
653 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
654
655 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
656 for qidx in range(Conf.totalQualSuppPoints):
657 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
658
659 # Gewichte in restliche Zeilen der Matrix speichern
660 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
661 allWeights[:,pathNr+1] = wp
662
663 hpen = mat(h.penalties).reshape(len(h.penalties),1)
664 dpen = mat(d.penalties).reshape(len(d.penalties),1)
665 apen = mat(a.penalties).reshape(len(a.penalties),1)
666 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
667 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
668
669 # if the pathNr-best alignment is very close to the true alignment consider it as true
670 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
671 true_map[pathNr+1] = 1
672
673 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
674
675 if e1_b_off != None:
676 exon1Begin.append(e1_b_off)
677 exon1End.append(e1_e_off)
678 exon2Begin.append(e2_b_off)
679 exon2End.append(e2_e_off)
680 else:
681 allWrongExons.append((newExons,exons))
682
683 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
684 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
685
686 print 'Total num. of Examples: %d' % numExamples
687 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
688 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
689 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
690
691 print 'Prediction completed'
692 self.logfh.close()
693
694 def evaluatePositions(self,eBegin,eEnd):
695 eBegin_pos = [elem for elem in eBegin if elem == 0]
696 eBegin_neg = [elem for elem in eBegin if elem != 0]
697 eEnd_pos = [elem for elem in eEnd if elem == 0]
698 eEnd_neg = [elem for elem in eEnd if elem != 0]
699
700 mean_eBegin_neg = 0
701 for idx in range(len(eBegin_neg)):
702 mean_eBegin_neg += eBegin_neg[idx]
703
704 mean_eBegin_neg /= 1.0*len(eBegin_neg)
705
706 mean_eEnd_neg = 0
707 for idx in range(len(eEnd_neg)):
708 mean_eEnd_neg += eEnd_neg[idx]
709
710 mean_eEnd_neg /= 1.0*len(eEnd_neg)
711
712 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
713
714
715 def fetch_genome_info(ginfo_filename):
716 if not os.path.exists(ginfo_filename):
717 cmd = ['']*4
718 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
719 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
720 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
721 cmd[3] = 'save genome_info.mat genome_info'
722 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
723
724 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
725 out,err = obj.communicate()
726 assert err == '', 'An error occured!\n%s'%err
727
728 ginfo = scipy.io.loadmat('genome_info.mat')
729 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
730 return ginfo['genome_info']
731
732 else:
733 return cPickle.load(open(ginfo_filename))
734
735 def plifs2param(h,d,a,mmatrix,qualityPlifs):
736 donSP = Conf.numDonSuppPoints
737 accSP = Conf.numAccSuppPoints
738 lengthSP = Conf.numLengthSuppPoints
739 mmatrixSP = Conf.sizeMatchmatrix[0]\
740 *Conf.sizeMatchmatrix[1]
741
742
743 param = zeros((Conf.numFeatures,1))
744 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
745 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
746 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
747 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
748
749 for idx in range(len(qualityPlifs)):
750 currentPlif = qualityPlifs[idx]
751 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
752 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
753 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
754
755 return param
756
757 def load_param(filename):
758 param = None
759 #try:
760 # para = cPickle.load(open(filename))
761 #except:
762 # print 'Error: Could not open parameter file!'
763 # sys.exit(1)
764 #
765 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
766
767 param = cPickle.load(open(filename))
768 return param
769
770 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
771 newExons = []
772 oldElem = -1
773 SpliceAlign = SpliceAlign.flatten().tolist()[0]
774 SpliceAlign.append(-1)
775 for pos,elem in enumerate(SpliceAlign):
776 if pos == 0:
777 oldElem = -1
778 else:
779 oldElem = SpliceAlign[pos-1]
780
781 if oldElem != 0 and elem == 0: # start of exon
782 newExons.append(pos)
783
784 if oldElem == 0 and elem != 0: # end of exon
785 newExons.append(pos)
786
787 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
788
789 if len(newExons) == 4:
790 e1_begin,e1_end = newExons[0],newExons[1]
791 e2_begin,e2_end = newExons[2],newExons[3]
792
793 #elif len(newExons) > 4 :
794 # e1_begin,e1_end = newExons[0],newExons[1]
795 # e2_begin,e2_end = newExons[-2],newExons[-1]
796 else:
797 return None,None,None,None,newExons
798
799 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
800 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
801
802 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
803 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
804
805 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
806
807 def calcStat(Acceptor,Donor,Exons):
808 maxAcc = -100
809 minAcc = 100
810 maxDon = -100
811 minDon = 100
812
813 acc_vec_pos = []
814 acc_vec_neg = []
815 don_vec_pos = []
816 don_vec_neg = []
817
818 for jdx in range(len(Acceptors)):
819 currentExons = Exons[jdx]
820 currentAcceptor = Acceptors[jdx]
821 currentAcceptor = currentAcceptor[1:]
822 currentAcceptor.append(-inf)
823 currentDonor = Donors[jdx]
824
825 for idx,elem in enumerate(currentAcceptor):
826 if idx == (int(currentExons[1,0])-1): # acceptor site
827 acc_vec_pos.append(elem)
828 else:
829 acc_vec_neg.append(elem)
830
831 for idx,elem in enumerate(currentDonor):
832 if idx == (int(currentExons[0,1])): # donor site
833 don_vec_pos.append(elem)
834 else:
835 don_vec_neg.append(elem)
836
837 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
838 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
839 don_pos = [elem for elem in don_vec_pos if elem != -inf]
840 don_neg = [elem for elem in don_vec_neg if elem != -inf]
841
842 pdb.set_trace()
843
844 for idx in range(len(Sequences)):
845 acc = [elem for elem in Acceptors[idx] if elem != -inf]
846 maxAcc = max(max(acc),maxAcc)
847 minAcc = min(min(acc),minAcc)
848
849 don = [elem for elem in Donors[idx] if elem != -inf]
850 maxDon = max(max(don),maxDon)
851 minDon = min(min(don),minDon)
852
853 if __name__ == '__main__':
854
855 qpalma = QPalma()
856
857 if len(sys.argv) == 2:
858 mode = sys.argv[1]
859 assert mode == 'train'
860 qpalma.train()
861
862 elif len(sys.argv) == 3:
863 mode = sys.argv[1]
864 param_filename = sys.argv[2]
865 assert mode == 'predict'
866 assert os.path.exists(param_filename)
867 qpalma.evaluate(param_filename)
868 else:
869 print 'You have to choose between training or prediction mode:'
870 print 'python qpalma. py (train|predict) <param_file>'