+ extended dataset compilation function
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
# to be able to use Solexa reads together with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
# Empty container class: train() sticks the learned parameters (h, d, a,
# mmatrix, qualityPlifs) onto an instance of this as plain attributes so the
# whole bundle can be pickled in one go.
class para:
    pass
45
46 class QPalma:
47 """
48 A training method for the QPalma project
49 """
50
51 def __init__(self):
52 self.ARGS = Param()
53
54 #from compile_dataset import compile_d
55 #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
56
57 from compile_dataset import compile_d
58 compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
59
60 #gen_file= '%s/genome.config' % self.ARGS.basedir
61 #ginfo_filename = 'genome_info.pickle'
62 #self.genome_info = fetch_genome_info(ginfo_filename)
63 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
64 #self.ARGS.train_with_splicesitescoreinformation = False
65
66 # Load the whole dataset
67 if Conf.mode == 'normal':
68 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
69 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
70 self.use_quality_scores = False
71
72 elif Conf.mode == 'using_quality_scores':
73
74 #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
75 data_filename = Conf.dataset_fn
76
77 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
78
79 #end = 1000
80 self.Sequences = Sequences
81 self.Exons = Exons
82 self.Ests = Ests
83 self.Qualities = Qualities
84 self.SplitPos = SplitPos
85 self.Donors = Donors
86 self.Acceptors = Acceptors
87
88 min_score = 100
89 max_score = -100
90
91 for elem in self.Acceptors:
92 for score in elem:
93 if score != -inf:
94 min_score = min(min_score,score)
95 max_score = max(max_score,score)
96
97 print 'Acceptors max/min: %f / %f' % (max_score,min_score)
98
99 min_score = 100
100 max_score = -100
101
102 for elem in self.Donors:
103 for score in elem:
104 if score != -inf:
105 min_score = min(min_score,score)
106 max_score = max(max_score,score)
107
108 print 'Donor max/min: %f / %f' % (max_score,min_score)
109
110 min_score = 10000
111 max_score = 0
112 mean_intron_len = 0
113
114 for elem in self.Exons:
115 intron_len = int(elem[1,0] - elem[0,1])
116 mean_intron_len += intron_len
117 min_score = min(min_score,intron_len)
118 max_score = max(max_score,intron_len)
119
120 mean_intron_len /= 1.0*len(self.Exons)
121 print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
122
123
124 self.use_quality_scores = True
125 else:
126 assert(False)
127
128 def plog(self,string):
129 self.logfh.write(string)
130 self.logfh.flush()
131
132 def train(self):
133 self.logfh = open('_qpalma_train.log','w+')
134
135 beg = Conf.training_begin
136 end = Conf.training_end
137
138 Sequences = self.Sequences[beg:end]
139 Exons = self.Exons[beg:end]
140 Ests = self.Ests[beg:end]
141 Qualities = self.Qualities[beg:end]
142 Acceptors = self.Acceptors[beg:end]
143 Donors = self.Donors[beg:end]
144
145 # number of training instances
146 N = numExamples = len(Sequences)
147 assert len(Exons) == N and len(Ests) == N\
148 and len(Qualities) == N and len(Acceptors) == N\
149 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
150 self.plog('Number of training examples: %d\n'% numExamples)
151
152 self.noImprovementCtr = 0
153 self.oldObjValue = 1e8
154
155 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
156 remove_duplicate_scores = Conf.remove_duplicate_scores
157 print_matrix = Conf.print_matrix
158 anzpath = Conf.anzpath
159
160 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
161 param = Conf.fixedParam
162
163 # Set the parameters such as limits penalties for the Plifs
164 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
165
166 pdb.set_trace()
167
168 # Initialize solver
169 if Conf.USE_OPT:
170 self.plog('Initializing problem...\n')
171 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
172
173 # stores the number of alignments done for each example (best path, second-best path etc.)
174 num_path = [anzpath]*numExamples
175 # stores the gap for each example
176 gap = [0.0]*numExamples
177
178 #############################################################################################
179 # Training
180 #############################################################################################
181 self.plog('Starting training...\n')
182
183 donSP = Conf.numDonSuppPoints
184 accSP = Conf.numAccSuppPoints
185 lengthSP = Conf.numLengthSuppPoints
186 mmatrixSP = Conf.sizeMatchmatrix[0]\
187 *Conf.sizeMatchmatrix[1]
188 numq = Conf.numQualSuppPoints
189 totalQualSP = Conf.totalQualSuppPoints
190
191 currentPhi = zeros((Conf.numFeatures,1))
192 totalQualityPenalties = zeros((totalQualSP,1))
193
194 iteration_nr = 0
195 param_idx = 0
196 const_added_ctr = 0
197 while True:
198 if iteration_nr == iteration_steps:
199 break
200
201 for exampleIdx in range(numExamples):
202 if (exampleIdx%100) == 0:
203 print 'Current example nr %d' % exampleIdx
204
205 dna = Sequences[exampleIdx]
206 est = Ests[exampleIdx]
207
208 if Conf.mode == 'normal':
209 quality = [40]*len(est)
210
211 if Conf.mode == 'using_quality_scores':
212 quality = Qualities[exampleIdx]
213
214 exons = Exons[exampleIdx]
215 # NoiseMatrix = Noises[exampleIdx]
216 don_supp = Donors[exampleIdx]
217 acc_supp = Acceptors[exampleIdx]
218
219 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
220 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
221
222 # Calculate the weights
223 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
224 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
225
226 #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
227 #print idx_lst
228 #print [h.limits[p] for p in idx_lst]
229
230 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
231 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
232 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
233 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
234
235 if Conf.mode == 'using_quality_scores':
236 totalQualityPenalties = param[-totalQualSP:]
237 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
238
239 # Calculate w'phi(x,y) the total score of the alignment
240 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
241
242 # The allWeights vector is supposed to store the weight parameter
243 # of the true alignment as well as the weight parameters of the
244 # num_path[exampleIdx] other alignments
245 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
246 allWeights[:,0] = trueWeight[:,0]
247
248 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
249 AlignmentScores[0] = trueAlignmentScore
250
251 ################## Calculate wrong alignment(s) ######################
252
253 # Compute donor, acceptor with penalty_lookup_new
254 # returns two double lists
255 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
256
257 #myalign wants the acceptor site on the g of the ag
258 acceptor = acceptor[1:]
259 acceptor.append(-inf)
260
261 dna = str(dna)
262 est = str(est)
263 dna_len = len(dna)
264 est_len = len(est)
265
266 # check that splice site scores are at dna positions as expected by
267 # the dynamic programming component
268 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
269 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
270 or dna[d_pos+1] == 't'), pdb.set_trace()
271
272 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
273 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
274
275 ps = h.convert2SWIG()
276
277 prb = QPalmaDP.createDoubleArrayFromList(quality)
278 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
279
280 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
281 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
282
283 d_len = len(donor)
284 donor = QPalmaDP.createDoubleArrayFromList(donor)
285 a_len = len(acceptor)
286 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
287
288 # Create the alignment object representing the interface to the C/C++ code.
289 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
290
291 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
292 #print 'Calling myalign...'
293 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
294 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
295 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
296 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
297 print_matrix)
298
299 #print 'After calling myalign...'
300 #print 'Calling getAlignmentResults...'
301
302 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
303 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
304 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
305 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
306
307 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
308
309 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
310 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
311
312 #print 'After calling getAlignmentResults...'
313
314 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
315 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
316 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
317 newDPScores = zeros((num_path[exampleIdx],1))
318 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
319
320 #print 'newSpliceAlign'
321 for i in range(dna_len*num_path[exampleIdx]):
322 newSpliceAlign[i] = c_SpliceAlign[i]
323 # print '%f' % (spliceAlign[i])
324
325 #print 'newEstAlign'
326 for i in range(est_len*num_path[exampleIdx]):
327 newEstAlign[i] = c_EstAlign[i]
328 # print '%f' % (spliceAlign[i])
329
330 #print 'weightMatch'
331 for i in range(mm_len*num_path[exampleIdx]):
332 newWeightMatch[i] = c_WeightMatch[i]
333 # print '%f' % (weightMatch[i])
334
335 #print 'ViterbiScores'
336 for i in range(num_path[exampleIdx]):
337 newDPScores[i] = c_DPScores[i]
338
339 if self.use_quality_scores:
340 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
341 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
342
343 # equals palma up to here
344
345 #print "Calling destructors"
346 del c_SpliceAlign
347 del c_EstAlign
348 del c_WeightMatch
349 del c_DPScores
350 del c_qualityPlifsFeatures
351 del currentAlignment
352
353 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
354 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
355 # Calculate weights of the respective alignments Note that we are
356 # calculating n-best alignments without hamming loss, so we
357 # have to keep track which of the n-best alignments correspond to
358 # the true one in order not to incorporate a true alignment in the
359 # constraints. To keep track of the true and false alignments we
360 # define an array true_map with a boolean indicating the
361 # equivalence to the true alignment for each decoded alignment.
362 true_map = [0]*(num_path[exampleIdx]+1)
363 true_map[0] = 1
364 path_loss = [0]*(num_path[exampleIdx])
365
366 for pathNr in range(num_path[exampleIdx]):
367 #print 'decodedWeights'
368 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
369 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
370 acc_supp)
371
372 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
373 for qidx in range(Conf.totalQualSuppPoints):
374 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
375
376 #pdb.set_trace()
377
378 path_loss[pathNr] = 0
379 # sum up positionwise loss between alignments
380 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
381 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
382 path_loss[pathNr] += 1
383
384 #pdb.set_trace()
385
386 # Gewichte in restliche Zeilen der Matrix speichern
387 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
388 allWeights[:,pathNr+1] = wp
389
390 #pdb.set_trace()
391
392 hpen = mat(h.penalties).reshape(len(h.penalties),1)
393 dpen = mat(d.penalties).reshape(len(d.penalties),1)
394 apen = mat(a.penalties).reshape(len(a.penalties),1)
395 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
396
397 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
398
399 # Check wether scalar product + loss equals viterbi score
400 #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
401
402 distinct_scores = False
403 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
404 distinct_scores = True
405
406 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
407 pdb.set_trace()
408
409 # if the pathNr-best alignment is very close to the true alignment consider it as true
410 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
411 true_map[pathNr+1] = 1
412
413 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
414
415 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
416 # this means that all n-best paths are to close to each other
417 # we have to extend the n-best search to a (n+1)-best
418 if len([elem for elem in true_map if elem == 1]) == len(true_map):
419 num_path[exampleIdx] = num_path[exampleIdx]+1
420
421 # Choose true and first false alignment for extending
422 firstFalseIdx = -1
423 for map_idx,elem in enumerate(true_map):
424 if elem == 0:
425 firstFalseIdx = map_idx
426 break
427
428 # if there is at least one useful false alignment add the
429 # corresponding constraints to the optimization problem
430 if firstFalseIdx != -1:
431 trueWeights = allWeights[:,0]
432 firstFalseWeights = allWeights[:,firstFalseIdx]
433 differenceVector = trueWeights - firstFalseWeights
434 #pdb.set_trace()
435
436 if Conf.USE_OPT:
437 const_added = solver.addConstraint(differenceVector, exampleIdx)
438 const_added_ctr += 1
439 #
440 # end of one example processing
441 #
442
443 # call solver every nth example //added constraint
444 if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
445 objValue,w,self.slacks = solver.solve()
446
447 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
448 self.noImprovementCtr += 1
449
450 if self.noImprovementCtr == numExamples+1:
451 break
452
453 self.oldObjValue = objValue
454 print "objValue is %f" % objValue
455
456 sum_xis = 0
457 for elem in self.slacks:
458 sum_xis += elem
459
460 for i in range(len(param)):
461 param[i] = w[i]
462
463 #pdb.set_trace()
464 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
465 param_idx += 1
466 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
467
468 #
469 # end of one iteration through all examples
470 #
471 if self.noImprovementCtr == numExamples+1:
472 break
473
474 iteration_nr += 1
475
476 #
477 # end of optimization
478 #
479 print 'Training completed'
480
481 pa = para()
482 pa.h = h
483 pa.d = d
484 pa.a = a
485 pa.mmatrix = mmatrix
486 pa.qualityPlifs = qualityPlifs
487
488 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
489 #cPickle.dump(pa,open('elegans.param','w+'))
490 self.logfh.close()
491
492 def evaluate(self,param_filename):
493 beg = Conf.prediction_begin
494 end = Conf.prediction_end
495
496 # predict on training set
497 print '##### Prediction on the training set #####'
498 self.predict(param_filename,0,beg)
499
500 # predict on test set
501 print '##### Prediction on the test set #####'
502 self.predict(param_filename,beg,end)
503
504 pdb.set_trace()
505
506 def predict(self,param_filename,beg,end):
507 self.logfh = open('_qpalma_predict.log','w+')
508
509 Sequences = self.Sequences[beg:end]
510 Exons = self.Exons[beg:end]
511 Ests = self.Ests[beg:end]
512 Qualities = self.Qualities[beg:end]
513 Acceptors = self.Acceptors[beg:end]
514 Donors = self.Donors[beg:end]
515 SplitPos = self.SplitPos[beg:end]
516
517 # number of training instances
518 N = numExamples = len(Sequences)
519 assert len(Exons) == N and len(Ests) == N\
520 and len(Qualities) == N and len(Acceptors) == N\
521 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
522 self.plog('Number of training examples: %d\n'% numExamples)
523
524 self.noImprovementCtr = 0
525 self.oldObjValue = 1e8
526
527 remove_duplicate_scores = Conf.remove_duplicate_scores
528 print_matrix = Conf.print_matrix
529 anzpath = Conf.anzpath
530
531 param = cPickle.load(open(param_filename))
532
533 # Set the parameters such as limits penalties for the Plifs
534 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
535
536 #############################################################################################
537 # Prediction
538 #############################################################################################
539 self.plog('Starting prediction...\n')
540
541 donSP = Conf.numDonSuppPoints
542 accSP = Conf.numAccSuppPoints
543 lengthSP = Conf.numLengthSuppPoints
544 mmatrixSP = Conf.sizeMatchmatrix[0]\
545 *Conf.sizeMatchmatrix[1]
546 numq = Conf.numQualSuppPoints
547 totalQualSP = Conf.totalQualSuppPoints
548
549 currentPhi = zeros((Conf.numFeatures,1))
550 totalQualityPenalties = zeros((totalQualSP,1))
551
552 exon1Begin = []
553 exon1End = []
554 exon2Begin = []
555 exon2End = []
556 allWrongExons = []
557
558 for exampleIdx in range(numExamples):
559 dna = Sequences[exampleIdx]
560 est = Ests[exampleIdx]
561 currentSplitPos = SplitPos[exampleIdx]
562
563 if Conf.mode == 'normal':
564 quality = [40]*len(est)
565
566 if Conf.mode == 'using_quality_scores':
567 quality = Qualities[exampleIdx]
568
569 exons = Exons[exampleIdx]
570 # NoiseMatrix = Noises[exampleIdx]
571 don_supp = Donors[exampleIdx]
572 acc_supp = Acceptors[exampleIdx]
573
574 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
575 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
576
577 # Calculate the weights
578 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
579 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
580
581 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
582 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
583 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
584 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
585
586 if Conf.mode == 'using_quality_scores':
587 totalQualityPenalties = param[-totalQualSP:]
588 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
589
590 # Calculate w'phi(x,y) the total score of the alignment
591 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
592
593 # The allWeights vector is supposed to store the weight parameter
594 # of the true alignment as well as the weight parameters of the
595 # 1 other alignments
596 allWeights = zeros((Conf.numFeatures,1+1))
597 allWeights[:,0] = trueWeight[:,0]
598
599 AlignmentScores = [0.0]*(1+1)
600 AlignmentScores[0] = trueAlignmentScore
601
602 ################## Calculate wrong alignment(s) ######################
603
604 # Compute donor, acceptor with penalty_lookup_new
605 # returns two double lists
606 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
607
608 #myalign wants the acceptor site on the g of the ag
609 acceptor = acceptor[1:]
610 acceptor.append(-inf)
611
612 dna = str(dna)
613 est = str(est)
614 dna_len = len(dna)
615 est_len = len(est)
616
617 ps = h.convert2SWIG()
618
619 prb = QPalmaDP.createDoubleArrayFromList(quality)
620 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
621
622 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
623 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
624
625 d_len = len(donor)
626 donor = QPalmaDP.createDoubleArrayFromList(donor)
627 a_len = len(acceptor)
628 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
629
630 # Create the alignment object representing the interface to the C/C++ code.
631 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
632
633 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
634
635 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
636 currentAlignment.myalign(1, dna, dna_len,\
637 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
638 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
639 print_matrix)
640
641 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
642 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
643 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
644 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
645
646 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
647
648 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
649 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
650
651 newSpliceAlign = zeros((dna_len,1))
652 newEstAlign = zeros((est_len,1))
653 newWeightMatch = zeros((mm_len,1))
654 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
655
656 for i in range(dna_len):
657 newSpliceAlign[i] = c_SpliceAlign[i]
658
659 for i in range(est_len):
660 newEstAlign[i] = c_EstAlign[i]
661
662 for i in range(mm_len):
663 newWeightMatch[i] = c_WeightMatch[i]
664
665 newDPScores = c_DPScores[0]
666
667 for i in range(Conf.totalQualSuppPoints):
668 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
669
670 del c_SpliceAlign
671 del c_EstAlign
672 del c_WeightMatch
673 del c_DPScores
674 del c_qualityPlifsFeatures
675 del currentAlignment
676
677 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
678 newWeightMatch = newWeightMatch.reshape(1,mm_len)
679 true_map = [0]*2
680 true_map[0] = 1
681 pathNr = 0
682
683 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
684
685 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
686 for qidx in range(Conf.totalQualSuppPoints):
687 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
688
689 # Gewichte in restliche Zeilen der Matrix speichern
690 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
691 allWeights[:,pathNr+1] = wp
692
693 hpen = mat(h.penalties).reshape(len(h.penalties),1)
694 dpen = mat(d.penalties).reshape(len(d.penalties),1)
695 apen = mat(a.penalties).reshape(len(a.penalties),1)
696 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
697 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
698
699 # if the pathNr-best alignment is very close to the true alignment consider it as true
700 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
701 true_map[pathNr+1] = 1
702
703 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
704
705 if e1_b_off != None:
706 exon1Begin.append(e1_b_off)
707 exon1End.append(e1_e_off)
708 exon2Begin.append(e2_b_off)
709 exon2End.append(e2_e_off)
710 else:
711 allWrongExons.append((newExons,exons))
712
713 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
714 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
715
716 print 'Total num. of Examples: %d' % numExamples
717 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
718 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
719 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
720
721 print 'Prediction completed'
722 self.logfh.close()
723
724 def evaluatePositions(self,eBegin,eEnd):
725 eBegin_pos = [elem for elem in eBegin if elem == 0]
726 eBegin_neg = [elem for elem in eBegin if elem != 0]
727 eEnd_pos = [elem for elem in eEnd if elem == 0]
728 eEnd_neg = [elem for elem in eEnd if elem != 0]
729
730 mean_eBegin_neg = 0
731 for idx in range(len(eBegin_neg)):
732 mean_eBegin_neg += eBegin_neg[idx]
733
734 try:
735 mean_eBegin_neg /= 1.0*len(eBegin_neg)
736 except:
737 mean_eBegin_neg = -1
738
739 mean_eEnd_neg = 0
740 for idx in range(len(eEnd_neg)):
741 mean_eEnd_neg += eEnd_neg[idx]
742
743 try:
744 mean_eEnd_neg /= 1.0*len(eEnd_neg)
745 except:
746 mean_eEnd_neg = -1
747
748 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
749
750
def fetch_genome_info(ginfo_filename):
    """
    Load the Matlab 'genome_info' struct, creating and caching it on first use.

    If *ginfo_filename* does not exist, Matlab is invoked to build
    'genome_info.mat', which is then loaded via scipy.io, pickled to
    *ginfo_filename* for later runs, and returned.  Otherwise the pickled
    struct is loaded directly.
    """
    if not os.path.exists(ginfo_filename):
        # assemble the Matlab command sequence
        cmd = ['']*4
        cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
        cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
        # NOTE(review): gen_file is not defined anywhere at module level
        # (only a commented-out local in QPalma.__init__) — confirm before use.
        cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
        cmd[3] = 'save genome_info.mat genome_info'
        full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])

        obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
        out,err = obj.communicate()
        assert err == '', 'An error occured!\n%s'%err

        ginfo = scipy.io.loadmat('genome_info.mat')
        # FIX: cache the freshly loaded struct; the old code referenced
        # self.genome_info, which does not exist in a module-level function
        # and raised a NameError on the cache-miss path.
        cPickle.dump(ginfo['genome_info'],open(ginfo_filename,'w+'))
        return ginfo['genome_info']

    else:
        return cPickle.load(open(ginfo_filename))
770
def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
    """
    Compare a predicted splice alignment against the annotated two-exon
    structure.

    The state sequence in *SpliceAlign* (a 1xN matrix; state 0 marks exon
    positions) is scanned for exon open/close transitions.  If exactly two
    exons are found, the absolute offsets of their four boundaries from the
    annotated *exons* matrix are returned along with the boundary list;
    otherwise (None, None, None, None, boundaries) is returned.
    """
    states = SpliceAlign.flatten().tolist()[0]
    states.append(-1)  # sentinel so an exon running to the end is closed

    boundaries = []
    prev = -1
    for pos, state in enumerate(states):
        if state == 0 and prev != 0:
            boundaries.append(pos)   # exon opens here
        elif state != 0 and prev == 0:
            boundaries.append(pos)   # exon closes here
        prev = state

    # anything other than two clean exons (4 boundaries) is a failure case
    if len(boundaries) != 4:
        return None,None,None,None,boundaries

    # boundary i corresponds to annotated entry exons[i//2, i%2]
    offsets = [int(math.fabs(b - exons[i // 2, i % 2]))
               for i, b in enumerate(boundaries)]

    return offsets[0],offsets[1],offsets[2],offsets[3],boundaries
803
804
def calcStat(Acceptor,Donor,Exons):
    # NOTE(review): this looks like an interactive debugging/statistics helper —
    # it computes splice-site score distributions but returns nothing, contains
    # a deliberate pdb.set_trace() breakpoint, and references the module-level
    # names Acceptors, Donors and Sequences (plural) instead of its own
    # parameters, which are undefined at module scope.  Calling it as-is
    # raises NameError — confirm intent before relying on it.
    maxAcc = -100
    minAcc = 100
    maxDon = -100
    minDon = 100

    # scores at annotated sites (pos) vs. everywhere else (neg)
    acc_vec_pos = []
    acc_vec_neg = []
    don_vec_pos = []
    don_vec_neg = []

    # NOTE(review): 'Acceptors'/'Donors' here presumably should be the
    # 'Acceptor'/'Donor' parameters — TODO confirm.
    for jdx in range(len(Acceptors)):
        currentExons = Exons[jdx]
        currentAcceptor = Acceptors[jdx]
        # shift acceptor scores by one (site reported on the g of the ag)
        currentAcceptor = currentAcceptor[1:]
        currentAcceptor.append(-inf)
        currentDonor = Donors[jdx]

        for idx,elem in enumerate(currentAcceptor):
            if idx == (int(currentExons[1,0])-1): # acceptor site
                acc_vec_pos.append(elem)
            else:
                acc_vec_neg.append(elem)

        for idx,elem in enumerate(currentDonor):
            if idx == (int(currentExons[0,1])): # donor site
                don_vec_pos.append(elem)
            else:
                don_vec_neg.append(elem)

    # drop the -inf placeholders ("no site here") before inspection
    acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
    acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
    don_pos = [elem for elem in don_vec_pos if elem != -inf]
    don_neg = [elem for elem in don_vec_neg if elem != -inf]

    # deliberate breakpoint: inspect the collected score vectors interactively
    pdb.set_trace()

    # NOTE(review): 'Sequences' is undefined in this scope — TODO confirm.
    for idx in range(len(Sequences)):
        acc = [elem for elem in Acceptors[idx] if elem != -inf]
        maxAcc = max(max(acc),maxAcc)
        minAcc = min(min(acc),minAcc)

        don = [elem for elem in Donors[idx] if elem != -inf]
        maxDon = max(max(don),maxDon)
        minDon = min(min(don),minDon)
850
851
if __name__ == '__main__':
    # Construct the QPalma object up front: this compiles and loads the
    # dataset regardless of which mode is requested below.
    qpalma = QPalma()

    # one argument  -> training mode
    if len(sys.argv) == 2:
        mode = sys.argv[1]
        assert mode == 'train'
        qpalma.train()

    # two arguments -> prediction mode with a pickled parameter file
    elif len(sys.argv) == 3:
        mode = sys.argv[1]
        param_filename = sys.argv[2]
        assert mode == 'predict'
        assert os.path.exists(param_filename)
        qpalma.evaluate(param_filename)
    else:
        # anything else: print usage
        print 'You have to choose between training or prediction mode:'
        print 'python qpalma. py (train|predict) <param_file>'