+ minor changes
[qpalma.git] / scripts / qpalma_main.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # The QPalma project aims at extending the Palma project
7 # to be able to use Solexa reads toegether with their
8 # quality scores.
9 #
10 # This file represents the conversion of the main matlab
11 # training loop for Palma to python.
12 #
13 # Author: Fabio De Bona
14 #
15 ###########################################################
16
17 import sys
18 import subprocess
19 import scipy.io
20 import pdb
21 import os.path
22
23 from numpy.matlib import mat,zeros,ones,inf
24 from numpy.linalg import norm
25
26 import QPalmaDP
27 import qpalma
28 from qpalma.SIQP_CPX import SIQPSolver
29 from qpalma.DataProc import *
30 from qpalma.generateEvaluationData import *
31 from qpalma.computeSpliceWeights import *
32 from qpalma.set_param_palma import *
33 from qpalma.computeSpliceAlignWithQuality import *
34 from qpalma.penalty_lookup_new import *
35 from qpalma.compute_donacc import *
36 from qpalma.export_param import *
37 from qpalma.TrainingParam import Param
38 from qpalma.Plif import Plf
39
40 from qpalma.tools.splicesites import getDonAccScores
41 from qpalma.Configuration import *
42
43 class para:
44 pass
45
46 class QPalma:
47 """
48 A training method for the QPalma project
49 """
50
51 def __init__(self):
52 self.ARGS = Param()
53
54 from compile_dataset import compile_d
55
56 compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn)
57 #gen_file= '%s/genome.config' % self.ARGS.basedir
58 #ginfo_filename = 'genome_info.pickle'
59 #self.genome_info = fetch_genome_info(ginfo_filename)
60 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
61 #self.ARGS.train_with_splicesitescoreinformation = False
62
63 # Load the whole dataset
64 if Conf.mode == 'normal':
65 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
66 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
67 self.use_quality_scores = False
68
69 elif Conf.mode == 'using_quality_scores':
70
71 #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
72 data_filename = Conf.dataset_fn
73
74 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
75
76 #end = 1000
77 self.Sequences = Sequences
78 self.Exons = Exons
79 self.Ests = Ests
80 self.Qualities = Qualities
81 self.SplitPos = SplitPos
82 self.Donors = Donors
83 self.Acceptors = Acceptors
84
85 #self.Acceptors = self.Acceptors[:end]
86 #self.Donors = self.Donors[:end]
87
88 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
89 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
90 #Qualities = []
91 #for i in range(len(Ests)):
92 # Qualities.append([40]*len(Ests[i]))
93 self.use_quality_scores = True
94 else:
95 assert(False)
96
97 def plog(self,string):
98 self.logfh.write(string)
99 self.logfh.flush()
100
101 def train(self):
102 self.logfh = open('_qpalma_train.log','w+')
103
104 beg = Conf.training_begin
105 end = Conf.training_end
106
107 Sequences = self.Sequences[beg:end]
108 Exons = self.Exons[beg:end]
109 Ests = self.Ests[beg:end]
110 Qualities = self.Qualities[beg:end]
111 Acceptors = self.Acceptors[beg:end]
112 Donors = self.Donors[beg:end]
113
114 # number of training instances
115 N = numExamples = len(Sequences)
116 assert len(Exons) == N and len(Ests) == N\
117 and len(Qualities) == N and len(Acceptors) == N\
118 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
119 self.plog('Number of training examples: %d\n'% numExamples)
120
121 self.noImprovementCtr = 0
122 self.oldObjValue = 1e8
123
124 iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
125 remove_duplicate_scores = Conf.remove_duplicate_scores
126 print_matrix = Conf.print_matrix
127 anzpath = Conf.anzpath
128
129 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
130 param = Conf.fixedParam
131
132 # Set the parameters such as limits penalties for the Plifs
133 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
134
135 # Initialize solver
136 if Conf.USE_OPT:
137 self.plog('Initializing problem...\n')
138 solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
139
140 # stores the number of alignments done for each example (best path, second-best path etc.)
141 num_path = [anzpath]*numExamples
142 # stores the gap for each example
143 gap = [0.0]*numExamples
144
145 #############################################################################################
146 # Training
147 #############################################################################################
148 self.plog('Starting training...\n')
149
150 donSP = Conf.numDonSuppPoints
151 accSP = Conf.numAccSuppPoints
152 lengthSP = Conf.numLengthSuppPoints
153 mmatrixSP = Conf.sizeMatchmatrix[0]\
154 *Conf.sizeMatchmatrix[1]
155 numq = Conf.numQualSuppPoints
156 totalQualSP = Conf.totalQualSuppPoints
157
158 currentPhi = zeros((Conf.numFeatures,1))
159 totalQualityPenalties = zeros((totalQualSP,1))
160
161 iteration_nr = 0
162 param_idx = 0
163 const_added_ctr = 0
164 while True:
165 if iteration_nr == iteration_steps:
166 break
167
168 for exampleIdx in range(numExamples):
169 if (exampleIdx%100) == 0:
170 print 'Current example nr %d' % exampleIdx
171
172 dna = Sequences[exampleIdx]
173 est = Ests[exampleIdx]
174
175 if Conf.mode == 'normal':
176 quality = [40]*len(est)
177
178 if Conf.mode == 'using_quality_scores':
179 quality = Qualities[exampleIdx]
180
181 exons = Exons[exampleIdx]
182 # NoiseMatrix = Noises[exampleIdx]
183 don_supp = Donors[exampleIdx]
184 acc_supp = Acceptors[exampleIdx]
185
186 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
187 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
188
189 #print 'trueWeights'
190 # Calculate the weights
191 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
192 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
193
194 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
195 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
196 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
197 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
198
199 #pdb.set_trace()
200 if Conf.mode == 'using_quality_scores':
201 totalQualityPenalties = param[-totalQualSP:]
202 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
203
204 # Calculate w'phi(x,y) the total score of the alignment
205 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
206
207 # The allWeights vector is supposed to store the weight parameter
208 # of the true alignment as well as the weight parameters of the
209 # num_path[exampleIdx] other alignments
210 allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
211 allWeights[:,0] = trueWeight[:,0]
212
213 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
214 AlignmentScores[0] = trueAlignmentScore
215
216 ################## Calculate wrong alignment(s) ######################
217
218 # Compute donor, acceptor with penalty_lookup_new
219 # returns two double lists
220 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
221
222 #myalign wants the acceptor site on the g of the ag
223 acceptor = acceptor[1:]
224 acceptor.append(-inf)
225
226 dna = str(dna)
227 est = str(est)
228 dna_len = len(dna)
229 est_len = len(est)
230
231 # check that splice site scores are at dna positions as expected by
232 # the dynamic programming component
233 for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
234 assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
235 or dna[d_pos+1] == 't'), pdb.set_trace()
236
237 for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
238 assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
239
240 ps = h.convert2SWIG()
241
242 prb = QPalmaDP.createDoubleArrayFromList(quality)
243 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
244
245 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
246 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
247
248 d_len = len(donor)
249 donor = QPalmaDP.createDoubleArrayFromList(donor)
250 a_len = len(acceptor)
251 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
252
253 # Create the alignment object representing the interface to the C/C++ code.
254 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
255
256 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
257 #print 'Calling myalign...'
258 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
259 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
260 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
261 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
262 print_matrix)
263
264 #print 'After calling myalign...'
265 #print 'Calling getAlignmentResults...'
266
267 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
268 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
269 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
270 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
271
272 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
273
274 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
275 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
276
277 #print 'After calling getAlignmentResults...'
278
279 newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
280 newEstAlign = zeros((est_len*num_path[exampleIdx],1))
281 newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
282 newDPScores = zeros((num_path[exampleIdx],1))
283 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
284
285 #print 'newSpliceAlign'
286 for i in range(dna_len*num_path[exampleIdx]):
287 newSpliceAlign[i] = c_SpliceAlign[i]
288 # print '%f' % (spliceAlign[i])
289
290 #print 'newEstAlign'
291 for i in range(est_len*num_path[exampleIdx]):
292 newEstAlign[i] = c_EstAlign[i]
293 # print '%f' % (spliceAlign[i])
294
295 #print 'weightMatch'
296 for i in range(mm_len*num_path[exampleIdx]):
297 newWeightMatch[i] = c_WeightMatch[i]
298 # print '%f' % (weightMatch[i])
299
300 #print 'ViterbiScores'
301 for i in range(num_path[exampleIdx]):
302 newDPScores[i] = c_DPScores[i]
303
304
305 if self.use_quality_scores:
306 for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
307 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
308
309 # equals palma up to here
310
311 #print "Calling destructors"
312 del c_SpliceAlign
313 del c_EstAlign
314 del c_WeightMatch
315 del c_DPScores
316 del c_qualityPlifsFeatures
317 del currentAlignment
318
319 newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
320 newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
321 # Calculate weights of the respective alignments Note that we are
322 # calculating n-best alignments without hamming loss, so we
323 # have to keep track which of the n-best alignments correspond to
324 # the true one in order not to incorporate a true alignment in the
325 # constraints. To keep track of the true and false alignments we
326 # define an array true_map with a boolean indicating the
327 # equivalence to the true alignment for each decoded alignment.
328 true_map = [0]*(num_path[exampleIdx]+1)
329 true_map[0] = 1
330 path_loss = [0]*(num_path[exampleIdx])
331
332 for pathNr in range(num_path[exampleIdx]):
333 #print 'decodedWeights'
334 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
335 h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
336 acc_supp)
337
338 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
339 for qidx in range(Conf.totalQualSuppPoints):
340 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
341
342 #pdb.set_trace()
343
344 path_loss[pathNr] = 0
345 # sum up positionwise loss between alignments
346 for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
347 if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
348 path_loss[pathNr] += 1
349
350 #pdb.set_trace()
351
352 # Gewichte in restliche Zeilen der Matrix speichern
353 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
354 allWeights[:,pathNr+1] = wp
355
356 hpen = mat(h.penalties).reshape(len(h.penalties),1)
357 dpen = mat(d.penalties).reshape(len(d.penalties),1)
358 apen = mat(a.penalties).reshape(len(a.penalties),1)
359 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
360
361 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
362
363 # Check wether scalar product + loss equals viterbi score
364 #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
365
366 distinct_scores = False
367 if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
368 distinct_scores = True
369
370 #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
371 if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
372 pdb.set_trace()
373
374 # # if the pathNr-best alignment is very close to the true alignment consider it as true
375 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
376 true_map[pathNr+1] = 1
377
378 # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
379
380 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
381 # this means that all n-best paths are to close to each other
382 # we have to extend the n-best search to a (n+1)-best
383 if len([elem for elem in true_map if elem == 1]) == len(true_map):
384 num_path[exampleIdx] = num_path[exampleIdx]+1
385
386 # Choose true and first false alignment for extending A
387 firstFalseIdx = -1
388 for map_idx,elem in enumerate(true_map):
389 if elem == 0:
390 firstFalseIdx = map_idx
391 break
392
393 # if there is at least one useful false alignment add the
394 # corresponding constraints to the optimization problem
395 if firstFalseIdx != -1:
396 trueWeights = allWeights[:,0]
397 firstFalseWeights = allWeights[:,firstFalseIdx]
398 differenceVector = trueWeights - firstFalseWeights
399 #pdb.set_trace()
400
401 if Conf.USE_OPT:
402 const_added = solver.addConstraint(differenceVector, exampleIdx)
403 const_added_ctr += 1
404 #
405 # end of one example processing
406 #
407
408 # call solver every nth example //added constraint
409 if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
410 objValue,w,self.slacks = solver.solve()
411
412 if math.fabs(objValue - self.oldObjValue) <= 1e-6:
413 self.noImprovementCtr += 1
414
415 if self.noImprovementCtr == numExamples+1:
416 break
417
418 self.oldObjValue = objValue
419 print "objValue is %f" % objValue
420
421 sum_xis = 0
422 for elem in self.slacks:
423 sum_xis += elem
424
425 for i in range(len(param)):
426 param[i] = w[i]
427
428 #pdb.set_trace()
429 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
430 param_idx += 1
431 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
432
433 #
434 # end of one iteration through all examples
435 #
436 if self.noImprovementCtr == numExamples+1:
437 break
438
439 iteration_nr += 1
440
441 #
442 # end of optimization
443 #
444 print 'Training completed'
445
446 pa = para()
447 pa.h = h
448 pa.d = d
449 pa.a = a
450 pa.mmatrix = mmatrix
451 pa.qualityPlifs = qualityPlifs
452
453 cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
454 #cPickle.dump(pa,open('elegans.param','w+'))
455 self.logfh.close()
456
457 def evaluate(self,param_filename):
458 beg = Conf.prediction_begin
459 end = Conf.prediction_end
460
461 # predict on training set
462 print '##### Prediction on the training set #####'
463 self.predict(param_filename,0,beg)
464
465 # predict on test set
466 print '##### Prediction on the test set #####'
467 self.predict(param_filename,beg,end)
468
469 pdb.set_trace()
470
471 def predict(self,param_filename,beg,end):
472 self.logfh = open('_qpalma_predict.log','w+')
473
474 beg = Conf.prediction_begin
475 end = Conf.prediction_end
476
477 Sequences = self.Sequences[beg:end]
478 Exons = self.Exons[beg:end]
479 Ests = self.Ests[beg:end]
480 Qualities = self.Qualities[beg:end]
481 Acceptors = self.Acceptors[beg:end]
482 Donors = self.Donors[beg:end]
483 SplitPos = self.SplitPos[beg:end]
484
485 # number of training instances
486 N = numExamples = len(Sequences)
487 assert len(Exons) == N and len(Ests) == N\
488 and len(Qualities) == N and len(Acceptors) == N\
489 and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
490 self.plog('Number of training examples: %d\n'% numExamples)
491
492 self.noImprovementCtr = 0
493 self.oldObjValue = 1e8
494
495 remove_duplicate_scores = Conf.remove_duplicate_scores
496 print_matrix = Conf.print_matrix
497 anzpath = Conf.anzpath
498
499 #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
500 param = load_param(param_filename)
501
502 # Set the parameters such as limits penalties for the Plifs
503 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
504
505 #############################################################################################
506 # Prediction
507 #############################################################################################
508 self.plog('Starting prediction...\n')
509
510 donSP = Conf.numDonSuppPoints
511 accSP = Conf.numAccSuppPoints
512 lengthSP = Conf.numLengthSuppPoints
513 mmatrixSP = Conf.sizeMatchmatrix[0]\
514 *Conf.sizeMatchmatrix[1]
515 numq = Conf.numQualSuppPoints
516 totalQualSP = Conf.totalQualSuppPoints
517
518 currentPhi = zeros((Conf.numFeatures,1))
519 totalQualityPenalties = zeros((totalQualSP,1))
520
521 exon1Begin = []
522 exon1End = []
523 exon2Begin = []
524 exon2End = []
525 allWrongExons = []
526
527 for exampleIdx in range(numExamples):
528 dna = Sequences[exampleIdx]
529 est = Ests[exampleIdx]
530 currentSplitPos = SplitPos[exampleIdx]
531
532 if Conf.mode == 'normal':
533 quality = [40]*len(est)
534
535 if Conf.mode == 'using_quality_scores':
536 quality = Qualities[exampleIdx]
537
538 exons = Exons[exampleIdx]
539 # NoiseMatrix = Noises[exampleIdx]
540 don_supp = Donors[exampleIdx]
541 acc_supp = Acceptors[exampleIdx]
542
543 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
544 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
545
546 # Calculate the weights
547 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
548 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
549
550 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
551 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
552 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
553 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
554
555 if Conf.mode == 'using_quality_scores':
556 totalQualityPenalties = param[-totalQualSP:]
557 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
558
559 # Calculate w'phi(x,y) the total score of the alignment
560 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
561
562 # The allWeights vector is supposed to store the weight parameter
563 # of the true alignment as well as the weight parameters of the
564 # 1 other alignments
565 allWeights = zeros((Conf.numFeatures,1+1))
566 allWeights[:,0] = trueWeight[:,0]
567
568 AlignmentScores = [0.0]*(1+1)
569 AlignmentScores[0] = trueAlignmentScore
570
571 ################## Calculate wrong alignment(s) ######################
572
573 # Compute donor, acceptor with penalty_lookup_new
574 # returns two double lists
575 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
576
577 #myalign wants the acceptor site on the g of the ag
578 acceptor = acceptor[1:]
579 acceptor.append(-inf)
580
581 dna = str(dna)
582 est = str(est)
583 dna_len = len(dna)
584 est_len = len(est)
585
586 ps = h.convert2SWIG()
587
588 prb = QPalmaDP.createDoubleArrayFromList(quality)
589 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
590
591 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
592 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
593
594 d_len = len(donor)
595 donor = QPalmaDP.createDoubleArrayFromList(donor)
596 a_len = len(acceptor)
597 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
598
599 # Create the alignment object representing the interface to the C/C++ code.
600 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
601
602 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
603
604 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
605 currentAlignment.myalign(1, dna, dna_len,\
606 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
607 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
608 print_matrix)
609
610 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
611 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
612 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
613 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
614
615 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
616
617 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
618 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
619
620 newSpliceAlign = zeros((dna_len,1))
621 newEstAlign = zeros((est_len,1))
622 newWeightMatch = zeros((mm_len,1))
623 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
624
625 for i in range(dna_len):
626 newSpliceAlign[i] = c_SpliceAlign[i]
627
628 for i in range(est_len):
629 newEstAlign[i] = c_EstAlign[i]
630
631 for i in range(mm_len):
632 newWeightMatch[i] = c_WeightMatch[i]
633
634 newDPScores = c_DPScores[0]
635
636 for i in range(Conf.totalQualSuppPoints):
637 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
638
639 del c_SpliceAlign
640 del c_EstAlign
641 del c_WeightMatch
642 del c_DPScores
643 del c_qualityPlifsFeatures
644 del currentAlignment
645
646 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
647 newWeightMatch = newWeightMatch.reshape(1,mm_len)
648 true_map = [0]*2
649 true_map[0] = 1
650 pathNr = 0
651
652 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
653
654 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
655 for qidx in range(Conf.totalQualSuppPoints):
656 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
657
658 # Gewichte in restliche Zeilen der Matrix speichern
659 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
660 allWeights[:,pathNr+1] = wp
661
662 hpen = mat(h.penalties).reshape(len(h.penalties),1)
663 dpen = mat(d.penalties).reshape(len(d.penalties),1)
664 apen = mat(a.penalties).reshape(len(a.penalties),1)
665 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
666 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
667
668 # if the pathNr-best alignment is very close to the true alignment consider it as true
669 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
670 true_map[pathNr+1] = 1
671
672 e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
673
674 if e1_b_off != None:
675 exon1Begin.append(e1_b_off)
676 exon1End.append(e1_e_off)
677 exon2Begin.append(e2_b_off)
678 exon2End.append(e2_e_off)
679 else:
680 allWrongExons.append((newExons,exons))
681
682 e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
683 e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
684
685 print 'Total num. of Examples: %d' % numExamples
686 print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
687 print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
688 print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
689
690 print 'Prediction completed'
691 self.logfh.close()
692
693 def evaluatePositions(self,eBegin,eEnd):
694 eBegin_pos = [elem for elem in eBegin if elem == 0]
695 eBegin_neg = [elem for elem in eBegin if elem != 0]
696 eEnd_pos = [elem for elem in eEnd if elem == 0]
697 eEnd_neg = [elem for elem in eEnd if elem != 0]
698
699 mean_eBegin_neg = 0
700 for idx in range(len(eBegin_neg)):
701 mean_eBegin_neg += eBegin_neg[idx]
702
703 mean_eBegin_neg /= 1.0*len(eBegin_neg)
704
705 mean_eEnd_neg = 0
706 for idx in range(len(eEnd_neg)):
707 mean_eEnd_neg += eEnd_neg[idx]
708
709 mean_eEnd_neg /= 1.0*len(eEnd_neg)
710
711 return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
712
713
714 def fetch_genome_info(ginfo_filename):
715 if not os.path.exists(ginfo_filename):
716 cmd = ['']*4
717 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
718 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
719 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
720 cmd[3] = 'save genome_info.mat genome_info'
721 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
722
723 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
724 out,err = obj.communicate()
725 assert err == '', 'An error occured!\n%s'%err
726
727 ginfo = scipy.io.loadmat('genome_info.mat')
728 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
729 return ginfo['genome_info']
730
731 else:
732 return cPickle.load(open(ginfo_filename))
733
734
735 def plifs2param(h,d,a,mmatrix,qualityPlifs):
736 donSP = Conf.numDonSuppPoints
737 accSP = Conf.numAccSuppPoints
738 lengthSP = Conf.numLengthSuppPoints
739 mmatrixSP = Conf.sizeMatchmatrix[0]\
740 *Conf.sizeMatchmatrix[1]
741
742
743 param = zeros((Conf.numFeatures,1))
744 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
745 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
746 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
747 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
748
749 for idx in range(len(qualityPlifs)):
750 currentPlif = qualityPlifs[idx]
751 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
752 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
753 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
754
755 return param
756
757
758 def load_param(filename):
759 param = None
760 #try:
761 # para = cPickle.load(open(filename))
762 #except:
763 # print 'Error: Could not open parameter file!'
764 # sys.exit(1)
765 #
766 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
767
768 param = cPickle.load(open(filename))
769 return param
770
771
772 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
773 newExons = []
774 oldElem = -1
775 SpliceAlign = SpliceAlign.flatten().tolist()[0]
776 SpliceAlign.append(-1)
777 for pos,elem in enumerate(SpliceAlign):
778 if pos == 0:
779 oldElem = -1
780 else:
781 oldElem = SpliceAlign[pos-1]
782
783 if oldElem != 0 and elem == 0: # start of exon
784 newExons.append(pos)
785
786 if oldElem == 0 and elem != 0: # end of exon
787 newExons.append(pos)
788
789 e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
790
791 if len(newExons) == 4:
792 e1_begin,e1_end = newExons[0],newExons[1]
793 e2_begin,e2_end = newExons[2],newExons[3]
794
795 #elif len(newExons) > 4 :
796 # e1_begin,e1_end = newExons[0],newExons[1]
797 # e2_begin,e2_end = newExons[-2],newExons[-1]
798 else:
799 return None,None,None,None,newExons
800
801 e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
802 e1_e_off = int(math.fabs(e1_end - exons[0,1]))
803
804 e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
805 e2_e_off = int(math.fabs(e2_end - exons[1,1]))
806
807 return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
808
809
810 def calcStat(Acceptor,Donor,Exons):
811 maxAcc = -100
812 minAcc = 100
813 maxDon = -100
814 minDon = 100
815
816 acc_vec_pos = []
817 acc_vec_neg = []
818 don_vec_pos = []
819 don_vec_neg = []
820
821 for jdx in range(len(Acceptors)):
822 currentExons = Exons[jdx]
823 currentAcceptor = Acceptors[jdx]
824 currentAcceptor = currentAcceptor[1:]
825 currentAcceptor.append(-inf)
826 currentDonor = Donors[jdx]
827
828 for idx,elem in enumerate(currentAcceptor):
829 if idx == (int(currentExons[1,0])-1): # acceptor site
830 acc_vec_pos.append(elem)
831 else:
832 acc_vec_neg.append(elem)
833
834 for idx,elem in enumerate(currentDonor):
835 if idx == (int(currentExons[0,1])): # donor site
836 don_vec_pos.append(elem)
837 else:
838 don_vec_neg.append(elem)
839
840 acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
841 acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
842 don_pos = [elem for elem in don_vec_pos if elem != -inf]
843 don_neg = [elem for elem in don_vec_neg if elem != -inf]
844
845 pdb.set_trace()
846
847 for idx in range(len(Sequences)):
848 acc = [elem for elem in Acceptors[idx] if elem != -inf]
849 maxAcc = max(max(acc),maxAcc)
850 minAcc = min(min(acc),minAcc)
851
852 don = [elem for elem in Donors[idx] if elem != -inf]
853 maxDon = max(max(don),maxDon)
854 minDon = min(min(don),minDon)
855
856
857 if __name__ == '__main__':
858 qpalma = QPalma()
859
860 if len(sys.argv) == 2:
861 mode = sys.argv[1]
862 assert mode == 'train'
863 qpalma.train()
864
865 elif len(sys.argv) == 3:
866 mode = sys.argv[1]
867 param_filename = sys.argv[2]
868 assert mode == 'predict'
869 assert os.path.exists(param_filename)
870 qpalma.evaluate(param_filename)
871 else:
872 print 'You have to choose between training or prediction mode:'
873 print 'python qpalma. py (train|predict) <param_file>'