+ added exons read boundary output to filterReads output
[qpalma.git] / scripts / qpalma_predict.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20
21 import qpalma
22 from qpalma.SIQP_CPX import SIQPSolver
23 from qpalma.DataProc import *
24
25 from qpalma.generateEvaluationData import *
26 from qpalma.computeSpliceWeights import *
27 from qpalma.set_param_palma import *
28 from qpalma.computeSpliceAlignWithQuality import *
29 from qpalma.penalty_lookup_new import *
30 from qpalma.compute_donacc import *
31 from qpalma.TrainingParam import Param
32 from qpalma.export_param import *
33
34 import qpalma.Configuration
35 from qpalma.Plif import Plf
36 from qpalma.Helpers import *
37
38 from qpalma.tools.splicesites import getDonAccScores
39
40 def getQualityFeatureCounts(qualityPlifs):
41 weightQuality = qualityPlifs[0].penalties
42 for currentPlif in qualityPlifs[1:]:
43 weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
44
45 return weightQuality
46
47 class QPalma:
48 """
49 A training method for the QPalma project
50 """
51
52 def __init__(self):
53 self.ARGS = Param()
54
55 self.logfh = open('qpalma.log','w+')
56 #gen_file= '%s/genome.config' % self.ARGS.basedir
57 #ginfo_filename = 'genome_info.pickle'
58 #self.genome_info = fetch_genome_info(ginfo_filename)
59 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
60
61 #self.ARGS.train_with_splicesitescoreinformation = False
62
63 def plog(self,string):
64 self.logfh.write(string)
65 self.logfh.flush()
66
67 def run(self):
68 # Load the whole dataset
69 if Conf.mode == 'normal':
70 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
71 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
72 use_quality_scores = False
73
74 elif Conf.mode == 'using_quality_scores':
75 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
76 paths_load_data_solexa('training',None,self.ARGS,True)
77
78 begin = 200
79 end = 400
80 Sequences = Sequences[begin:end]
81 Exons = Exons[begin:end]
82 Ests = Ests[begin:end]
83 Qualities = Qualities[begin:end]
84 Acceptors = Acceptors[begin:end]
85 Donors = Donors[begin:end]
86
87 pdb.set_trace()
88
89 Donors, Acceptors = getDonAccScores(Sequences)
90
91
92 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
93 #SplitPos = [1]*len(Qualities)
94
95 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
96 #pdb.set_trace()
97 #Qualities = []
98 #for i in range(len(Ests)):
99 # Qualities.append([40]*len(Ests[i]))
100 use_quality_scores = True
101 else:
102 assert(False)
103
104 # number of training instances
105 N = len(Sequences)
106 self.numExamples = N
107 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
108 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
109 self.plog('Number of training examples: %d\n'% N)
110 print 'Number of features: %d\n'% Conf.numFeatures
111
112 remove_duplicate_scores = Conf.remove_duplicate_scores
113 print_matrix = Conf.print_matrix
114 anzpath = Conf.anzpath
115
116 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
117 #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
118 param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_200.pickle'
119 param = load_param(param_filename)
120
121 # Set the parameters such as limits penalties for the Plifs
122 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
123
124 # stores the number of alignments done for each example (best path, second-best path etc.)
125 num_path = [anzpath]*N
126
127 #############################################################################################
128 # Training
129 #############################################################################################
130 self.plog('Starting training...\n')
131
132 donSP = Conf.numDonSuppPoints
133 accSP = Conf.numAccSuppPoints
134 lengthSP = Conf.numLengthSuppPoints
135 mmatrixSP = Conf.sizeMatchmatrix[0]\
136 *Conf.sizeMatchmatrix[1]
137 numq = Conf.numQualSuppPoints
138 totalQualSP = Conf.totalQualSuppPoints
139
140 currentPhi = zeros((Conf.numFeatures,1))
141 totalQualityPenalties = zeros((totalQualSP,1))
142
143 total_up_off = []
144 total_down_off = []
145
146 #for exampleIdx in range(self.numExamples):
147 for exampleIdx in range(200):
148
149 dna = Sequences[exampleIdx]
150 est = Ests[exampleIdx]
151 currentSplitPos = SplitPos[exampleIdx]
152
153 if Conf.mode == 'normal':
154 quality = [40]*len(est)
155
156 if Conf.mode == 'using_quality_scores':
157 quality = Qualities[exampleIdx]
158
159 exons = Exons[exampleIdx]
160 # NoiseMatrix = Noises[exampleIdx]
161 don_supp = Donors[exampleIdx]
162 acc_supp = Acceptors[exampleIdx]
163
164 if exons[-1,1] > len(dna):
165 continue
166
167 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
168 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
169
170 # Calculate the weights
171 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
172 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
173
174 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
175 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
176 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
177 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
178
179 if Conf.mode == 'using_quality_scores':
180 totalQualityPenalties = param[-totalQualSP:]
181 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
182
183 # Calculate w'phi(x,y) the total score of the alignment
184 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
185
186 # The allWeights vector is supposed to store the weight parameter
187 # of the true alignment as well as the weight parameters of the
188 # 1 other alignments
189 allWeights = zeros((Conf.numFeatures,1+1))
190 allWeights[:,0] = trueWeight[:,0]
191
192 AlignmentScores = [0.0]*(1+1)
193 AlignmentScores[0] = trueAlignmentScore
194
195 ################## Calculate wrong alignment(s) ######################
196
197 # Compute donor, acceptor with penalty_lookup_new
198 # returns two double lists
199 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
200
201 #myalign wants the acceptor site on the g of the ag
202 acceptor = acceptor[1:]
203 acceptor.append(-inf)
204
205 # for now we don't use donor/acceptor scores
206 #donor = [-inf] * len(donor)
207 #acceptor = [-inf] * len(donor)
208
209 dna = str(dna)
210 est = str(est)
211 dna_len = len(dna)
212 est_len = len(est)
213
214 ps = h.convert2SWIG()
215
216 prb = QPalmaDP.createDoubleArrayFromList(quality)
217 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
218
219 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
220 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
221
222 d_len = len(donor)
223 donor = QPalmaDP.createDoubleArrayFromList(donor)
224 a_len = len(acceptor)
225 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
226
227 # Create the alignment object representing the interface to the C/C++ code.
228 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
229
230 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
231
232 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
233 currentAlignment.myalign(1, dna, dna_len,\
234 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
235 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
236 print_matrix)
237
238 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
239 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
240 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
241 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
242
243 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
244
245 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
246 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
247
248 newSpliceAlign = zeros((dna_len,1))
249 newEstAlign = zeros((est_len,1))
250 newWeightMatch = zeros((mm_len,1))
251 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
252
253 #print 'newSpliceAlign'
254 for i in range(dna_len):
255 newSpliceAlign[i] = c_SpliceAlign[i]
256 # print '%f' % (spliceAlign[i])
257
258 #print 'newEstAlign'
259 for i in range(est_len):
260 newEstAlign[i] = c_EstAlign[i]
261 # print '%f' % (spliceAlign[i])
262
263 #print 'weightMatch'
264 for i in range(mm_len):
265 newWeightMatch[i] = c_WeightMatch[i]
266 # print '%f' % (weightMatch[i])
267
268 newDPScores = c_DPScores[0]
269
270 for i in range(Conf.totalQualSuppPoints):
271 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
272
273 # equals palma up to here
274
275 #print "Calling destructors"
276 del c_SpliceAlign
277 del c_EstAlign
278 del c_WeightMatch
279 del c_DPScores
280 del c_qualityPlifsFeatures
281 del currentAlignment
282
283 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
284 newWeightMatch = newWeightMatch.reshape(1,mm_len)
285 # Calculate weights of the respective alignments Note that we are
286 # calculating n-best alignments without hamming loss, so we
287 # have to keep track which of the n-best alignments correspond to
288 # the true one in order not to incorporate a true alignment in the
289 # constraints. To keep track of the true and false alignments we
290 # define an array true_map with a boolean indicating the
291 # equivalence to the true alignment for each decoded alignment.
292 true_map = [0]*2
293 true_map[0] = 1
294 pathNr = 0
295
296 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
297
298 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
299 for qidx in range(Conf.totalQualSuppPoints):
300 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
301
302 # Gewichte in restliche Zeilen der Matrix speichern
303 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
304 allWeights[:,pathNr+1] = wp
305
306 hpen = mat(h.penalties).reshape(len(h.penalties),1)
307 dpen = mat(d.penalties).reshape(len(d.penalties),1)
308 apen = mat(a.penalties).reshape(len(a.penalties),1)
309 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
310
311 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
312
313 # Check wether scalar product + loss equals viterbi score
314 print '%f vs. %f' % (newDPScores, AlignmentScores[pathNr+1])
315
316 # if the pathNr-best alignment is very close to the true alignment consider it as true
317 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
318 true_map[pathNr+1] = 1
319
320 #pdb.set_trace()
321
322 #up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
323 evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
324 #print up_off,down_off
325
326 #if up_off > -1:
327 # total_up_off.append(up_off)
328 # total_down_off.append(down_off)
329
330 #total_up = 0
331 #total_down = 0
332 #for idx in range(len(total_up_off)):
333 # total_up += total_up_off[idx]
334 # total_down += total_down_off[idx]
335 #
336 #total_up /= len(total_up_off)
337 #total_down /= len(total_down_off)
338
339 #print 'Mean up_off is %f' % total_up
340 #print 'Mean down_off is %f' % total_down
341 ##print total_up_off
342 ##print total_down_off
343 #print 'len is %d' % len(total_up_off)
344
345 print 'Prediction completed'
346 self.logfh.close()
347
348 def fetch_genome_info(ginfo_filename):
349 if not os.path.exists(ginfo_filename):
350 cmd = ['']*4
351 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
352 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
353 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
354 cmd[3] = 'save genome_info.mat genome_info'
355 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
356
357 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
358 out,err = obj.communicate()
359 assert err == '', 'An error occured!\n%s'%err
360
361 ginfo = scipy.io.loadmat('genome_info.mat')
362 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
363 return ginfo['genome_info']
364
365 else:
366 return cPickle.load(open(ginfo_filename))
367
368 def plifs2param(h,d,a,mmatrix,qualityPlifs):
369 donSP = Conf.numDonSuppPoints
370 accSP = Conf.numAccSuppPoints
371 lengthSP = Conf.numLengthSuppPoints
372 mmatrixSP = Conf.sizeMatchmatrix[0]\
373 *Conf.sizeMatchmatrix[1]
374
375
376 param = zeros((Conf.numFeatures,1))
377 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
378 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
379 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
380 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
381
382 for idx in range(len(qualityPlifs)):
383 currentPlif = qualityPlifs[idx]
384 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
385 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
386 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
387
388 return param
389
390 def load_param(filename):
391 param = None
392 #try:
393 # para = cPickle.load(open(filename))
394 #except:
395 # print 'Error: Could not open parameter file!'
396 # sys.exit(1)
397 #
398 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
399
400 param = cPickle.load(open(filename))
401 return param
402
403 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
404 newExons = []
405 oldElem = -1
406 SpliceAlign = SpliceAlign.flatten().tolist()[0]
407 SpliceAlign.append(-1)
408 for pos,elem in enumerate(SpliceAlign):
409 if pos == 0:
410 oldElem = -1
411 else:
412 oldElem = SpliceAlign[pos-1]
413
414 if oldElem != 0 and elem == 0: # start of exon
415 newExons.append(pos-1)
416
417 if oldElem == 0 and elem != 0: # end of exon
418 newExons.append(pos)
419
420 up_off = -1
421 down_off = -1
422
423 if len(newExons) != 4:
424 acc = 0.0
425 else:
426 e1_begin,e1_end = newExons[0],newExons[1]
427 e2_begin,e2_end = newExons[2],newExons[3]
428
429 up_off = int(math.fabs(e1_end - exons[0,1]))
430 down_off = int(math.fabs(e2_begin - exons[1,0]))
431
432 pdb.set_trace()
433
434 return up_off,down_off
435
436 if __name__ == '__main__':
437 qpalma = QPalma()
438 qpalma.run()