+ changed output in the filterReads
[qpalma.git] / scripts / qpalma_predict.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20
21 import qpalma
22 from qpalma.SIQP_CPX import SIQPSolver
23 from qpalma.DataProc import *
24
25 from qpalma.generateEvaluationData import *
26 from qpalma.computeSpliceWeights import *
27 from qpalma.set_param_palma import *
28 from qpalma.computeSpliceAlignWithQuality import *
29 from qpalma.penalty_lookup_new import *
30 from qpalma.compute_donacc import *
31 from qpalma.TrainingParam import Param
32 from qpalma.export_param import *
33
34 import qpalma.Configuration
35 from qpalma.Plif import Plf
36 from qpalma.Helpers import *
37
38 def getQualityFeatureCounts(qualityPlifs):
39 weightQuality = qualityPlifs[0].penalties
40 for currentPlif in qualityPlifs[1:]:
41 weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
42
43 return weightQuality
44
45 class QPalma:
46 """
47 A training method for the QPalma project
48 """
49
50 def __init__(self):
51 self.ARGS = Param()
52
53 self.logfh = open('qpalma.log','w+')
54 #gen_file= '%s/genome.config' % self.ARGS.basedir
55 #ginfo_filename = 'genome_info.pickle'
56 #self.genome_info = fetch_genome_info(ginfo_filename)
57 #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
58
59 #self.ARGS.train_with_splicesitescoreinformation = False
60
61 def plog(self,string):
62 self.logfh.write(string)
63 self.logfh.flush()
64
65 def run(self):
66 # Load the whole dataset
67 if Conf.mode == 'normal':
68 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
69 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
70 use_quality_scores = False
71
72 elif Conf.mode == 'using_quality_scores':
73 Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
74
75 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
76 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
77 pdb.set_trace()
78 #Qualities = []
79 #for i in range(len(Ests)):
80 # Qualities.append([40]*len(Ests[i]))
81 use_quality_scores = True
82 else:
83 assert(False)
84
85 # number of training instances
86 N = len(Sequences)
87 self.numExamples = N
88 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
89 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
90 self.plog('Number of training examples: %d\n'% N)
91 print 'Number of features: %d\n'% Conf.numFeatures
92
93 remove_duplicate_scores = Conf.remove_duplicate_scores
94 print_matrix = Conf.print_matrix
95 anzpath = Conf.anzpath
96
97 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
98 #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
99 param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_16.pickle'
100 param = load_param(param_filename)
101
102 # Set the parameters such as limits penalties for the Plifs
103 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
104
105 # stores the number of alignments done for each example (best path, second-best path etc.)
106 num_path = [anzpath]*N
107
108 #############################################################################################
109 # Training
110 #############################################################################################
111 self.plog('Starting training...\n')
112
113 donSP = Conf.numDonSuppPoints
114 accSP = Conf.numAccSuppPoints
115 lengthSP = Conf.numLengthSuppPoints
116 mmatrixSP = Conf.sizeMatchmatrix[0]\
117 *Conf.sizeMatchmatrix[1]
118 numq = Conf.numQualSuppPoints
119 totalQualSP = Conf.totalQualSuppPoints
120
121 currentPhi = zeros((Conf.numFeatures,1))
122 totalQualityPenalties = zeros((totalQualSP,1))
123
124 for exampleIdx in range(self.numExamples):
125
126 dna = Sequences[exampleIdx]
127 est = Ests[exampleIdx]
128 currentSplitPos = SplitPos[exampleIdx]
129
130 if Conf.mode == 'normal':
131 quality = [40]*len(est)
132
133 if Conf.mode == 'using_quality_scores':
134 quality = Qualities[exampleIdx]
135
136 exons = Exons[exampleIdx]
137 # NoiseMatrix = Noises[exampleIdx]
138 don_supp = Donors[exampleIdx]
139 acc_supp = Acceptors[exampleIdx]
140
141 if exons[-1,1] > len(dna):
142 continue
143
144 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
145 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
146
147 # Calculate the weights
148 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
149 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
150
151 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
152 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
153 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
154 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
155
156 if Conf.mode == 'using_quality_scores':
157 totalQualityPenalties = param[-totalQualSP:]
158 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
159
160 # Calculate w'phi(x,y) the total score of the alignment
161 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
162
163 # The allWeights vector is supposed to store the weight parameter
164 # of the true alignment as well as the weight parameters of the
165 # 1 other alignments
166 allWeights = zeros((Conf.numFeatures,1+1))
167 allWeights[:,0] = trueWeight[:,0]
168
169 AlignmentScores = [0.0]*(1+1)
170 AlignmentScores[0] = trueAlignmentScore
171
172 ################## Calculate wrong alignment(s) ######################
173
174 # Compute donor, acceptor with penalty_lookup_new
175 # returns two double lists
176 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
177
178 #myalign wants the acceptor site on the g of the ag
179 acceptor = acceptor[1:]
180 acceptor.append(-inf)
181
182 # for now we don't use donor/acceptor scores
183 #donor = [-inf] * len(donor)
184 #acceptor = [-inf] * len(donor)
185
186 dna = str(dna)
187 est = str(est)
188 dna_len = len(dna)
189 est_len = len(est)
190
191 ps = h.convert2SWIG()
192
193 prb = QPalmaDP.createDoubleArrayFromList(quality)
194 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
195
196 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
197 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
198
199 d_len = len(donor)
200 donor = QPalmaDP.createDoubleArrayFromList(donor)
201 a_len = len(acceptor)
202 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
203
204 # Create the alignment object representing the interface to the C/C++ code.
205 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
206
207 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
208
209 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
210 currentAlignment.myalign(1, dna, dna_len,\
211 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
212 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
213 print_matrix)
214
215 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
216 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
217 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
218 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
219
220 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
221
222 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
223 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
224
225 newSpliceAlign = zeros((dna_len,1))
226 newEstAlign = zeros((est_len,1))
227 newWeightMatch = zeros((mm_len,1))
228 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
229
230 #print 'newSpliceAlign'
231 for i in range(dna_len):
232 newSpliceAlign[i] = c_SpliceAlign[i]
233 # print '%f' % (spliceAlign[i])
234
235 #print 'newEstAlign'
236 for i in range(est_len):
237 newEstAlign[i] = c_EstAlign[i]
238 # print '%f' % (spliceAlign[i])
239
240 #print 'weightMatch'
241 for i in range(mm_len):
242 newWeightMatch[i] = c_WeightMatch[i]
243 # print '%f' % (weightMatch[i])
244
245 newDPScores = c_DPScores[0]
246
247 for i in range(Conf.totalQualSuppPoints):
248 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
249
250 # equals palma up to here
251
252 #print "Calling destructors"
253 del c_SpliceAlign
254 del c_EstAlign
255 del c_WeightMatch
256 del c_DPScores
257 del c_qualityPlifsFeatures
258 del currentAlignment
259
260 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
261 newWeightMatch = newWeightMatch.reshape(1,mm_len)
262 # Calculate weights of the respective alignments Note that we are
263 # calculating n-best alignments without hamming loss, so we
264 # have to keep track which of the n-best alignments correspond to
265 # the true one in order not to incorporate a true alignment in the
266 # constraints. To keep track of the true and false alignments we
267 # define an array true_map with a boolean indicating the
268 # equivalence to the true alignment for each decoded alignment.
269 true_map = [0]*2
270 true_map[0] = 1
271 pathNr = 0
272
273 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
274
275 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
276 for qidx in range(Conf.totalQualSuppPoints):
277 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
278
279 # Gewichte in restliche Zeilen der Matrix speichern
280 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
281 allWeights[:,pathNr+1] = wp
282
283 hpen = mat(h.penalties).reshape(len(h.penalties),1)
284 dpen = mat(d.penalties).reshape(len(d.penalties),1)
285 apen = mat(a.penalties).reshape(len(a.penalties),1)
286 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
287
288 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
289
290 # Check wether scalar product + loss equals viterbi score
291 print '%f vs %f' % (newDPScores, AlignmentScores[pathNr+1])
292
293 # if the pathNr-best alignment is very close to the true alignment consider it as true
294 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
295 true_map[pathNr+1] = 1
296
297 pdb.set_trace()
298
299 evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
300
301 print 'Prediction completed'
302 self.logfh.close()
303
304 def fetch_genome_info(ginfo_filename):
305 if not os.path.exists(ginfo_filename):
306 cmd = ['']*4
307 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
308 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
309 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
310 cmd[3] = 'save genome_info.mat genome_info'
311 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
312
313 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
314 out,err = obj.communicate()
315 assert err == '', 'An error occured!\n%s'%err
316
317 ginfo = scipy.io.loadmat('genome_info.mat')
318 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
319 return ginfo['genome_info']
320
321 else:
322 return cPickle.load(open(ginfo_filename))
323
324 def plifs2param(h,d,a,mmatrix,qualityPlifs):
325 donSP = Conf.numDonSuppPoints
326 accSP = Conf.numAccSuppPoints
327 lengthSP = Conf.numLengthSuppPoints
328 mmatrixSP = Conf.sizeMatchmatrix[0]\
329 *Conf.sizeMatchmatrix[1]
330
331
332 param = zeros((Conf.numFeatures,1))
333 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
334 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
335 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
336 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
337
338 for idx in range(len(qualityPlifs)):
339 currentPlif = qualityPlifs[idx]
340 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
341 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
342 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
343
344 return param
345
346 def load_param(filename):
347 param = None
348 #try:
349 # para = cPickle.load(open(filename))
350 #except:
351 # print 'Error: Could not open parameter file!'
352 # sys.exit(1)
353 #
354 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
355
356 param = cPickle.load(open(filename))
357 return param
358
359 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
360 newExons = []
361 oldElem = -1
362 SpliceAlign = SpliceAlign.flatten().tolist()[0]
363 SpliceAlign.append(-1)
364 for pos,elem in enumerate(SpliceAlign):
365 if pos == 0:
366 oldElem = -1
367 else:
368 oldElem = SpliceAlign[pos-1]
369
370 if oldElem != 0 and elem == 0:
371 newExons.append(pos)
372
373 if oldElem == 0 and elem != 0:
374 newExons.append(pos-1)
375
376 pdb.set_trace()
377
378 if __name__ == '__main__':
379 qpalma = QPalma()
380 qpalma.run()