+ moved pythn data tools to new dir
[qpalma.git] / python / qpalma_predict.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 #
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14 import os.path
15
16 from numpy.matlib import mat,zeros,ones,inf
17 from numpy.linalg import norm
18
19 import QPalmaDP
20 from SIQP_CPX import SIQPSolver
21
22 from paths_load_data_pickle import *
23 from paths_load_data_solexa import *
24
25 from generateEvaluationData import *
26
27 from computeSpliceWeights import *
28 from set_param_palma import *
29 from computeSpliceAlignWithQuality import *
30 from penalty_lookup_new import *
31 from compute_donacc import *
32 from TrainingParam import Param
33 from export_param import *
34
35 import Configuration as Conf
36 from Plif import Plf
37 from Helpers import *
38
39 def getQualityFeatureCounts(qualityPlifs):
40 weightQuality = qualityPlifs[0].penalties
41 for currentPlif in qualityPlifs[1:]:
42 weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
43
44 return weightQuality
45
46 class QPalma:
47 """
48 A training method for the QPalma project
49 """
50
51 def __init__(self):
52 self.ARGS = Param()
53
54 self.logfh = open('qpalma.log','w+')
55 gen_file= '%s/genome.config' % self.ARGS.basedir
56
57 ginfo_filename = 'genome_info.pickle'
58 self.genome_info = fetch_genome_info(ginfo_filename)
59
60 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
61
62 #self.ARGS.train_with_splicesitescoreinformation = False
63
64 def plog(self,string):
65 self.logfh.write(string)
66 self.logfh.flush()
67
68 def run(self):
69 # Load the whole dataset
70 if Conf.mode == 'normal':
71 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
72 Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
73 use_quality_scores = False
74
75 elif Conf.mode == 'using_quality_scores':
76 Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
77 #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
78
79 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
80 pdb.set_trace()
81 #Qualities = []
82 #for i in range(len(Ests)):
83 # Qualities.append([40]*len(Ests[i]))
84 use_quality_scores = True
85 else:
86 assert(False)
87
88 # number of training instances
89 N = len(Sequences)
90 self.numExamples = N
91 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
92 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
93 self.plog('Number of training examples: %d\n'% N)
94 print 'Number of features: %d\n'% Conf.numFeatures
95
96 remove_duplicate_scores = Conf.remove_duplicate_scores
97 print_matrix = Conf.print_matrix
98 anzpath = Conf.anzpath
99
100 # Initialize parameter vector / param = numpy.matlib.rand(126,1)
101 #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
102 param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/param_16.pickle'
103 param = load_param(param_filename)
104
105 # Set the parameters such as limits penalties for the Plifs
106 [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
107
108 # stores the number of alignments done for each example (best path, second-best path etc.)
109 num_path = [anzpath]*N
110
111 #############################################################################################
112 # Training
113 #############################################################################################
114 self.plog('Starting training...\n')
115
116 donSP = Conf.numDonSuppPoints
117 accSP = Conf.numAccSuppPoints
118 lengthSP = Conf.numLengthSuppPoints
119 mmatrixSP = Conf.sizeMatchmatrix[0]\
120 *Conf.sizeMatchmatrix[1]
121 numq = Conf.numQualSuppPoints
122 totalQualSP = Conf.totalQualSuppPoints
123
124 currentPhi = zeros((Conf.numFeatures,1))
125 totalQualityPenalties = zeros((totalQualSP,1))
126
127 for exampleIdx in range(self.numExamples):
128
129 dna = Sequences[exampleIdx]
130 est = Ests[exampleIdx]
131
132 if Conf.mode == 'normal':
133 quality = [40]*len(est)
134
135 if Conf.mode == 'using_quality_scores':
136 quality = Qualities[exampleIdx]
137
138 exons = Exons[exampleIdx]
139 # NoiseMatrix = Noises[exampleIdx]
140 don_supp = Donors[exampleIdx]
141 acc_supp = Acceptors[exampleIdx]
142
143 if exons[-1,1] > len(dna):
144 continue
145
146 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
147 trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
148
149 # Calculate the weights
150 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
151 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
152
153 currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
154 currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
155 currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
156 currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
157
158 if Conf.mode == 'using_quality_scores':
159 totalQualityPenalties = param[-totalQualSP:]
160 currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
161
162 # Calculate w'phi(x,y) the total score of the alignment
163 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
164
165 # The allWeights vector is supposed to store the weight parameter
166 # of the true alignment as well as the weight parameters of the
167 # 1 other alignments
168 allWeights = zeros((Conf.numFeatures,1+1))
169 allWeights[:,0] = trueWeight[:,0]
170
171 AlignmentScores = [0.0]*(1+1)
172 AlignmentScores[0] = trueAlignmentScore
173
174 ################## Calculate wrong alignment(s) ######################
175
176 # Compute donor, acceptor with penalty_lookup_new
177 # returns two double lists
178 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
179
180 #myalign wants the acceptor site on the g of the ag
181 acceptor = acceptor[1:]
182 acceptor.append(-inf)
183
184 # for now we don't use donor/acceptor scores
185 #donor = [-inf] * len(donor)
186 #acceptor = [-inf] * len(donor)
187
188 dna = str(dna)
189 est = str(est)
190 dna_len = len(dna)
191 est_len = len(est)
192
193 ps = h.convert2SWIG()
194
195 prb = QPalmaDP.createDoubleArrayFromList(quality)
196 chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
197
198 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
199 mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
200
201 d_len = len(donor)
202 donor = QPalmaDP.createDoubleArrayFromList(donor)
203 a_len = len(acceptor)
204 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
205
206 # Create the alignment object representing the interface to the C/C++ code.
207 currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
208
209 c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
210
211 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
212 currentAlignment.myalign(1, dna, dna_len,\
213 est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
214 acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
215 print_matrix)
216
217 c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
218 c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
219 c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
220 c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
221
222 c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
223
224 currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
225 c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
226
227 newSpliceAlign = zeros((dna_len,1))
228 newEstAlign = zeros((est_len,1))
229 newWeightMatch = zeros((mm_len,1))
230 newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
231
232 #print 'newSpliceAlign'
233 for i in range(dna_len):
234 newSpliceAlign[i] = c_SpliceAlign[i]
235 # print '%f' % (spliceAlign[i])
236
237 #print 'newEstAlign'
238 for i in range(est_len):
239 newEstAlign[i] = c_EstAlign[i]
240 # print '%f' % (spliceAlign[i])
241
242 #print 'weightMatch'
243 for i in range(mm_len):
244 newWeightMatch[i] = c_WeightMatch[i]
245 # print '%f' % (weightMatch[i])
246
247 newDPScores = c_DPScores[0]
248
249 for i in range(Conf.totalQualSuppPoints):
250 newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
251
252 # equals palma up to here
253
254 #print "Calling destructors"
255 del c_SpliceAlign
256 del c_EstAlign
257 del c_WeightMatch
258 del c_DPScores
259 del c_qualityPlifsFeatures
260 del currentAlignment
261
262 newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
263 newWeightMatch = newWeightMatch.reshape(1,mm_len)
264 # Calculate weights of the respective alignments Note that we are
265 # calculating n-best alignments without hamming loss, so we
266 # have to keep track which of the n-best alignments correspond to
267 # the true one in order not to incorporate a true alignment in the
268 # constraints. To keep track of the true and false alignments we
269 # define an array true_map with a boolean indicating the
270 # equivalence to the true alignment for each decoded alignment.
271 true_map = [0]*2
272 true_map[0] = 1
273 pathNr = 0
274
275 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
276
277 decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
278 for qidx in range(Conf.totalQualSuppPoints):
279 decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
280
281 # Gewichte in restliche Zeilen der Matrix speichern
282 wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
283 allWeights[:,pathNr+1] = wp
284
285 hpen = mat(h.penalties).reshape(len(h.penalties),1)
286 dpen = mat(d.penalties).reshape(len(d.penalties),1)
287 apen = mat(a.penalties).reshape(len(a.penalties),1)
288 features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
289
290 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
291
292 # Check wether scalar product + loss equals viterbi score
293 print '%f vs %f' % (newDPScores, AlignmentScores[pathNr+1])
294
295 # if the pathNr-best alignment is very close to the true alignment consider it as true
296 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
297 true_map[pathNr+1] = 1
298
299 pdb.set_trace()
300
301 evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
302
303 print 'Prediction completed'
304 self.logfh.close()
305
306 def fetch_genome_info(ginfo_filename):
307 if not os.path.exists(ginfo_filename):
308 cmd = ['']*4
309 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
310 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
311 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
312 cmd[3] = 'save genome_info.mat genome_info'
313 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
314
315 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
316 out,err = obj.communicate()
317 assert err == '', 'An error occured!\n%s'%err
318
319 ginfo = scipy.io.loadmat('genome_info.mat')
320 cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
321 return ginfo['genome_info']
322
323 else:
324 return cPickle.load(open(ginfo_filename))
325
326 def plifs2param(h,d,a,mmatrix,qualityPlifs):
327 donSP = Conf.numDonSuppPoints
328 accSP = Conf.numAccSuppPoints
329 lengthSP = Conf.numLengthSuppPoints
330 mmatrixSP = Conf.sizeMatchmatrix[0]\
331 *Conf.sizeMatchmatrix[1]
332
333
334 param = zeros((Conf.numFeatures,1))
335 param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
336 param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
337 param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
338 param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
339
340 for idx in range(len(qualityPlifs)):
341 currentPlif = qualityPlifs[idx]
342 begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
343 end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
344 param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
345
346 return param
347
348 def load_param(filename):
349 param = None
350 #try:
351 # para = cPickle.load(open(filename))
352 #except:
353 # print 'Error: Could not open parameter file!'
354 # sys.exit(1)
355 #
356 #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
357
358 param = cPickle.load(open(filename))
359 return param
360
361 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
362 newExons = []
363 oldElem = -1
364 SpliceAlign = SpliceAlign.flatten().tolist()[0]
365 SpliceAlign.append(-1)
366 for pos,elem in enumerate(SpliceAlign):
367 if pos == 0:
368 oldElem = -1
369 else:
370 oldElem = SpliceAlign[pos-1]
371
372 if oldElem != 0 and elem == 0:
373 newExons.append(pos)
374
375 if oldElem == 0 and elem != 0:
376 newExons.append(pos-1)
377
378 pdb.set_trace()
379
380 if __name__ == '__main__':
381 qpalma = QPalma()
382 qpalma.run()