+ started to extend fill_matrix in order to be able to use quality scores
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # This file contains the
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14
15 from numpy.matlib import mat,zeros,ones,inf
16 from numpy.linalg import norm
17
18 import QPalmaDP
19
20 from SIQP_CPX import SIQPSolver
21
22 from paths_load_data import *
23 from paths_load_data_pickle import *
24
25 from computeSpliceWeights import *
26 from set_param_palma import *
27 from computeSpliceAlign import *
28 from penalty_lookup_new import *
29 from compute_donacc import *
30 from TrainingParam import Param
31 from export_param import *
32
33 import Configuration
34
35
36
37 def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
38
39 allPlifs = [0]*numPlifs
40
41 for idx in range(numPlifs):
42
43 curPlif = Plf()
44 curPlif.limits = linspace(min_svm_score,max_svm_score,numSuppPoints)
45 curPlif.penalties = [0]*numSuppPoints
46 curPlif.transform = ''
47 curPlif.name = ''
48 curPlif.max_len = 100
49 curPlif.min_len = -100
50 curPlif.id = 1
51 curPlif.use_svm = 0
52 curPlif.next_id = 0
53
54 allPlifs[idx] = d
55
56 return allPlifs
57
58
59 class QPalma:
60 """
61 A training method for the QPalma project
62 """
63
64 def __init__(self):
65 self.ARGS = Param()
66
67 self.logfh = open('qpalma.log','w+')
68 gen_file= '%s/genome.config' % self.ARGS.basedir
69
70 cmd = ['']*4
71 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
72 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
73 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
74 cmd[3] = 'save genome_info.mat genome_info'
75 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
76
77 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
78 out,err = obj.communicate()
79 assert err == '', 'An error occured!\n%s'%err
80
81 ginfo = scipy.io.loadmat('genome_info.mat')
82 self.genome_info = ginfo['genome_info']
83
84 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
85
86 self.C=1.0
87
88 # 'normal' means work like Palma
89 # 'using_quality_scores' means work like Palma plus using sequencing
90 # quality scores
91 self.mode = 'normal'
92 #self.mode = 'using_quality_scores'
93
94 # Here we specify the total number of parameters.
95 # When using quality scores our scoring function is defined as
96 #
97 # f: S x R x S -> R
98 #
99 # as opposed to a usage without quality scores when we only have
100 #
101 # f: S x S -> R
102 #
103 self.numDonSuppPoints = 30
104 self.numAccSuppPoints = 30
105 self.numLengthSuppPoints = 30
106 if self.mode == 'normal':
107 self.sizeMMatrix = 36
108 elif self.mode == 'using_quality_scores':
109 self.sizeMMatrix = 728
110 else:
111 assert False, 'Wrong operation mode specified'
112
113 # this number defines the number of support points for one tuple (a,b)
114 # where 'a' comes with a quality score
115 self.numQualSuppPoints = 30
116 self.numQualSuppPoints = 0
117
118 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints\
119 + self.numLengthSuppPoints + self.sizeMMatrix
120
121 self.plog('Initializing problem...\n')
122
123
124 def plog(self,string):
125 self.logfh.write(string)
126
127
128 def run(self):
129 # Load the whole dataset
130 #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
131 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
132
133 # number of training instances
134 N = len(Sequences)
135 self.numExamples = N
136 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
137 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
138 self.plog('Number of training examples: %d\n'% N)
139
140 #iteration_steps = 200 ; #upper bound on iteration steps
141 iteration_steps = 2 ; #upper bound on iteration steps
142
143 remove_duplicate_scores = False
144 print_matrix = False
145 anzpath = 2
146
147 # Initialize parameter vector
148 # param = numpy.matlib.rand(126,1)
149 param = Configuration.fixedParam
150
151 # Set the parameters such as limits penalties for the Plifs
152 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
153
154 # delete splicesite-score-information
155 if not self.ARGS.train_with_splicesitescoreinformation:
156 for i in range(len(Acceptors)):
157 if Acceptors[i] > -20:
158 Acceptors[i] = 1
159 if Donors[i] >-20:
160 Donors[i] = 1
161
162 # Initialize solver
163 if not __debug__:
164 solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
165
166 # stores the number of alignments done for each example (best path, second-best path etc.)
167 num_path = [anzpath]*N
168 # stores the gap for each example
169 gap = [0.0]*N
170
171 qualityMatrix = zeros((self.numQualSuppPoints,1))
172
173 numPlifs = 24
174 numSuppPoints = 30
175 allMatchingPlifs = initializeQualityScoringFunctions(numPlifs,numSuppPoints)
176 # self.numQualSuppPoints
177
178 #############################################################################################
179 # Training
180 #############################################################################################
181 self.plog('Starting training...\n')
182
183 iteration_nr = 0
184
185 while True:
186 if iteration_nr == iteration_steps:
187 break
188
189 for exampleIdx in range(self.numExamples):
190 if (exampleIdx%10) == 0:
191 print 'Current example nr %d' % exampleIdx
192
193 dna = Sequences[exampleIdx]
194 est = Ests[exampleIdx]
195
196 exons = Exons[exampleIdx]
197 # NoiseMatrix = Noises[exampleIdx]
198 don_supp = Donors[exampleIdx]
199 acc_supp = Acceptors[exampleIdx]
200
201 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
202 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
203
204 # Calculate the weights
205 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
206 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
207
208 currentPhi = zeros((self.numFeatures,1))
209 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
210 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
211 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
212 currentPhi[90:126] = mmatrix[:]
213 currentPhi[126:] = qualityMatrix[:]
214
215 # Calculate w'phi(x,y) the total score of the alignment
216 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
217
218 # The allWeights vector is supposed to store the weight parameter
219 # of the true alignment as well as the weight parameters of the
220 # num_path[exampleIdx] other alignments
221 allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
222 allWeights[:,0] = trueWeight[:,0]
223
224 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
225 AlignmentScores[0] = trueAlignmentScore
226
227 ################## Calculate wrong alignment(s) ######################
228
229 # Compute donor, acceptor with penalty_lookup_new
230 # returns two double lists
231 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
232
233 #myalign wants the acceptor site on the g of the ag
234 acceptor = acceptor[1:]
235 acceptor.append(-inf)
236
237 dna = str(dna)
238 est = str(est)
239 dna_len = len(dna)
240 est_len = len(est)
241 ps = h.convert2SWIG()
242
243 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
244 mm_len = 36
245
246 d_len = len(donor)
247 donor = QPalmaDP.createDoubleArrayFromList(donor)
248 a_len = len(acceptor)
249 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
250
251 currentAlignment = QPalmaDP.Alignment()
252 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
253 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
254
255 currentAlignment.setMatchPlifs()
256
257 #print 'PYTHON: Calling myalign...'
258 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
259 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
260 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
261 acceptor, a_len, remove_duplicate_scores, print_matrix)
262 #print 'PYTHON: After myalign call...'
263
264 newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
265 newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
266 newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
267 newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
268
269 currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
270 del currentAlignment
271
272 spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
273 weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
274
275 #print 'spliceAlign'
276 #for i in range(dna_len*num_path[exampleIdx]):
277 # spliceAlign[i] = newSpliceAlign[i]
278 # print '%f' % (spliceAlign[i])
279
280 #print 'weightMatch'
281 #for i in range(mm_len*num_path[exampleIdx]):
282 # weightMatch[i] = newWeightMatch[i]
283 # print '%f' % (weightMatch[i])
284
285 for i in range(num_path[exampleIdx]):
286 AlignmentScores[i+1] = newAlignmentScores[i]
287
288 spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
289 weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
290 # Calculate weights of the respective alignments Note that we are
291 # calculating n-best alignments without any hamming loss, so we
292 # have to keep track which of the n-best alignments correspond to
293 # the true one in order not to incorporate a true alignment in the
294 # constraints. To keep track of the true and false alignments we
295 # define an array true_map with a boolean indicating the
296 # equivalence to the true alignment for each decoded alignment.
297 true_map = [0]*(num_path[exampleIdx]+1)
298 true_map[0] = 1
299 path_loss = [0]*(num_path[exampleIdx]+1)
300
301 for pathNr in range(num_path[exampleIdx]):
302 #dna_numbers = dnaest{1,pathNr}
303 #est_numbers = dnaest{2,pathNr}
304
305 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
306
307 # sum up positionwise loss between alignments
308 for alignPosIdx in range(len(spliceAlign[pathNr,:])):
309 if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
310 path_loss[pathNr+1] += 1
311
312 # Gewichte in restliche Zeilen der Matrix speichern
313 wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
314 allWeights[:,pathNr+1] = wp
315
316 hpen = mat(h.penalties).reshape(len(h.penalties),1)
317 dpen = mat(d.penalties).reshape(len(d.penalties),1)
318 apen = mat(a.penalties).reshape(len(a.penalties),1)
319
320 features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
321 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
322
323 # Check wether scalar product + loss equals viterbi score
324 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
325 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
326 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
327
328 # # if the pathNr-best alignment is very close to the true alignment consider it as true
329 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
330 true_map[pathNr+1] = 1
331
332 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
333
334 # this means that all n-best paths are to close to each other
335 # we have to extend the n-best search to a (n+1)-best
336 if len([elem for elem in true_map if elem == 1]) == len(true_map):
337 num_path[exampleIdx] = num_path[exampleIdx]+1
338
339 # Choose true and first false alignment for extending A
340 firstFalseIdx = -1
341 for map_idx,elem in enumerate(true_map):
342 if elem == 0:
343 firstFalseIdx = map_idx
344 break
345
346 # if there is at least one useful false alignment add the
347 # corresponding constraints to the optimization problem
348 if firstFalseIdx != -1:
349 trueWeights = allWeights[:,0]
350 firstFalseWeights = allWeights[:,firstFalseIdx]
351
352 # LMM.py code:
353 deltas = firstFalseWeights - trueWeights
354 if not __debug__:
355 const_added = solver.addConstraint(deltas, exampleIdx)
356 objValue,w,self.slacks = solver.solve()
357
358 sum_xis = 0
359 for elem in self.slacks:
360 sum_xis += elem
361
362 for i in range(len(param)):
363 param[i] = w[i]
364
365 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
366
367 #
368 # end of one example processing
369 #
370 #if exampleIdx == 100:
371 # break
372
373 #break
374
375 #
376 # end of one iteration through all examples
377 #
378 iteration_nr += 1
379
380 #
381 # end of optimization
382 #
383 export_param('elegans.param',h,d,a,mmatrix)
384 self.logfh.close()
385 print 'Training completed'
386
387 if __name__ == '__main__':
388 qpalma = QPalma()
389 qpalma.run()