+ removed accidently checked in vim swap files
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 ###########################################################
5 #
6 # This file containts the
7 #
8 ###########################################################
9
10 import sys
11 import subprocess
12 import scipy.io
13 import pdb
14
15 from numpy.matlib import mat,zeros,ones,inf
16 from numpy.linalg import norm
17
18 import QPalmaDP
19
20 from SIQP_CPX import SIQPSolver
21
22 from paths_load_data import *
23 from computeSpliceWeights import *
24 from set_param_palma import *
25 from computeSpliceAlign import *
26 from penalty_lookup_new import *
27 from compute_donacc import *
28 from TrainingParam import Param
29
30 import Configuration
31
32 class QPalma:
33 """
34 A training method for the QPalma project
35 """
36
37 def __init__(self):
38 self.ARGS = Param()
39
40 self.logfh = open('qpalma.log','w+')
41
42 gen_file= '%s/genome.config' % self.ARGS.basedir
43
44 cmd = ['']*4
45 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
46 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
47 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
48 cmd[3] = 'save genome_info.mat genome_info'
49 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
50
51 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
52 out,err = obj.communicate()
53 assert err == '', 'An error occured!\n%s'%err
54
55 ginfo = scipy.io.loadmat('genome_info.mat')
56 self.genome_info = ginfo['genome_info']
57
58 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
59
60 self.C=1.0
61
62 self.numDonSuppPoints = 30
63 self.numAccSuppPoints = 30
64 self.numLengthSuppPoints = 30
65 self.sizeMMatrix = 36
66 self.numQualSuppPoints = 16*0
67
68 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints + self.numLengthSuppPoints\
69 + self.sizeMMatrix + self.numQualSuppPoints
70
71 self.plog('Initializing problem...\n')
72
73
74 def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix):
75 currentW = zeros((self.numFeatures,1))
76
77 donS = self.numDonSuppPoints
78 accS = self.numAccSuppPoints
79 lenS = self.numLengthSuppPoints
80 matS = self.sizeMMatrix
81 qualS = self.numQualSuppPoints
82
83 currentW[0:donS,0] = trueWeightDon[:,0]
84 currentW[donS:donS+accS,0] = trueWeightAcc[:,0]
85 currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0]
86 currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0]
87 currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0]
88
89 return currentW
90
91
92 def plog(self,string):
93 self.logfh.write(string)
94
95
96 def run(self):
97
98 # Load the whole dataset
99 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
100
101 # number of training instances
102 N = len(Sequences)
103 self.numExamples = N
104 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
105 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
106
107 self.plog('Number of training examples: %d\n'% N)
108
109 iteration_steps = 200 ; #upper bound on iteration steps
110
111 remove_duplicate_scores = False
112 print_matrix = False
113 anzpath = 2
114
115 # Initialize parameter vector
116 # param = numpy.matlib.rand(126,1)
117 param = Configuration.fixedParam
118
119 # Set the parameters such as limits penalties for the Plifs
120 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
121
122 # delete splicesite-score-information
123 if not self.ARGS.train_with_splicesitescoreinformation:
124 for i in range(len(Acceptors)):
125 if Acceptors[i] > -20:
126 Acceptors[i] = 1
127 if Donors[i] >-20:
128 Donors[i] = 1
129
130 # Initialize solver
131 solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
132
133 # stores the number of alignments done for each example (best path, second-best path etc.)
134 num_path = [anzpath]*N
135 # stores the gap for each example
136 gap = [0.0]*N
137
138 qualityMatrix = zeros((self.numQualSuppPoints,1))
139
140 #############################################################################################
141 # Training
142 #############################################################################################
143 self.plog('Starting training...\n')
144
145 iteration_nr = 1
146
147 while True:
148 print 'Iteration step: %d'% iteration_nr
149
150 for exampleIdx in range(self.numExamples):
151 if exampleIdx% 1000 == 0:
152 print 'Current example nr %d' % exampleIdx
153
154 dna = Sequences[exampleIdx]
155 est = Ests[exampleIdx]
156
157 exons = Exons[exampleIdx]
158 # NoiseMatrix = Noises[exampleIdx]
159 don_supp = Donors[exampleIdx]
160 acc_supp = Acceptors[exampleIdx]
161
162 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
163 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
164
165 # Calculate the weights
166 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
167
168 #trueWeight = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix)
169 trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
170
171 #currentPhi = createFeatureVector()
172
173 currentPhi = zeros((self.numFeatures,1))
174 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
175 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
176 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
177 currentPhi[90:126] = mmatrix[:]
178 currentPhi[126:] = qualityMatrix[:]
179
180 # Calculate w'phi(x,y) the total score of the alignment
181 trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
182
183 # The allWeights vector is supposed to store the weight parameter
184 # of the true alignment as well as the weight parameters of the
185 # num_path[exampleIdx] other alignments
186 allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
187 allWeights[:,0] = trueWeight[:,0]
188
189 AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
190 AlignmentScores[0] = trueAlignmentScore
191
192 ################## Calculate wrong alignment(s) ######################
193
194 # Compute donor, acceptor with penalty_lookup_new
195 # returns two double lists
196 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
197
198 #myalign wants the acceptor site on the g of the ag
199 acceptor = acceptor[1:]
200 acceptor.append(-inf)
201
202 dna = str(dna)
203 est = str(est)
204 dna_len = len(dna)
205 est_len = len(est)
206 ps = h.convert2SWIG()
207
208 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
209 mm_len = 36
210
211 d_len = len(donor)
212 donor = QPalmaDP.createDoubleArrayFromList(donor)
213 a_len = len(acceptor)
214 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
215
216 currentAlignment = QPalmaDP.Alignment()
217 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
218 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
219
220 print 'PYTHON: Calling myalign...'
221 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
222 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
223 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
224 acceptor, a_len, remove_duplicate_scores, print_matrix)
225 print 'PYTHON: After myalign call...'
226
227 newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
228 newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
229 newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
230 newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
231
232 currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
233
234 spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
235 weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
236
237 for i in range(dna_len*num_path[exampleIdx]):
238 spliceAlign[i] = newSpliceAlign[i]
239
240 for i in range(mm_len*num_path[exampleIdx]):
241 weightMatch[i] = newWeightMatch[i]
242
243 for i in range(num_path[exampleIdx]):
244 AlignmentScores[i+1] = newAlignmentScores[i]
245
246 spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
247 weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
248
249 #pdb.set_trace()
250 #print spliceAlign[:,:]
251 #print weightMatch[:,:]
252 #print AlignmentScores
253
254 # Calculate weights of the respective alignments Note that we are
255 # calculating n-best alignments without any hamming loss, so we
256 # have to keep track which of the n-best alignments correspond to
257 # the true one in order not to incorporate a true alignment in the
258 # constraints. To keep track of the true and false alignments we
259 # define an array true_map with a boolean indicating the
260 # equivalence to the true alignment for each decoded alignment.
261 true_map = [0]*(num_path[exampleIdx]+1)
262 true_map[0] = 1
263 path_loss = [0]*(num_path[exampleIdx]+1)
264
265 for pathNr in range(num_path[exampleIdx]):
266 #dna_numbers = dnaest{1,pathNr}
267 #est_numbers = dnaest{2,pathNr}
268
269 weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
270
271 # sum up positionwise loss between alignments
272 for alignPosIdx in range(len(spliceAlign[pathNr,:])):
273 if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
274 path_loss[pathNr+1] += 1
275
276 # Gewichte in restliche Zeilen der Matrix speichern
277 # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
278 wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
279 allWeights[:,pathNr+1] = wp
280
281 #print allWeights[:,0]
282 #print allWeights[:,1]
283 #print allWeights[:,2]
284
285 hpen = mat(h.penalties).reshape(len(h.penalties),1)
286 dpen = mat(d.penalties).reshape(len(d.penalties),1)
287 apen = mat(a.penalties).reshape(len(a.penalties),1)
288
289 features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
290
291 AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
292
293 # Check wether scalar product + loss equals viterbi score
294 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
295 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
296 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
297
298 # # if the pathNr-best alignment is very close to the true alignment consider it as true
299 if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
300 true_map[pathNr+1] = 1
301
302 # # the true label sequence should not have a larger score than the
303 # # maximal one WHYYYYY?
304
305 # # this means that all n-best paths are to close to each other
306 # # we have to extend the n-best search to a (n+1)-best
307 if len([elem for elem in true_map if elem == 1]) == len(true_map):
308 num_path[exampleIdx] = num_path[exampleIdx]+1
309
310 # Choose true and first false alignment for extending A
311 firstFalseIdx = -1
312 for map_idx,elem in enumerate(true_map):
313 if elem == 0:
314 firstFalseIdx = map_idx
315 break
316
317 # if there is at least one useful false alignment add the
318 # corresponding constraints to the optimization problem
319 if firstFalseIdx != -1:
320 trueWeights = allWeights[:,0]
321 firstFalseWeights = allWeights[:,firstFalseIdx]
322
323 # if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps,
324 # e=zeros(1,N) ;
325 # e[exampleIdx] = 1 ;
326 # A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
327 # b=[b; -1] ;
328 # gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ;
329 # else
330 # gap[exampleIdx] = 0 ;
331 # end ;
332
333 # LMM.py code:
334 deltas = firstFalseWeights - trueWeights
335 const_added = solver.addConstraint(deltas, exampleIdx)
336
337 objValue,w,self.slacks = solver.solve()
338
339 sum_xis = 0
340 for elem in self.slacks:
341 sum_xis += elem
342
343 if exampleIdx==0:
344 break
345
346 iteration_nr += 1
347 break
348
349 self.logfh.close()
350 print 'Training completed'
351
352 if __name__ == '__main__':
353 qpalma = QPalma()
354 qpalma.run()