+ changed design
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 import pdb
8
9 from numpy.matlib import mat,zeros,ones,inf
10
11 import QPalmaDP
12
13 from SIQP_CPX import SIQPSolver
14
15 from paths_load_data import *
16 from computeSpliceWeights import *
17 from set_param_palma import *
18 from computeSpliceAlign import *
19 from penalty_lookup_new import *
20 from compute_donacc import *
21 from TrainingParam import Param
22
23 """
24 A training method for the QPalma project
25 """
26
27
28
29 class QPalma:
30
31 def __init__(self):
32 self.ARGS = Param()
33
34 self.logfh = open('qpalma.log','w+')
35
36 gen_file= '%s/genome.config' % self.ARGS.basedir
37
38 cmd = ['']*4
39 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
40 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
41 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
42 cmd[3] = 'save genome_info.mat genome_info'
43 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
44
45 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
46 out,err = obj.communicate()
47 assert err == '', 'An error occured!\n%s'%err
48
49 ginfo = scipy.io.loadmat('genome_info.mat')
50 self.genome_info = ginfo['genome_info']
51
52 self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
53
54 self.C=1.0
55
56 self.numDonSuppPoints = 30
57 self.numAccSuppPoints = 30
58 self.numLengthSuppPoints = 30
59 self.sizeMMatrix = 36
60 self.numQualSuppPoints = 16*0
61
62 self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints + self.numLengthSuppPoints\
63 + self.sizeMMatrix + self.numQualSuppPoints
64
65 self.plog('Initializing problem...\n')
66
67
68 def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix):
69 currentW = zeros((self.numFeatures,1))
70
71 donS = self.numDonSuppPoints
72 accS = self.numAccSuppPoints
73 lenS = self.numLengthSuppPoints
74 matS = self.sizeMMatrix
75 qualS = self.numQualSuppPoints
76
77 currentW[0:donS,0] = trueWeightDon[:,0]
78 currentW[donS:donS+accS,0] = trueWeightAcc[:,0]
79 currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0]
80 currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0]
81 currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0]
82
83 return currentW
84
85
86 def plog(self,string):
87 self.logfh.write(string)
88
89
90 def run(self):
91 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
92
93 # number of training instances
94 N = len(Sequences)
95 self.numExamples = N
96 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
97 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
98
99 self.plog('Number of training examples: %d\n'% N)
100
101 random_N = 100 ; # number of constraints to generate per iteration
102 iteration_steps = 200 ; #upper bound on iteration steps
103
104 remove_duplicate_scores = 0
105 anzpath = 2
106 print_matrix = 0
107
108 # param = numpy.matlib.rand(126,1)
109 param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
110 [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
111 [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
112 [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
113 [ 0.69677236], [ 0.41673747], [ 0.564245 ], [ 0.37613432], [ 0.88805642],
114 [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
115 [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
116 [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
117 [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
118 [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
119 [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
120 [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
121 [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
122 [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
123 [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
124 [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
125 [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
126 [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
127 [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
128 [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
129 [ 0.72706009], [ 0.087196 ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
130 [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
131 [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
132 [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
133 [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
134 [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
135
136 [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
137
138 # checked values till this position
139
140 # delete splicesite-score-information
141 if not self.ARGS.train_with_splicesitescoreinformation:
142 for i in range(len(Acceptors)):
143 if Acceptors[i] > -20:
144 Acceptors[i] = 1
145 if Donors[i] >-20:
146 Donors[i] = 1
147
148 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
149 # stores the number of alignments done for each example (best path, second-best path etc.)
150 num_path = [anzpath]*N
151 # stores the gap for each example
152 gap = [0.0]*N
153
154 qualityMatrix = zeros((self.numQualSuppPoints,1))
155 #############################################################################################
156 # Training
157 #############################################################################################
158 self.plog('Starting training...')
159
160
161 self.plog('Starting training...\n')
162
163 iteration_nr = 1
164
165 while True:
166 print 'Iteration step: %d'% iteration_nr
167
168 for exampleIdx in range(self.numExamples):
169 if exampleIdx% 1000 == 0:
170 print 'Current example nr %d' % exampleIdx
171
172 dna = Sequences[exampleIdx]
173 est = Ests[exampleIdx]
174
175 exons = Exons[exampleIdx]
176 # NoiseMatrix = Noises[exampleIdx]
177 don_supp = Donors[exampleIdx]
178 acc_supp = Acceptors[exampleIdx]
179
180 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
181 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
182
183 # Calculate the weights
184 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
185
186 # Reshape currentW param
187 currentW = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix)
188
189 #currentPhi = createFeatureVector()
190
191 currentPhi = zeros((self.numFeatures,1))
192 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
193 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
194 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
195 currentPhi[90:126] = mmatrix[:]
196 currentPhi[126:] = qualityMatrix[:]
197
198 # Calculate w'phi(x,y) the total score of the alignment
199 alignmentScore = currentW.T * currentPhi
200
201 ################## Calculate wrong alignment(s) ######################
202
203 # Compute donor, acceptor with penalty_lookup_new
204 # returns two double lists
205 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
206
207 #myalign wants the acceptor site on the g of the ag
208 acceptor = acceptor[1:]
209 acceptor.append(-inf)
210
211 ####################### checked above values ##########################################
212 dna = str(dna)
213 est = str(est)
214 dna_len = len(dna)
215 est_len = len(est)
216
217 ps = h.convert2SWIG()
218
219 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
220 mm_len = 36
221
222 d_len = len(donor)
223 donor = QPalmaDP.createDoubleArrayFromList(donor)
224 a_len = len(acceptor)
225 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
226
227 remove_duplicate_scores = False
228 print_matrix = False
229
230 currentAlignment = QPalmaDP.Alignment()
231 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
232 currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
233
234 print 'PYTHON: Calling myalign...'
235 # returns [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest]
236 currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
237 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
238 acceptor, a_len, remove_duplicate_scores, print_matrix)
239
240 print 'after myalign call...'
241
242 newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
243 newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
244 newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
245 newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
246
247 currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
248
249 #Berechne Gewichte der durch Alignment berechneten Pfade
250 true_map = zeros((1,1+num_path[exampleIdx]))
251 true_map[0] = 1
252 path_loss = [0]*num_path[exampleIdx]
253
254 # for pathNr in range(num_path(exampleIdx)):
255 # #dna_numbers = dnaest{1,pathNr}
256 # #est_numbers = dnaest{2,pathNr}
257 #
258 # weightDon, weightAcc, weightIntron = compute_SpliceWeights(d, a, h, SpliceAlign[pathNr,:], don_supp, acc_supp)
259
260 # # sum up positionwise loss between alignments
261 # for alignPosIdx in range(len(SpliceAlign[pathNr,:])):
262 # if SpliceAlign[pathNr,alignPosIdx] != true_spliceAlign[alignPosIdx]:
263 # path_loss[pathNr] += 1
264
265 # # Gewichte in restliche Zeilen der Matrix speichern
266 # Weights[pathNr+1,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
267
268 # AlignmentScores(pathNr+1) = Weights(pathNr+1,:) * [h.penalties ; d.penalties ; a.penalties ; mmatrix(:)]
269
270 # # Check wether scalar product + loss equals viterbi score
271 # assert(abs(Gesamtscores(pathNr) - AlignmentScores(pathNr+1)) < 1e-6)
272
273 # if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
274 # true_map(pathNr+1)=1
275 #
276 # # the true label sequence should not have a larger score than the
277 # # maximal one WHYYYYY?
278 # if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
279 # AlignmentScores
280 # warning('true score > max score\n') ;
281 # keyboard
282 # end ;
283
284 # # %%set_param_palma should have done this right
285 # for z = 1:num_path[exampleIdx]
286 # assert(abs(Weights(z,:) * param(1:126) -AlignmentScores(z)) <= 1e-6)
287 # end
288
289 # if all(true_map==1)
290 # num_path[exampleIdx]=num_path[exampleIdx]+1 ; %next iteration step: one alignment more
291 # end ;
292
293 # # Choose true and first false alignment for extending A
294 # Weights = Weights([1 min(find(true_map==0))], :) ;
295
296 # # if there is a false alignment
297 # if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps,
298 # e=zeros(1,N) ;
299 # e[exampleIdx] = 1 ;
300 # A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
301 # b=[b; -1] ;
302 # gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ;
303 # else
304 # gap[exampleIdx] = 0 ;
305 # end ;
306
307 if exampleIdx== 10:
308 break
309
310 iteration_nr += 1
311 break
312
313 self.logfh.close()
314
315 """
316 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
317
318 objValue,w,self.slacks = solver.solve()
319
320 sum_xis = 0
321 for elem in self.slacks:
322 sum_xis += elem
323
324 """
325 print 'Training completed'
326
327 if __name__ == '__main__':
328 qpalma = QPalma()
329 qpalma.run()