217561bb5dc985ff6e3ad98a24131ed14144f6cd
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 from numpy.matlib import mat,zeros,ones
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 from computeSpliceWeights import *
14 import QPalmaDP
15 from SIQP_CPX import SIQPSolver
16
17 from penalty_lookup_new import *
18 from compute_donacc import *
19
20 from TrainingParam import Param
21
22 """
23 A training method for the QPalma project
24
25 Overall procedure:
26
27 1. Load the data -> via paths_load_data
28 2. Create a QP using SIQP (paths_create_qp)
29 3. Set initial params using set_param_palma -> palma_utils
30 4. computeSpliceAlign
31 5. compute_SpliceAlignLocal
32 6. computeSpliceWeights
33 7. myalign_local
34
35 """
36
37 def plog(fh,string):
38 fh.write(string)
39
40 def run():
41 ARGS = Param()
42
43 logfh = open('qpalma.log','w+')
44
45 gen_file= '%s/genome.config' % ARGS.basedir
46
47 cmd = ['']*4
48 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
49 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
50 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
51 cmd[3] = 'save genome_info.mat genome_info'
52 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
53
54 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
55 out,err = obj.communicate()
56 assert err == '', 'An error occured!\n%s'%err
57
58 ginfo = scipy.io.loadmat('genome_info.mat')
59 genome_info = ginfo['genome_info']
60
61 plog(logfh,'genome_info.basedir is %s\n'%genome_info.basedir)
62
63 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
64
65 # number of training instances
66 N = len(Sequences)
67 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
68 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
69
70 plog(logfh,'Number of training examples: %d\n'% N)
71
72 random_N = 100 ; # number of constraints to generate per iteration
73 iteration_steps = 200 ; #upper bound on iteration steps
74
75 remove_duplicate_scores = 0
76 anzpath = 2
77 print_matrix = 0
78
79 # param = numpy.matlib.rand(126,1)
80 param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
81 [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
82 [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
83 [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
84 [ 0.69677236], [ 0.41673747], [ 0.564245 ], [ 0.37613432], [ 0.88805642],
85 [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
86 [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
87 [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
88 [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
89 [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
90 [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
91 [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
92 [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
93 [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
94 [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
95 [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
96 [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
97 [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
98 [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
99 [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
100 [ 0.72706009], [ 0.087196 ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
101 [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
102 [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
103 [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
104 [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
105 [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
106
107 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
108
109 # checked values till this position
110
111 # delete splicesite-score-information
112 if not ARGS.train_with_splicesitescoreinformation:
113 for i in range(len(Acceptors)):
114 if Acceptors[i] > -20:
115 Acceptors[i] = 1
116 if Donors[i] >-20:
117 Donors[i] = 1
118
119 #############################################################################################
120 # Training
121 #############################################################################################
122 plog(logfh,'Starting training...')
123
124 numExamples = N
125 C=1.0
126
127 numDonSuppPoints = 30
128 numAccSuppPoints = 30
129 numLengthSuppPoints = 30
130 sizeMMatrix = 36
131 numQualSuppPoints = 16*0
132
133 numFeatures = numDonSuppPoints + numAccSuppPoints + + numLengthSuppPoints\
134 + sizeMMatrix + numQualSuppPoints
135
136 qualityMatrix = zeros((numQualSuppPoints,1))
137
138 plog(logfh,'Initializing problem...\n')
139
140 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
141 num_path = anzpath*ones((1,N)) ; # nr of alignments done (best path, second-best path etc.)
142 gap = zeros((1,N))
143
144 plog(logfh,'Starting training...\n')
145
146 iteration_nr = 1
147
148 while True:
149 print 'Iteration step: %d'% iteration_nr
150
151 for exampleId in range(numExamples):
152 if exampleId % 1000 == 0:
153 print 'Current example nr %d' % exampleId
154
155 dna = Sequences[exampleId]
156 est = Ests[exampleId]
157
158 exons = Exons[exampleId]
159 # NoiseMatrix = Noises[exampleId]
160 don_supp = Donors[exampleId]
161 acc_supp = Acceptors[exampleId]
162
163 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
164
165 # trueSpliceAlign is equal
166 # trueWeightMatch is equal
167 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
168
169 #print d.limits
170 #print d.penalties
171 #print a.limits
172 #print a.penalties
173 #print h.limits
174 #print h.penalties
175
176 ####################### checked above values ##########################################
177
178 # Calculate
179 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
180
181 #print trueWeightDon.T
182 #print trueWeightAcc.T
183 #print trueWeightIntron.T
184
185 pdb.set_trace()
186
187 # Reshape currentW param
188 currentW = zeros((numFeatures,1))
189
190 currentW[0:numDonSuppPoints,0] = trueWeightDon[:,0]
191 currentW[numDonSuppPoints:numDonSuppPoints+numAccSuppPoints,0] = trueWeightAcc[:,0]
192 currentW[numDonSuppPoints+numAccSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints,0] = trueWeightIntron[:,0]
193 currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix,0] = trueWeightMatch[:,0]
194 currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix+numQualSuppPoints,0] = qualityMatrix[:,0]
195
196 currentPhi = zeros((numFeatures,1))
197 currentPhi[0:30] = d.penalties[:]
198 currentPhi[30:60] = a.penalties[:]
199 currentPhi[60:90] = h.penalties[:]
200 currentPhi[90:126] = mmatrix[:]
201 currentPhi[126:] = qualityMatrix[:]
202
203 # Calculate w'phi(x,y) the total score of the alignment
204 alignmentScore = currentW.T * currentPhi
205
206 #
207 # Calculate wrong alignments
208 #
209
210 # Compute donor, acceptor with penalty_lookup_new
211 # returns two double lists
212 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
213
214 #myalign wants the acceptor site on the g of the ag
215 #acceptor = [acceptor(2:end) -Inf] ;
216
217 nr_paths = 2
218 dna = 'acgtagct'
219 dna_len = len(dna)
220
221 est = 'acgtagct'
222 est_len = len(est)
223
224 matchmatrix = QPalmaDP.createDoubleArrayFromList([1.0]*36)
225 mm_len = 36
226 donor = QPalmaDP.createDoubleArrayFromList([1,2.0,4])
227 d_len = 3
228 acceptor = QPalmaDP.createDoubleArrayFromList([.1,2,4])
229 a_len = 3
230 remove_duplicate_scores = False
231 print_matrix = False
232
233 currentAlignment = QPalmaDP.Alignment()
234
235 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
236 currentAlignment.setQualityMatrix(qualityMat,numQualSuppPoints)
237 ps = h.convert2SWIG()
238
239 currentAlignment.myalign( nr_paths, dna, dna_len,\
240 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
241 acceptor, a_len, remove_duplicate_scores, print_matrix)
242
243 print 'after myalign call...'
244
245 # SpliceAlign = double(SpliceAlign') ; %column
246 # weightMatch = double(weightMatch') ;
247
248
249 iteration_nr += 1
250 break
251
252 logfh.close()
253
254 """
255 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
256 % Wrong Alignments
257 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
258 %test
259 for pfadNo = 1:num_path(id)
260 assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
261 new_weightMatch = zeros(1,36) ;
262 for iii = 1:length(dnaest{1,pfadNo})
263 if dnaest{2,pfadNo}(iii) ~= 6
264 new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
265 end
266 end
267 assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
268 assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
269 end
270
271 %assert that it is the right file
272 assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
273
274 %just some info
275
276 %Berechne Gewichte der durch Alignment berechneten Pfade
277 true_map = zeros(1,1+num_path(id)) ;
278 true_map(1) = 1 ;
279 path_loss=[] ;
280 path_loss(1) = 0 ;
281 for pfadNo = 1:num_path(id)
282 dna_numbers = dnaest{1,pfadNo} ;
283 est_numbers = dnaest{2,pfadNo} ;
284
285 [weightDon, weightAcc, weightIntron] = ...
286 compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
287
288 path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
289
290 %Gewichte in restliche Zeilen der Matrix speichern
291 Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
292
293 AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
294
295 %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
296 assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
297
298 if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
299 true_map(pfadNo+1)=1 ;
300 end
301 end
302
303 % the true label sequence should not have a larger score than the
304 % maximal one WHYYYYY?
305 if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
306 AlignmentScores
307 warning('true score > max score\n') ;
308 keyboard
309 end ;
310
311 %%%set_param_palma should have done this right
312 for z = 1:num_path(id)
313 assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
314 end
315
316 if all(true_map==1)
317 num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
318 end ;
319
320 %Choose true and first false alignment for extending A
321 Weights = Weights([1 min(find(true_map==0))], :) ;
322
323 %if there is a false alignment
324 if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
325 e=zeros(1,N) ;
326 e(id) = 1 ;
327 A=[A;
328 Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
329 b=[b;
330 -1] ;
331 gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
332 else
333 gap(id) = 0 ;
334 end ;
335 end
336 fprintf('\n') ;
337 #################################################################################
338
339 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
340
341 objValue,w,self.slacks = solver.solve()
342
343 sum_xis = 0
344 for elem in self.slacks:
345 sum_xis += elem
346
347 """
348 print 'Training completed'
349
350 if __name__ == '__main__':
351 run()