+ using correct interface now
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 import numpy.matlib
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 from computeSpliceWeights import *
14 import QPalmaDP
15
16 from penalty_lookup import *
17 from compute_donacc import *
18
19 """
20 A training method for the QPalma project
21
22 Overall :
23
24 1. Load the data
25 via paths_load_data
26
27 2. Create a QP using SIQP (paths_create_qp)
28
29 3. Set initial params using set_param_palma -> palma_utils
30
31 4. computeSpliceAlign
32
33 5. compute_SpliceAlignLocal
34
35 6. computeSpliceWeights
36
37 7. myalign_local
38
39 """
40
41 class Param:
42
43 def __init__(self):
44 """ default parameters """
45
46 self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
47 self.MAX_MEM = 31000;
48 self.LOCAL_ALIGN = 1;
49 self.init_param = 1;
50 self.train_with_splicesitescoreinformation = 1;
51 self.train_with_intronlengthinformation = 1;
52 self.C = 0.001;
53 self.microexon = 0;
54 self.prob = 0;
55 self.organism = 'elegans'
56 self.expt = 'training'
57
58 self.insertion_prob = self.prob/3 ;
59 self.deletion_prob = self.prob/3 ;
60 self.mutation_prob = self.prob/3 ;
61
62 def run():
63 ARGS = Param()
64
65 logfh = open('qpalma.log','w+')
66
67 gen_file= '%s/genome.config' % ARGS.basedir
68
69 cmd = ['']*4
70 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
71 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
72 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
73 cmd[3] = 'save genome_info.mat genome_info'
74 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
75
76 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
77 out,err = obj.communicate()
78 assert err == '', 'An error occured!\n%s'%err
79
80 ginfo = scipy.io.loadmat('genome_info.mat')
81 genome_info = ginfo['genome_info']
82
83 plog(logfh,'genome_info.basedir is %s\n'%genome_info.basedir)
84
85 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
86
87 # number of training instances
88 N = len(Sequences)
89 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
90 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
91
92 plog(logfh,'Number of training examples: %d\n'% N)
93
94 random_N = 100 ; # number of constraints to generate per iteration
95 iteration_steps = 200 ; #upper bound on iteration steps
96
97 # myalign
98 remove_duplicate_scores = 0
99 anzpath = 2
100 print_matrix = 0
101
102 param = numpy.matlib.rand(126,1)
103
104 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
105
106 # delete splicesite-score-information
107 if not ARGS.train_with_splicesitescoreinformation:
108 for i in range(len(Acceptors)):
109 if Acceptors[i] > -20:
110 Acceptors[i] = 1
111 if Donors[i] >-20:
112 Donors[i] = 1
113
114 #############################################################################################
115 # Training
116 #############################################################################################
117 print 'Starting training...'
118
119 #The alignment matrix M looks like
120
121 # DNA
122
123 # A C G T -
124 # ------------
125 # A
126 #
127 #E C
128 #S
129 #T G
130
131 # T
132
133 # -
134
135
136 #where
137
138 # - is a gap,
139
140
141 numDonorSupportPoints = 30
142 numAcceptorSupportPoints = 30
143 numIntronLengthSupportPoints = 30
144 numQualityScoreSupportPoints = 0
145
146 numFeatures = numDonorSupportPoints\
147 + numAcceptorSupportPoints\
148 + numIntronLengthSupportPoints\
149 + numQualityScoreSupportPoints
150
151 numExamples = N
152
153 from SIQP_CPX import SIQPSolver
154 C=1.0
155
156 logfile = open('qpalma.log','w+')
157 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
158
159 #xis = zeros(1,N) ; #ninitialize slack variables
160 #num_path = anzpath*ones(1,N) ; # nr of alignments done (best path, second-best path etc.)
161 #gap = zeros(1,N) ; %? sum of differences between true alignment score and 'false' alignment scores
162
163
164 #for step_nr=1:iteration_steps,
165 iteration_nr = 1
166
167 while True:
168 print 'Iteration step: %d'% iteration_nr
169
170 for exampleId in range(numExamples):
171 if exampleId % 1000 == 0:
172 print 'Current example nr %d' % exampleId
173 dna = Sequences[exampleId]
174 est = Ests[exampleId]
175
176 exons = Exons[exampleId]
177 # NoiseMatrix = Noises[exampleId]
178 don_supp = Donors[exampleId]
179 acc_supp = Acceptors[exampleId]
180
181 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
182 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
183
184 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
185
186 #
187 # compute wrong alignments
188 #
189 nr_paths = 1
190 dna = 'acgtagct'
191 dna_len = len(dna)
192
193 est = 'acgtagct'
194 est_len = len(est)
195
196 matchmatrix = QPalmaDP.createDoubleArrayFromList([1.0]*36)
197 mm_len = 36
198 donor = QPalmaDP.createDoubleArrayFromList([1,2.0,4])
199 d_len = 3
200 acceptor = QPalmaDP.createDoubleArrayFromList([.1,2,4])
201 a_len = 3
202 remove_duplicate_scores = False
203 print_matrix = False
204
205 currentAlignment = QPalmaDP.Alignment()
206 ps = h.convert2SWIG()
207
208 currentAlignment.myalign( nr_paths, dna, dna_len,\
209 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
210 acceptor, a_len, remove_duplicate_scores, print_matrix)
211
212 print 'after myalign call...'
213
214 # # Compute donor, acceptor with penalty_lookup_new
215 # # returns two double lists
216 # [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
217
218 # #myalign wants the acceptor site on the g of the ag
219 # acceptor = [acceptor(2:end) -Inf] ;
220 #
221 # # call myalign
222 # [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
223 # myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
224 # remove_duplicate_scores, print_matrix);
225
226 # SpliceAlign = double(SpliceAlign') ; %column
227 # weightMatch = double(weightMatch') ;
228
229 iteration_nr += 1
230 break
231
232 logfh.close()
233
234 """
235 for id = 1:N
236 %fprintf('%i\n', id) ;
237 if (mod(id,50)==0), fprintf(1,'.'); end
238
239 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
240 % True Alignment and Comparison with wrong ones
241 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
242
243 %Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
244 [true_SpliceAlign, true_weightMatch] = compute_SpliceAlign_local(dna, exons) ;
245 [true_weightDon, true_weightAcc, true_weightIntron] = ...
246 compute_SpliceWeights(d, a, h, true_SpliceAlign, don_supp, acc_supp) ;
247
248 double(true_weightMatch) ;
249
250 % Berechne Gewichtsmatrix fuer aktuelle id (matrix anlegen)
251 Weights = zeros(num_path(id)+1, 126) ; %first line: true alignment, rest: wrong alignments
252 Weights(1,:) = [true_weightIntron, true_weightDon, true_weightAcc, true_weightMatch] ;
253
254 %Score of whole alignment
255 AlignmentScores = zeros(1, num_path(id)+1) ; %first entry: true alignment
256 AlignmentScores(1) = Weights(1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
257
258 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
259 % Wrong Alignments
260 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
261 % Compute donor, acceptor with penalty_lookup_new
262 [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
263
264 %myalign wants the acceptor site on the g of the ag
265 acceptor = [acceptor(2:end) -Inf] ;
266
267 %call myalign
268 [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
269 myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
270 remove_duplicate_scores, print_matrix);
271
272 %[SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(1, dna, est, {h}, mmatrix, donor,
273 %acceptor, remove_duplicate_scores, print_matrix);
274
275 %return values (are int32, have to be double later cause we compute scores
276 SpliceAlign = double(SpliceAlign') ; %column
277 weightMatch = double(weightMatch') ;
278
279
280 %test
281 for pfadNo = 1:num_path(id)
282 assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
283 new_weightMatch = zeros(1,36) ;
284 for iii = 1:length(dnaest{1,pfadNo})
285 if dnaest{2,pfadNo}(iii) ~= 6
286 new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
287 end
288 end
289 assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
290 assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
291 %reshape(new_weightMatch,6,6)
292 %reshape(weightMatch(pfadNo,:),6,6)
293 end
294
295 %assert that it is the right file
296 assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
297
298 %just some info
299 %print_align(dnaest,1) ;
300 %exons(3,2)-exons(1,1)
301 %length(dnaest{1,1})
302 %keyboard
303
304 %Berechne Gewichte der durch Alignment berechneten Pfade
305 true_map = zeros(1,1+num_path(id)) ;
306 true_map(1) = 1 ;
307 path_loss=[] ;
308 path_loss(1) = 0 ;
309 for pfadNo = 1:num_path(id)
310 dna_numbers = dnaest{1,pfadNo} ;
311 est_numbers = dnaest{2,pfadNo} ;
312
313 [weightDon, weightAcc, weightIntron] = ...
314 compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
315
316 path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
317
318 %Gewichte in restliche Zeilen der Matrix speichern
319 Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
320
321 AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
322
323 %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
324 assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
325
326 if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
327 true_map(pfadNo+1)=1 ;
328 end
329 end
330
331 % the true label sequence should not have a larger score than the
332 % maximal one WHYYYYY?
333 if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
334 AlignmentScores
335 warning('true score > max score\n') ;
336 keyboard
337 end ;
338
339 %%%set_param_palma should have done this right
340 for z = 1:num_path(id)
341 assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
342 end
343
344 if all(true_map==1)
345 num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
346 end ;
347
348 %Choose true and first false alignment for extending A
349 Weights = Weights([1 min(find(true_map==0))], :) ;
350
351 %if there is a false alignment
352 if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
353 e=zeros(1,N) ;
354 e(id) = 1 ;
355 A=[A;
356 Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
357 b=[b;
358 -1] ;
359 gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
360 else
361 gap(id) = 0 ;
362 end ;
363 end
364 fprintf('\n') ;
365 #################################################################################
366 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
367
368 objValue,w,self.slacks = solver.solve()
369
370 # [param,xis] = paths_qp_solve(lpenv,Q,f,A,b,LB,UB);
371 # sum_xis = sum(xis)
372 # maxgap=max(gap)
373 [mean(xis>=1) mean(gap>=1) mean(xis>=1e-3) mean(gap>=1e-3)]
374
375 if max(gap)<=column_eps:
376 break
377
378 assert(all(maxgap<=column_eps))
379 assert(length(take_idx)==N)
380 """
381 print 'Training completed'
382
383 if __name__ == '__main__':
384 run()
385
386 """
387 def __init__(self):
388 numAcceptorParams = 0
389 numDonorParams = 0
390 numIntronLengthParams = 30
391
392 # how many supporting points do we have for the
393 # quality function per base
394 numQualityParamsPerBase = 5
395
396 # we have quality params for each possible
397 # for all possible combinations of
398 # {A,C,G,T} x {A,C,G,T,-}
399 # 4 x 5
400 numQualityParams = numQualityParamsPerBase * 4 * 5
401
402 # for all possible combinations of
403 # {A,C,G,T} x {A,C,G,T,-,*} plus
404 # {-} x {A,C,G,T,-}
405 # 1 x 5
406 numParams = numQualityParams + 1*5
407 """