+ added c code from the palma project
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 import numpy.matlib
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 from computeSpliceWeights import *
14
15 """
16 A training method for the QPalma project
17
18 Overall :
19
20 1. Load the data
21 via paths_load_data
22
23 2. Create a QP using SIQP (paths_create_qp)
24
25 3. Set initial params using set_param_palma -> palma_utils
26
27 4. computeSpliceAlign
28
29 5. compute_SpliceAlignLocal
30
31 6. computeSpliceWeights
32
33 7. myalign_local
34
35 """
36
37 class Param:
38
39 def __init__(self):
40 """ default parameters """
41
42 self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
43 self.MAX_MEM = 31000;
44 self.LOCAL_ALIGN = 1;
45 self.init_param = 1;
46 self.train_with_splicesitescoreinformation = 1;
47 self.train_with_intronlengthinformation = 1;
48 self.C = 0.001;
49 self.microexon = 0;
50 self.prob = 0;
51 self.organism = 'elegans'
52 self.expt = 'training'
53
54 self.insertion_prob = self.prob/3 ;
55 self.deletion_prob = self.prob/3 ;
56 self.mutation_prob = self.prob/3 ;
57
58
59 #%basedir='/fml/ag-raetsch/share/projects/splicing/uta/'
60 #if ~exist('basedir','var')
61 # basedir='/fml/ag-raetsch/share/projects/palma/elegans/' ;
62 #end
63 #
64 #%basedir='/fml/ag-raetsch/share/projects/palma/human/' ;
65
66
67 class GenomeInfo:
68 """
69 GenomeInfo's init function replaces
70 init_genome
71
72 """
73
74 def __init__(self,gen_file):
75 """ load the info stored in genome.config """
76
77 self.est_fnames = g_est_fnames
78 self.cdna_fnames = g_cdna_fnames
79 self.flcdna_fnames=g_flcdna_fnames
80 self.contig_names=g_contig_names
81 self.flat_fnames=g_flat_fnames
82 self.fasta_fnames=g_fasta_fnames
83 self.annotation_fnames=g_annotation_fnames
84 self.alphabet=g_alphabet
85 self.basedir=g_basedir
86 self.max_intron_len = g_max_intron_len
87 self.min_intron_len = g_min_intron_len
88 self.merge_est_transcripts = g_merge_est_transcripts
89
90
91 def run():
92 ARGS = Param()
93
94 gen_file= '%s/genome.config' % ARGS.basedir
95
96 cmd = ['']*4
97 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
98 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
99 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
100 cmd[3] = 'save genome_info.mat genome_info'
101 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
102
103 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
104 out,err = obj.communicate()
105 assert err == '', 'An error occured!\n%s'%err
106
107 ginfo = scipy.io.loadmat('genome_info.mat')
108 genome_info = ginfo['genome_info']
109
110 print 'genome_info.basedir is %s\n'%genome_info.basedir
111
112 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
113
114 #############################################################################################
115 # Constants
116 #############################################################################################
117
118 # number of training instances
119 N = len(Sequences)
120 print 'Number of training examples: %d\n'% N
121
122 random_N = 100 ; # number of constraints to generate per iteration
123 iteration_steps = 200 ; #upper bound on iteration steps
124
125 # myalign
126 remove_duplicate_scores = 0
127 anzpath = 2
128 print_matrix = 0
129
130 param = numpy.matlib.rand(126,1)
131
132 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
133
134 #import palma.model
135 #currentModel = palma.model.parse_file(modelFile)
136
137 #import palma_utils
138 #h,d,a,mmatrix = palma_utils.set_param(currentModel)
139
140 #############################################################################################
141 # delete splicesite-score-information
142 #############################################################################################
143
144 #if ARGS.train_with_splicesitescoreinformation:
145 # for i in range(len(Acceptors)):
146 # Acceptors[i](Acceptors[{i]>-20)=1 ;
147 # Donors{i}(Donors{i}>-20)=1 ;
148 # end
149 #end
150
151 #############################################################################################
152 # Training
153 #############################################################################################
154 print 'Starting training...'
155
156 #The alignment matrix M looks like
157
158 # DNA
159
160 # A C G T -
161 # ------------
162 # A
163 #
164 #E C
165 #S
166 #T G
167
168 # T
169
170 # -
171
172
173 #where
174
175 # - is a gap,
176
177
178 numDonorSupportPoints = 30
179 numAcceptorSupportPoints = 30
180 numIntronLengthSupportPoints = 30
181 numQualityScoreSupportPoints = 0
182
183 numFeatures = numDonorSupportPoints\
184 + numAcceptorSupportPoints\
185 + numIntronLengthSupportPoints\
186 + numQualityScoreSupportPoints
187
188 numExamples = N
189
190 from SIQP_CPX import SIQPSolver
191 C=1.0
192
193 logfile = open('qpalma.log','w+')
194 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
195
196 #xis = zeros(1,N) ; #ninitialize slack variables
197 #num_path = anzpath*ones(1,N) ; # nr of alignments done (best path, second-best path etc.)
198 #gap = zeros(1,N) ; %? sum of differences between true alignment score and 'false' alignment scores
199
200
201 #for step_nr=1:iteration_steps,
202 iteration_nr = 1
203
204 while True:
205 print 'Iteration step: %d'% iteration_nr
206
207 for exampleId in range(numExamples):
208 if exampleId % 1000 == 0:
209 print 'Current example nr %d' % exampleId
210 dna = Sequences[exampleId]
211 est = Ests[exampleId]
212
213 exons = Exons[exampleId]
214 # NoiseMatrix = Noises[exampleId]
215 don_supp = Donors[exampleId]
216 acc_supp = Acceptors[exampleId]
217
218 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
219 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
220
221 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
222 pdb.set_trace()
223
224 #
225 # compute wrong alignments
226 #
227
228 # Compute donor, acceptor with penalty_lookup_new
229 # returns two double lists
230 [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
231
232 #myalign wants the acceptor site on the g of the ag
233 acceptor = [acceptor(2:end) -Inf] ;
234
235 # call myalign
236 [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
237 myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
238 remove_duplicate_scores, print_matrix);
239
240 SpliceAlign = double(SpliceAlign') ; %column
241 weightMatch = double(weightMatch') ;
242
243
244 iteration_nr += 1
245 break
246
247 """
248 for id = 1:N
249 %fprintf('%i\n', id) ;
250 if (mod(id,50)==0), fprintf(1,'.'); end
251
252 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
253 % True Alignment and Comparison with wrong ones
254 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
255
256 %Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
257 [true_SpliceAlign, true_weightMatch] = compute_SpliceAlign_local(dna, exons) ;
258 [true_weightDon, true_weightAcc, true_weightIntron] = ...
259 compute_SpliceWeights(d, a, h, true_SpliceAlign, don_supp, acc_supp) ;
260
261 double(true_weightMatch) ;
262
263 % Berechne Gewichtsmatrix fuer aktuelle id (matrix anlegen)
264 Weights = zeros(num_path(id)+1, 126) ; %first line: true alignment, rest: wrong alignments
265 Weights(1,:) = [true_weightIntron, true_weightDon, true_weightAcc, true_weightMatch] ;
266
267 %Score of whole alignment
268 AlignmentScores = zeros(1, num_path(id)+1) ; %first entry: true alignment
269 AlignmentScores(1) = Weights(1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
270
271 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
272 % Wrong Alignments
273 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
274 % Compute donor, acceptor with penalty_lookup_new
275 [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
276
277 %myalign wants the acceptor site on the g of the ag
278 acceptor = [acceptor(2:end) -Inf] ;
279
280 %call myalign
281 [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
282 myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
283 remove_duplicate_scores, print_matrix);
284
285 %[SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(1, dna, est, {h}, mmatrix, donor,
286 %acceptor, remove_duplicate_scores, print_matrix);
287
288 %return values (are int32, have to be double later cause we compute scores
289 SpliceAlign = double(SpliceAlign') ; %column
290 weightMatch = double(weightMatch') ;
291
292
293 %test
294 for pfadNo = 1:num_path(id)
295 assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
296 new_weightMatch = zeros(1,36) ;
297 for iii = 1:length(dnaest{1,pfadNo})
298 if dnaest{2,pfadNo}(iii) ~= 6
299 new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
300 end
301 end
302 assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
303 assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
304 %reshape(new_weightMatch,6,6)
305 %reshape(weightMatch(pfadNo,:),6,6)
306 end
307
308 %assert that it is the right file
309 assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
310
311 %just some info
312 %print_align(dnaest,1) ;
313 %exons(3,2)-exons(1,1)
314 %length(dnaest{1,1})
315 %keyboard
316
317 %Berechne Gewichte der durch Alignment berechneten Pfade
318 true_map = zeros(1,1+num_path(id)) ;
319 true_map(1) = 1 ;
320 path_loss=[] ;
321 path_loss(1) = 0 ;
322 for pfadNo = 1:num_path(id)
323 dna_numbers = dnaest{1,pfadNo} ;
324 est_numbers = dnaest{2,pfadNo} ;
325
326 [weightDon, weightAcc, weightIntron] = ...
327 compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
328
329 path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
330
331 %Gewichte in restliche Zeilen der Matrix speichern
332 Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
333
334 AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
335
336 %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
337 assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
338
339 if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
340 true_map(pfadNo+1)=1 ;
341 end
342 end
343
344 % the true label sequence should not have a larger score than the
345 % maximal one WHYYYYY?
346 if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
347 AlignmentScores
348 warning('true score > max score\n') ;
349 keyboard
350 end ;
351
352 %%%set_param_palma should have done this right
353 for z = 1:num_path(id)
354 assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
355 end
356
357 if all(true_map==1)
358 num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
359 end ;
360
361 %Choose true and first false alignment for extending A
362 Weights = Weights([1 min(find(true_map==0))], :) ;
363
364 %if there is a false alignment
365 if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
366 e=zeros(1,N) ;
367 e(id) = 1 ;
368 A=[A;
369 Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
370 b=[b;
371 -1] ;
372 gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
373 else
374 gap(id) = 0 ;
375 end ;
376 end
377 fprintf('\n') ;
378 #################################################################################
379 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
380
381 objValue,w,self.slacks = solver.solve()
382
383 # [param,xis] = paths_qp_solve(lpenv,Q,f,A,b,LB,UB);
384 # sum_xis = sum(xis)
385 # maxgap=max(gap)
386 [mean(xis>=1) mean(gap>=1) mean(xis>=1e-3) mean(gap>=1e-3)]
387
388 if max(gap)<=column_eps:
389 break
390
391 assert(all(maxgap<=column_eps))
392 assert(length(take_idx)==N)
393 """
394 print 'Training completed'
395
396 if __name__ == '__main__':
397 run()
398
399 """
400 def __init__(self):
401 numAcceptorParams = 0
402 numDonorParams = 0
403 numIntronLengthParams = 30
404
405 # how many supporting points do we have for the
406 # quality function per base
407 numQualityParamsPerBase = 5
408
409 # we have quality params for each possible
410 # for all possible combinations of
411 # {A,C,G,T} x {A,C,G,T,-}
412 # 4 x 5
413 numQualityParams = numQualityParamsPerBase * 4 * 5
414
415 # for all possible combinations of
416 # {A,C,G,T} x {A,C,G,T,-,*} plus
417 # {-} x {A,C,G,T,-}
418 # 1 x 5
419 numParams = numQualityParams + 1*5
420 """