+ added first converted scripts
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 import numpy.matlib
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 """
14 A training method for the QPalma project
15
16 Overall :
17
18 1. Load the data
19 via paths_load_data
20
21 2. Create a QP using SIQP (paths_create_qp)
22
23 3. Set initial params using set_param_palma -> palma_utils
24
25 4. computeSpliceAlign
26
27 5. compute_SpliceAlignLocal
28
29 6. computeSpliceWeights
30
31 7. myalign_local
32
33 """
34
35 class Param:
36
37 def __init__(self):
38 """ default parameters """
39
40 self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
41 self.MAX_MEM = 31000;
42 self.LOCAL_ALIGN = 1;
43 self.init_param = 1;
44 self.train_with_splicesitescoreinformation = 1;
45 self.train_with_intronlengthinformation = 1;
46 self.C = 0.001;
47 self.microexon = 0;
48 self.prob = 0;
49 self.organism = 'elegans'
50 self.expt = 'training'
51
52 self.insertion_prob = self.prob/3 ;
53 self.deletion_prob = self.prob/3 ;
54 self.mutation_prob = self.prob/3 ;
55
56
57 #%basedir='/fml/ag-raetsch/share/projects/splicing/uta/'
58 #if ~exist('basedir','var')
59 # basedir='/fml/ag-raetsch/share/projects/palma/elegans/' ;
60 #end
61 #
62 #%basedir='/fml/ag-raetsch/share/projects/palma/human/' ;
63
64
65 class GenomeInfo:
66 """
67 GenomeInfo's init function replaces
68 init_genome
69
70 """
71
72 def __init__(self,gen_file):
73 """ load the info stored in genome.config """
74
75 self.est_fnames = g_est_fnames
76 self.cdna_fnames = g_cdna_fnames
77 self.flcdna_fnames=g_flcdna_fnames
78 self.contig_names=g_contig_names
79 self.flat_fnames=g_flat_fnames
80 self.fasta_fnames=g_fasta_fnames
81 self.annotation_fnames=g_annotation_fnames
82 self.alphabet=g_alphabet
83 self.basedir=g_basedir
84 self.max_intron_len = g_max_intron_len
85 self.min_intron_len = g_min_intron_len
86 self.merge_est_transcripts = g_merge_est_transcripts
87
88
89 def run():
90 ARGS = Param()
91
92 gen_file= '%s/genome.config' % ARGS.basedir
93
94 cmd = ['']*4
95 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
96 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
97 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
98 cmd[3] = 'save genome_info.mat genome_info'
99 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
100
101 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
102 out,err = obj.communicate()
103 assert err == '', 'An error occured!\n%s'%err
104
105 ginfo = scipy.io.loadmat('genome_info.mat')
106 genome_info = ginfo['genome_info']
107
108 print 'genome_info.basedir is %s\n'%genome_info.basedir
109
110 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
111
112 #############################################################################################
113 # Constants
114 #############################################################################################
115
116 # number of training instances
117 N = len(Sequences)
118 print 'Number of training examples: %d\n'% N
119
120 random_N = 100 ; # number of constraints to generate per iteration
121 iteration_steps = 200 ; #upper bound on iteration steps
122
123 # myalign
124 remove_duplicate_scores = 0
125 anzpath = 2
126 print_matrix = 0
127
128 param = numpy.matlib.rand(126,1)
129
130 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
131
132 #import palma.model
133 #currentModel = palma.model.parse_file(modelFile)
134
135 #import palma_utils
136 #h,d,a,mmatrix = palma_utils.set_param(currentModel)
137
138 #############################################################################################
139 # delete splicesite-score-information
140 #############################################################################################
141
142 #if ARGS.train_with_splicesitescoreinformation:
143 # for i in range(len(Acceptors)):
144 # Acceptors[i](Acceptors[{i]>-20)=1 ;
145 # Donors{i}(Donors{i}>-20)=1 ;
146 # end
147 #end
148
149 #############################################################################################
150 # Training
151 #############################################################################################
152 print 'Starting training...'
153
154 #The alignment matrix M looks like
155
156 # DNA
157
158 # A C G T -
159 # ------------
160 # A
161 #
162 #E C
163 #S
164 #T G
165
166 # T
167
168 # -
169
170
171 #where
172
173 # - is a gap,
174
175
176 numDonorSupportPoints = 30
177 numAcceptorSupportPoints = 30
178 numIntronLengthSupportPoints = 30
179 numQualityScoreSupportPoints = 0
180
181 numFeatures = numDonorSupportPoints\
182 + numAcceptorSupportPoints\
183 + numIntronLengthSupportPoints\
184 + numQualityScoreSupportPoints
185
186 numExamples = N
187
188 from SIQP_CPX import SIQPSolver
189 C=1.0
190
191 logfile = open('qpalma.log','w+')
192 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
193
194 #xis = zeros(1,N) ; #ninitialize slack variables
195 #num_path = anzpath*ones(1,N) ; # nr of alignments done (best path, second-best path etc.)
196 #gap = zeros(1,N) ; %? sum of differences between true alignment score and 'false' alignment scores
197
198
199 #for step_nr=1:iteration_steps,
200 iteration_nr = 1
201
202 while True:
203 print 'Iteration step: %d'% iteration_nr
204
205 for exampleId in range(numExamples):
206 if exampleId % 1000 == 0:
207 print 'Current example nr %d' % exampleId
208 dna = Sequences[exampleId]
209 est = Ests[exampleId]
210
211 exons = Exons[exampleId]
212 # NoiseMatrix = Noises[exampleId]
213 don_supp = Donors[exampleId]
214 acc_supp = Acceptors[exampleId]
215
216
217 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
218 true_SpliceAlign, true_weightMatch = computeSpliceAlign(dna, exons)
219
220 pdb.set_trace()
221
222 true_weightDon, true_weightAcc, true_weightIntron = computeSpliceWeights(d, a, h, true_SpliceAlign, don_supp, acc_supp)
223
224
225 iteration_nr += 1
226 break
227
228 """
229 for id = 1:N
230 %fprintf('%i\n', id) ;
231 if (mod(id,50)==0), fprintf(1,'.'); end
232
233
234 %some assertions concerning the splicesites
235 idx_don = find(don_supp~=-Inf) ;
236 assert(all(dna(idx_don)=='g')) ;
237 assert(all(dna(idx_don(idx_don<length(dna))+1)=='t' | dna(idx_don(idx_don<length(dna))+1)=='c')) ;
238 idx_acc = find(acc_supp~=-Inf) ;
239 assert(all(dna(idx_acc(idx_acc>3)-2)=='a')) ;
240 assert(all(dna(idx_acc(idx_acc>2)-1)=='g')) ;
241
242 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
243 % True Alignment and Comparison with wrong ones
244 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
245
246 %Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
247 [true_SpliceAlign, true_weightMatch] = compute_SpliceAlign_local(dna, exons) ;
248 [true_weightDon, true_weightAcc, true_weightIntron] = ...
249 compute_SpliceWeights(d, a, h, true_SpliceAlign, don_supp, acc_supp) ;
250
251 double(true_weightMatch) ;
252
253 % Berechne Gewichtsmatrix fuer aktuelle id (matrix anlegen)
254 Weights = zeros(num_path(id)+1, 126) ; %first line: true alignment, rest: wrong alignments
255 Weights(1,:) = [true_weightIntron, true_weightDon, true_weightAcc, true_weightMatch] ;
256
257 %Score of whole alignment
258 AlignmentScores = zeros(1, num_path(id)+1) ; %first entry: true alignment
259 AlignmentScores(1) = Weights(1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
260
261 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
262 % Wrong Alignments
263 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
264 % Compute donor, acceptor with penalty_lookup_new
265 [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
266
267 %myalign wants the acceptor site on the g of the ag
268 acceptor = [acceptor(2:end) -Inf] ;
269
270 %call myalign
271 [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
272 myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
273 remove_duplicate_scores, print_matrix);
274
275 %[SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(1, dna, est, {h}, mmatrix, donor,
276 %acceptor, remove_duplicate_scores, print_matrix);
277
278 %return values (are int32, have to be double later cause we compute scores
279 SpliceAlign = double(SpliceAlign') ; %column
280 weightMatch = double(weightMatch') ;
281
282
283 %test
284 for pfadNo = 1:num_path(id)
285 assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
286 new_weightMatch = zeros(1,36) ;
287 for iii = 1:length(dnaest{1,pfadNo})
288 if dnaest{2,pfadNo}(iii) ~= 6
289 new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
290 end
291 end
292 assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
293 assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
294 %reshape(new_weightMatch,6,6)
295 %reshape(weightMatch(pfadNo,:),6,6)
296 end
297
298 %assert that it is the right file
299 assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
300
301 %just some info
302 %print_align(dnaest,1) ;
303 %exons(3,2)-exons(1,1)
304 %length(dnaest{1,1})
305 %keyboard
306
307 %Berechne Gewichte der durch Alignment berechneten Pfade
308 true_map = zeros(1,1+num_path(id)) ;
309 true_map(1) = 1 ;
310 path_loss=[] ;
311 path_loss(1) = 0 ;
312 for pfadNo = 1:num_path(id)
313 dna_numbers = dnaest{1,pfadNo} ;
314 est_numbers = dnaest{2,pfadNo} ;
315
316 [weightDon, weightAcc, weightIntron] = ...
317 compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
318
319 path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
320
321 %Gewichte in restliche Zeilen der Matrix speichern
322 Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
323
324 AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
325
326 %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
327 assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
328
329 if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
330 true_map(pfadNo+1)=1 ;
331 end
332 end
333
334 % the true label sequence should not have a larger score than the
335 % maximal one WHYYYYY?
336 if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
337 AlignmentScores
338 warning('true score > max score\n') ;
339 keyboard
340 end ;
341
342 %%%set_param_palma should have done this right
343 for z = 1:num_path(id)
344 assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
345 end
346
347 if all(true_map==1)
348 num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
349 end ;
350
351 %Choose true and first false alignment for extending A
352 Weights = Weights([1 min(find(true_map==0))], :) ;
353
354 %if there is a false alignment
355 if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
356 e=zeros(1,N) ;
357 e(id) = 1 ;
358 A=[A;
359 Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
360 b=[b;
361 -1] ;
362 gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
363 else
364 gap(id) = 0 ;
365 end ;
366 end
367 fprintf('\n') ;
368
369
370
371 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
372
373 objValue,w,self.slacks = solver.solve()
374
375 # [param,xis] = paths_qp_solve(lpenv,Q,f,A,b,LB,UB);
376 # sum_xis = sum(xis)
377 # maxgap=max(gap)
378 [mean(xis>=1) mean(gap>=1) mean(xis>=1e-3) mean(gap>=1e-3)]
379
380 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation) ;
381
382 save(paramfilename, ...
383 'param', 'h', 'd', 'a', 'mmatrix', 'A', 'b') ;
384
385 if max(gap)<=column_eps:
386 break
387
388
389 assert(all(maxgap<=column_eps))
390 assert(length(take_idx)==N)
391 """
392 print 'Training completed'
393
394 if __name__ == '__main__':
395 run()
396
397 """
398
399 def __init__(self):
400 numAcceptorParams = 0
401 numDonorParams = 0
402 numIntronLengthParams = 30
403
404 # how many supporting points do we have for the
405 # quality function per base
406 numQualityParamsPerBase = 5
407
408 # we have quality params for each possible
409 # for all possible combinations of
410 # {A,C,G,T} x {A,C,G,T,-}
411 # 4 x 5
412 numQualityParams = numQualityParamsPerBase * 4 * 5
413
414 # for all possible combinations of
415 # {A,C,G,T} x {A,C,G,T,-,*} plus
416 # {-} x {A,C,G,T,-}
417 # 1 x 5
418 numParams = numQualityParams + 1*5
419 """