+ rewrote C interface for SWIG/Python
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 import numpy.matlib
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 from computeSpliceWeights import *
14 import QPalmaDP
15 from SIQP_CPX import SIQPSolver
16
17 from penalty_lookup import *
18 from compute_donacc import *
19
20 """
21 A training method for the QPalma project
22
23 Overall procedure:
24
25 1. Load the data -> via paths_load_data
26 2. Create a QP using SIQP (paths_create_qp)
27 3. Set initial params using set_param_palma -> palma_utils
28 4. computeSpliceAlign
29 5. compute_SpliceAlignLocal
30 6. computeSpliceWeights
31 7. myalign_local
32
33 """
34
35 class Param:
36
37 def __init__(self):
38 """ default parameters """
39
40 self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
41 self.MAX_MEM = 31000;
42 self.LOCAL_ALIGN = 1;
43 self.init_param = 1;
44 self.train_with_splicesitescoreinformation = 1;
45 self.train_with_intronlengthinformation = 1;
46 self.C = 0.001;
47 self.microexon = 0;
48 self.prob = 0;
49 self.organism = 'elegans'
50 self.expt = 'training'
51
52 self.insertion_prob = self.prob/3 ;
53 self.deletion_prob = self.prob/3 ;
54 self.mutation_prob = self.prob/3 ;
55
56 def run():
57 ARGS = Param()
58
59 logfh = open('qpalma.log','w+')
60
61 gen_file= '%s/genome.config' % ARGS.basedir
62
63 cmd = ['']*4
64 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
65 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
66 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
67 cmd[3] = 'save genome_info.mat genome_info'
68 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
69
70 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
71 out,err = obj.communicate()
72 assert err == '', 'An error occured!\n%s'%err
73
74 ginfo = scipy.io.loadmat('genome_info.mat')
75 genome_info = ginfo['genome_info']
76
77 plog(logfh,'genome_info.basedir is %s\n'%genome_info.basedir)
78
79 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
80
81 # number of training instances
82 N = len(Sequences)
83 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
84 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
85
86 plog(logfh,'Number of training examples: %d\n'% N)
87
88 random_N = 100 ; # number of constraints to generate per iteration
89 iteration_steps = 200 ; #upper bound on iteration steps
90
91 remove_duplicate_scores = 0
92 anzpath = 2
93 print_matrix = 0
94
95 param = numpy.matlib.rand(126,1)
96
97 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
98
99 # delete splicesite-score-information
100 if not ARGS.train_with_splicesitescoreinformation:
101 for i in range(len(Acceptors)):
102 if Acceptors[i] > -20:
103 Acceptors[i] = 1
104 if Donors[i] >-20:
105 Donors[i] = 1
106
107 #############################################################################################
108 # Training
109 #############################################################################################
110 plog(logfh,'Starting training...')
111
112 numExamples = N
113 C=1.0
114
115 numDonorSupportPoints = 30
116 numAcceptorSupportPoints = 30
117 numIntronLengthSupportPoints = 30
118 sizeMMatrix = 36
119 numQualityScoreSupportPoints = 16*0
120
121 numFeatures = numDonorSupportPoints\
122 + numAcceptorSupportPoints\
123 + numIntronLengthSupportPoints\
124 + sizeMMatrix\
125 + numQualityScoreSupportPoints
126
127 qualityMatrix = [0.0]*numQualityScoreSupportPoints
128
129 plog(logfh,'Initializing problem...\n')
130 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
131 #num_path = anzpath*ones(1,N) ; # nr of alignments done (best path, second-best path etc.)
132 #gap = zeros(1,N) ; %? sum of differences between true alignment score and 'false' alignment scores
133 #for step_nr=1:iteration_steps,
134
135 plog(logfh,'Starting training...\n')
136
137 iteration_nr = 1
138
139 while True:
140 print 'Iteration step: %d'% iteration_nr
141
142 for exampleId in range(numExamples):
143 if exampleId % 1000 == 0:
144 print 'Current example nr %d' % exampleId
145
146 dna = Sequences[exampleId]
147 est = Ests[exampleId]
148
149 exons = Exons[exampleId]
150 # NoiseMatrix = Noises[exampleId]
151 don_supp = Donors[exampleId]
152 acc_supp = Acceptors[exampleId]
153
154 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
155 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
156
157 # Calculate
158 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
159
160 # Reshape currentW param
161 currentW = zeros((numFeatures,1))
162 currentW[0:numDonorSupportPoints,1] = trueWeigthDon[:]
163 currentW[0:numAcceptorSupportPoints,1] = trueWeigthAcc[:]
164 currentW[0:numIntronLengthSupportPoints,1] = trueWeigthIntron[:]
165 currentW[0:numDonorSupportPoints,1] = trueWeigthMatch[:]
166
167 #currentPhi = d.penalties a h mmatrix
168
169 # Calculate w'phi(x,y) the total score of the alignment
170 #alignmentScore = currentW.T * currentPhi
171
172 #
173 # Calculate wrong alignments
174 #
175 nr_paths = 2
176 dna = 'acgtagct'
177 dna_len = len(dna)
178
179 est = 'acgtagct'
180 est_len = len(est)
181
182 matchmatrix = QPalmaDP.createDoubleArrayFromList([1.0]*36)
183 mm_len = 36
184 donor = QPalmaDP.createDoubleArrayFromList([1,2.0,4])
185 d_len = 3
186 acceptor = QPalmaDP.createDoubleArrayFromList([.1,2,4])
187 a_len = 3
188 remove_duplicate_scores = False
189 print_matrix = False
190
191 currentAlignment = QPalmaDP.Alignment()
192
193 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
194 currentAlignment.setQualityMatrix(qualityMat,numQualitySupportPoints)
195 ps = h.convert2SWIG()
196
197 currentAlignment.myalign( nr_paths, dna, dna_len,\
198 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
199 acceptor, a_len, remove_duplicate_scores, print_matrix)
200
201 print 'after myalign call...'
202
203 # # Compute donor, acceptor with penalty_lookup_new
204 # # returns two double lists
205 # [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
206
207 # #myalign wants the acceptor site on the g of the ag
208 # acceptor = [acceptor(2:end) -Inf] ;
209 #
210 # # call myalign
211 # [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
212 # myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
213 # remove_duplicate_scores, print_matrix);
214
215 # SpliceAlign = double(SpliceAlign') ; %column
216 # weightMatch = double(weightMatch') ;
217
218 iteration_nr += 1
219 break
220
221 logfh.close()
222
223 """
224 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
225 % Wrong Alignments
226 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
227 % Compute donor, acceptor with penalty_lookup_new
228 [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
229
230 %myalign wants the acceptor site on the g of the ag
231 acceptor = [acceptor(2:end) -Inf] ;
232
233 %call myalign
234 [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
235 myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
236 remove_duplicate_scores, print_matrix);
237
238 %return values (are int32, have to be double later cause we compute scores
239 SpliceAlign = double(SpliceAlign') ; %column
240 weightMatch = double(weightMatch') ;
241
242 %test
243 for pfadNo = 1:num_path(id)
244 assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
245 new_weightMatch = zeros(1,36) ;
246 for iii = 1:length(dnaest{1,pfadNo})
247 if dnaest{2,pfadNo}(iii) ~= 6
248 new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
249 end
250 end
251 assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
252 assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
253 end
254
255 %assert that it is the right file
256 assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
257
258 %just some info
259
260 %Berechne Gewichte der durch Alignment berechneten Pfade
261 true_map = zeros(1,1+num_path(id)) ;
262 true_map(1) = 1 ;
263 path_loss=[] ;
264 path_loss(1) = 0 ;
265 for pfadNo = 1:num_path(id)
266 dna_numbers = dnaest{1,pfadNo} ;
267 est_numbers = dnaest{2,pfadNo} ;
268
269 [weightDon, weightAcc, weightIntron] = ...
270 compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
271
272 path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
273
274 %Gewichte in restliche Zeilen der Matrix speichern
275 Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
276
277 AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
278
279 %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
280 assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
281
282 if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
283 true_map(pfadNo+1)=1 ;
284 end
285 end
286
287 % the true label sequence should not have a larger score than the
288 % maximal one WHYYYYY?
289 if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
290 AlignmentScores
291 warning('true score > max score\n') ;
292 keyboard
293 end ;
294
295 %%%set_param_palma should have done this right
296 for z = 1:num_path(id)
297 assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
298 end
299
300 if all(true_map==1)
301 num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
302 end ;
303
304 %Choose true and first false alignment for extending A
305 Weights = Weights([1 min(find(true_map==0))], :) ;
306
307 %if there is a false alignment
308 if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
309 e=zeros(1,N) ;
310 e(id) = 1 ;
311 A=[A;
312 Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
313 b=[b;
314 -1] ;
315 gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
316 else
317 gap(id) = 0 ;
318 end ;
319 end
320 fprintf('\n') ;
321 #################################################################################
322
323 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
324
325 objValue,w,self.slacks = solver.solve()
326
327 sum_xis = 0
328 for elem in self.slacks:
329 sum_xis += elem
330
331 """
332
333 print 'Training completed'
334
335 if __name__ == '__main__':
336 run()