+ fixed some index bugs in the weights calculation
[qpalma.git] / python / qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import subprocess
6 import scipy.io
7 from paths_load_data import *
8 import pdb
9 from numpy.matlib import mat,zeros,ones,inf
10 from set_param_palma import *
11 from computeSpliceAlign import *
12
13 from computeSpliceWeights import *
14 import QPalmaDP
15 from SIQP_CPX import SIQPSolver
16
17 from penalty_lookup_new import *
18 from compute_donacc import *
19
20 from TrainingParam import Param
21
22 """
23 A training method for the QPalma project
24
25 Overall procedure:
26
27 1. Load the data -> via paths_load_data
28 2. Create a QP using SIQP (paths_create_qp)
29 3. Set initial params using set_param_palma -> palma_utils
30 4. computeSpliceAlign
31 5. compute_SpliceAlignLocal
32 6. computeSpliceWeights
33 7. myalign_local
34
35 """
36
37 def plog(fh,string):
38 fh.write(string)
39
40 def run():
41 ARGS = Param()
42
43 logfh = open('qpalma.log','w+')
44
45 gen_file= '%s/genome.config' % ARGS.basedir
46
47 cmd = ['']*4
48 cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
49 cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
50 cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
51 cmd[3] = 'save genome_info.mat genome_info'
52 full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
53
54 obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
55 out,err = obj.communicate()
56 assert err == '', 'An error occured!\n%s'%err
57
58 ginfo = scipy.io.loadmat('genome_info.mat')
59 genome_info = ginfo['genome_info']
60
61 plog(logfh,'genome_info.basedir is %s\n'%genome_info.basedir)
62
63 Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
64
65 # number of training instances
66 N = len(Sequences)
67 assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
68 and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
69
70 plog(logfh,'Number of training examples: %d\n'% N)
71
72 random_N = 100 ; # number of constraints to generate per iteration
73 iteration_steps = 200 ; #upper bound on iteration steps
74
75 remove_duplicate_scores = 0
76 anzpath = 2
77 print_matrix = 0
78
79 # param = numpy.matlib.rand(126,1)
80 param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
81 [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
82 [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
83 [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
84 [ 0.69677236], [ 0.41673747], [ 0.564245 ], [ 0.37613432], [ 0.88805642],
85 [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
86 [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
87 [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
88 [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
89 [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
90 [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
91 [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
92 [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
93 [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
94 [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
95 [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
96 [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
97 [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
98 [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
99 [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
100 [ 0.72706009], [ 0.087196 ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
101 [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
102 [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
103 [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
104 [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
105 [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
106
107 [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
108
109 # checked values till this position
110
111 # delete splicesite-score-information
112 if not ARGS.train_with_splicesitescoreinformation:
113 for i in range(len(Acceptors)):
114 if Acceptors[i] > -20:
115 Acceptors[i] = 1
116 if Donors[i] >-20:
117 Donors[i] = 1
118
119 #############################################################################################
120 # Training
121 #############################################################################################
122 plog(logfh,'Starting training...')
123
124 numExamples = N
125 C=1.0
126
127 numDonSuppPoints = 30
128 numAccSuppPoints = 30
129 numLengthSuppPoints = 30
130 sizeMMatrix = 36
131 numQualSuppPoints = 16*0
132
133 numFeatures = numDonSuppPoints + numAccSuppPoints + + numLengthSuppPoints\
134 + sizeMMatrix + numQualSuppPoints
135
136 qualityMatrix = zeros((numQualSuppPoints,1))
137
138 plog(logfh,'Initializing problem...\n')
139
140 #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
141 num_path = anzpath*ones((1,N)) ; # nr of alignments done (best path, second-best path etc.)
142 gap = zeros((1,N))
143
144 plog(logfh,'Starting training...\n')
145
146 iteration_nr = 1
147
148 while True:
149 print 'Iteration step: %d'% iteration_nr
150
151 for exampleId in range(numExamples):
152 if exampleId % 1000 == 0:
153 print 'Current example nr %d' % exampleId
154
155 dna = Sequences[exampleId]
156 est = Ests[exampleId]
157
158 exons = Exons[exampleId]
159 # NoiseMatrix = Noises[exampleId]
160 don_supp = Donors[exampleId]
161 acc_supp = Acceptors[exampleId]
162
163 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
164 trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
165
166 pdb.set_trace()
167
168 # Calculate
169 trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
170
171 # Reshape currentW param
172 currentW = zeros((numFeatures,1))
173
174 currentW[0:numDonSuppPoints,0] = trueWeightDon[:,0]
175 currentW[numDonSuppPoints:numDonSuppPoints+numAccSuppPoints,0] = trueWeightAcc[:,0]
176 currentW[numDonSuppPoints+numAccSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints,0] = trueWeightIntron[:,0]
177 currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix,0] = trueWeightMatch[:,0]
178 currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix+numQualSuppPoints,0] = qualityMatrix[:,0]
179
180 currentPhi = zeros((numFeatures,1))
181 currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
182 currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
183 currentPhi[60:90] = mat(h.penalties[:]).reshape(30,1)
184 currentPhi[90:126] = mmatrix[:]
185 currentPhi[126:] = qualityMatrix[:]
186
187 # Calculate w'phi(x,y) the total score of the alignment
188 alignmentScore = currentW.T * currentPhi
189
190 ################## Calculate wrong alignment(s) ######################
191
192 # Compute donor, acceptor with penalty_lookup_new
193 # returns two double lists
194 donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
195
196 #myalign wants the acceptor site on the g of the ag
197 acceptor = acceptor[1:]
198 acceptor.append(-inf)
199
200 ####################### checked above values ##########################################
201 nr_paths = 2
202
203 dna = str(dna)
204 est = str(est)
205 dna_len = len(dna)
206 est_len = len(est)
207
208 ps = h.convert2SWIG()
209
210 matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
211 mm_len = 36
212
213 d_len = len(donor)
214 donor = QPalmaDP.createDoubleArrayFromList(donor)
215 a_len = len(acceptor)
216 acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
217
218 remove_duplicate_scores = False
219 print_matrix = False
220
221 currentAlignment = QPalmaDP.Alignment()
222 qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
223 currentAlignment.setQualityMatrix(qualityMat,numQualSuppPoints)
224
225 print 'PYTHON: Calling myalign...'
226 # returns [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest]
227 currentAlignment.myalign( nr_paths, dna, dna_len,\
228 est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
229 acceptor, a_len, remove_duplicate_scores, print_matrix)
230
231 #newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*nr_paths))
232 #newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*nr_paths))
233 #newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*nr_paths))
234 #newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*nr_paths)
235
236 #currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
237
238 #print newSpliceAlign
239
240 print 'after myalign call...'
241
242 if exampleId == 2:
243 break
244
245 iteration_nr += 1
246 break
247
248 logfh.close()
249
250 """
251 const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
252
253 objValue,w,self.slacks = solver.solve()
254
255 sum_xis = 0
256 for elem in self.slacks:
257 sum_xis += elem
258
259 """
260 print 'Training completed'
261
262 if __name__ == '__main__':
263 run()