2 # -*- coding: utf-8 -*-
4 ###########################################################
6 # This file containts the
8 ###########################################################
15 from numpy
.matlib
import mat
,zeros
,ones
,inf
16 from numpy
.linalg
import norm
20 from SIQP_CPX
import SIQPSolver
22 from paths_load_data
import *
23 from computeSpliceWeights
import *
24 from set_param_palma
import *
25 from computeSpliceAlign
import *
26 from penalty_lookup_new
import *
27 from compute_donacc
import *
28 from TrainingParam
import Param
34 A training method for the QPalma project
40 self
.logfh
= open('qpalma.log','w+')
42 gen_file
= '%s/genome.config' % self
.ARGS
.basedir
45 cmd
[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
46 cmd
[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
47 cmd
[2] = 'genome_info = init_genome(\'%s\')' % gen_file
48 cmd
[3] = 'save genome_info.mat genome_info'
49 full_cmd
= "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd
[0],cmd
[1],cmd
[2],cmd
[3])
51 obj
= subprocess
.Popen(full_cmd
,shell
=True,stdout
=subprocess
.PIPE
,stderr
=subprocess
.PIPE
)
52 out
,err
= obj
.communicate()
53 assert err
== '', 'An error occured!\n%s'%err
55 ginfo
= scipy
.io
.loadmat('genome_info.mat')
56 self
.genome_info
= ginfo
['genome_info']
58 self
.plog('genome_info.basedir is %s\n'%self
.genome_info
.basedir
)
62 self
.numDonSuppPoints
= 30
63 self
.numAccSuppPoints
= 30
64 self
.numLengthSuppPoints
= 30
66 self
.numQualSuppPoints
= 16*0
68 self
.numFeatures
= self
.numDonSuppPoints
+ self
.numAccSuppPoints
+ self
.numLengthSuppPoints\
69 + self
.sizeMMatrix
+ self
.numQualSuppPoints
71 self
.plog('Initializing problem...\n')
74 def plog(self
,string
):
75 self
.logfh
.write(string
)
79 # Load the whole dataset
80 Sequences
, Acceptors
, Donors
, Exons
, Ests
, Noises
= paths_load_data('training',self
.genome_info
,self
.ARGS
)
82 #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
84 # number of training instances
87 assert N
== len(Acceptors
) and N
== len(Acceptors
) and N
== len(Exons
)\
88 and N
== len(Ests
), 'The Seq,Accept,Donor,.. arrays are of different lengths'
90 self
.plog('Number of training examples: %d\n'% N
)
92 iteration_steps
= 200 ; #upper bound on iteration steps
94 remove_duplicate_scores
= False
98 # Initialize parameter vector
99 # param = numpy.matlib.rand(126,1)
100 param
= Configuration
.fixedParam
102 # Set the parameters such as limits penalties for the Plifs
103 [h
,d
,a
,mmatrix
] = set_param_palma(param
,self
.ARGS
.train_with_intronlengthinformation
)
105 # delete splicesite-score-information
106 if not self
.ARGS
.train_with_splicesitescoreinformation
:
107 for i
in range(len(Acceptors
)):
108 if Acceptors
[i
] > -20:
115 solver
= SIQPSolver(self
.numFeatures
,self
.numExamples
,self
.C
,self
.logfh
)
117 # stores the number of alignments done for each example (best path, second-best path etc.)
118 num_path
= [anzpath
]*N
119 # stores the gap for each example
122 qualityMatrix
= zeros((self
.numQualSuppPoints
,1))
124 #############################################################################################
126 #############################################################################################
127 self
.plog('Starting training...\n')
132 print 'Iteration step: %d'% iteration_nr
134 for exampleIdx
in range(self
.numExamples
):
135 if exampleIdx
% 1000 == 0:
136 print 'Current example nr %d' % exampleIdx
138 dna
= Sequences
[exampleIdx
]
139 est
= Ests
[exampleIdx
]
141 exons
= Exons
[exampleIdx
]
142 # NoiseMatrix = Noises[exampleIdx]
143 don_supp
= Donors
[exampleIdx
]
144 acc_supp
= Acceptors
[exampleIdx
]
146 # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
147 trueSpliceAlign
, trueWeightMatch
= computeSpliceAlign(dna
, exons
)
149 # Calculate the weights
150 trueWeightDon
, trueWeightAcc
, trueWeightIntron
= computeSpliceWeights(d
, a
, h
, trueSpliceAlign
, don_supp
, acc_supp
)
151 trueWeight
= numpy
.vstack([trueWeightIntron
, trueWeightDon
, trueWeightAcc
, trueWeightMatch
, qualityMatrix
])
153 currentPhi
= zeros((self
.numFeatures
,1))
154 currentPhi
[0:30] = mat(d
.penalties
[:]).reshape(30,1)
155 currentPhi
[30:60] = mat(a
.penalties
[:]).reshape(30,1)
156 currentPhi
[60:90] = mat(h
.penalties
[:]).reshape(30,1)
157 currentPhi
[90:126] = mmatrix
[:]
158 currentPhi
[126:] = qualityMatrix
[:]
160 # Calculate w'phi(x,y) the total score of the alignment
161 trueAlignmentScore
= (trueWeight
.T
* currentPhi
)[0,0]
163 # The allWeights vector is supposed to store the weight parameter
164 # of the true alignment as well as the weight parameters of the
165 # num_path[exampleIdx] other alignments
166 allWeights
= zeros((self
.numFeatures
,num_path
[exampleIdx
]+1))
167 allWeights
[:,0] = trueWeight
[:,0]
169 AlignmentScores
= [0.0]*(num_path
[exampleIdx
]+1)
170 AlignmentScores
[0] = trueAlignmentScore
172 ################## Calculate wrong alignment(s) ######################
174 # Compute donor, acceptor with penalty_lookup_new
175 # returns two double lists
176 donor
, acceptor
= compute_donacc(don_supp
, acc_supp
, d
, a
)
178 #myalign wants the acceptor site on the g of the ag
179 acceptor
= acceptor
[1:]
180 acceptor
.append(-inf
)
186 ps
= h
.convert2SWIG()
188 matchmatrix
= QPalmaDP
.createDoubleArrayFromList(mmatrix
.flatten().tolist()[0])
192 donor
= QPalmaDP
.createDoubleArrayFromList(donor
)
193 a_len
= len(acceptor
)
194 acceptor
= QPalmaDP
.createDoubleArrayFromList(acceptor
)
196 currentAlignment
= QPalmaDP
.Alignment()
197 qualityMat
= QPalmaDP
.createDoubleArrayFromList(qualityMatrix
)
198 currentAlignment
.setQualityMatrix(qualityMat
,self
.numQualSuppPoints
)
200 print 'PYTHON: Calling myalign...'
201 # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
202 currentAlignment
.myalign( num_path
[exampleIdx
], dna
, dna_len
,\
203 est
, est_len
, ps
, matchmatrix
, mm_len
, donor
, d_len
,\
204 acceptor
, a_len
, remove_duplicate_scores
, print_matrix
)
205 print 'PYTHON: After myalign call...'
207 newSpliceAlign
= QPalmaDP
.createIntArrayFromList([0]*(dna_len
*num_path
[exampleIdx
]))
208 newEstAlign
= QPalmaDP
.createIntArrayFromList([0]*(est_len
*num_path
[exampleIdx
]))
209 newWeightMatch
= QPalmaDP
.createIntArrayFromList([0]*(mm_len
*num_path
[exampleIdx
]))
210 newAlignmentScores
= QPalmaDP
.createDoubleArrayFromList([.0]*num_path
[exampleIdx
])
214 currentAlignment
.getAlignmentResults(newSpliceAlign
, newEstAlign
, newWeightMatch
, newAlignmentScores
)
216 spliceAlign
= zeros((num_path
[exampleIdx
]*dna_len
,1))
217 weightMatch
= zeros((num_path
[exampleIdx
]*mm_len
,1))
220 for i
in range(dna_len
*num_path
[exampleIdx
]):
221 spliceAlign
[i
] = newSpliceAlign
[i
]
222 print '%f' % (spliceAlign
[i
])
225 for i
in range(mm_len
*num_path
[exampleIdx
]):
226 weightMatch
[i
] = newWeightMatch
[i
]
227 print '%f' % (weightMatch
[i
])
229 for i
in range(num_path
[exampleIdx
]):
230 AlignmentScores
[i
+1] = newAlignmentScores
[i
]
232 print AlignmentScores
234 spliceAlign
= spliceAlign
.reshape(num_path
[exampleIdx
],dna_len
)
235 weightMatch
= weightMatch
.reshape(num_path
[exampleIdx
],mm_len
)
236 # Calculate weights of the respective alignments Note that we are
237 # calculating n-best alignments without any hamming loss, so we
238 # have to keep track which of the n-best alignments correspond to
239 # the true one in order not to incorporate a true alignment in the
240 # constraints. To keep track of the true and false alignments we
241 # define an array true_map with a boolean indicating the
242 # equivalence to the true alignment for each decoded alignment.
243 true_map
= [0]*(num_path
[exampleIdx
]+1)
245 path_loss
= [0]*(num_path
[exampleIdx
]+1)
247 for pathNr
in range(num_path
[exampleIdx
]):
248 #dna_numbers = dnaest{1,pathNr}
249 #est_numbers = dnaest{2,pathNr}
251 weightDon
, weightAcc
, weightIntron
= computeSpliceWeights(d
, a
, h
, spliceAlign
[pathNr
,:].flatten().tolist()[0], don_supp
, acc_supp
)
253 # sum up positionwise loss between alignments
254 for alignPosIdx
in range(len(spliceAlign
[pathNr
,:])):
255 if spliceAlign
[pathNr
,alignPosIdx
] != trueSpliceAlign
[alignPosIdx
]:
256 path_loss
[pathNr
+1] += 1
258 # Gewichte in restliche Zeilen der Matrix speichern
259 wp
= numpy
.vstack([weightIntron
, weightDon
, weightAcc
, weightMatch
[pathNr
,:].T
, qualityMatrix
])
260 allWeights
[:,pathNr
+1] = wp
262 hpen
= mat(h
.penalties
).reshape(len(h
.penalties
),1)
263 dpen
= mat(d
.penalties
).reshape(len(d
.penalties
),1)
264 apen
= mat(a
.penalties
).reshape(len(a
.penalties
),1)
266 features
= numpy
.vstack([hpen
, dpen
, apen
, mmatrix
[:]])
267 AlignmentScores
[pathNr
+1] = (allWeights
[:,pathNr
+1].T
* features
)[0,0]
269 # Check wether scalar product + loss equals viterbi score
270 #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
271 #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
272 #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
274 # # if the pathNr-best alignment is very close to the true alignment consider it as true
275 if norm( allWeights
[:,0] - allWeights
[:,pathNr
+1] ) < 1e-5:
276 true_map
[pathNr
+1] = 1
278 # the true label sequence should not have a larger score than the maximal one WHYYYYY?
280 # this means that all n-best paths are to close to each other
281 # we have to extend the n-best search to a (n+1)-best
282 if len([elem
for elem
in true_map
if elem
== 1]) == len(true_map
):
283 num_path
[exampleIdx
] = num_path
[exampleIdx
]+1
285 # Choose true and first false alignment for extending A
287 for map_idx
,elem
in enumerate(true_map
):
289 firstFalseIdx
= map_idx
292 # if there is at least one useful false alignment add the
293 # corresponding constraints to the optimization problem
294 if firstFalseIdx
!= -1:
295 trueWeights
= allWeights
[:,0]
296 firstFalseWeights
= allWeights
[:,firstFalseIdx
]
299 deltas
= firstFalseWeights
- trueWeights
301 const_added
= solver
.addConstraint(deltas
, exampleIdx
)
302 objValue
,w
,self
.slacks
= solver
.solve()
305 for elem
in self
.slacks
:
315 print 'Training completed'
317 if __name__
== '__main__':