#!/usr/bin/env python
# -*- coding: utf-8 -*-
+###########################################################
+#
+# This file containts the
+#
+###########################################################
+
import sys
import subprocess
import scipy.io
from compute_donacc import *
from TrainingParam import Param
-"""
-A training method for the QPalma project
-"""
-
-
+import Configuration
class QPalma:
+ """
+ A training method for the QPalma project
+ """
def __init__(self):
self.ARGS = Param()
def run(self):
+
+ # Load the whole dataset
Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
# number of training instances
self.plog('Number of training examples: %d\n'% N)
- random_N = 100 ; # number of constraints to generate per iteration
iteration_steps = 200 ; #upper bound on iteration steps
- remove_duplicate_scores = 0
+ remove_duplicate_scores = False
+ print_matrix = False
anzpath = 2
- print_matrix = 0
+ # Initialize parameter vector
# param = numpy.matlib.rand(126,1)
- param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
- [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
- [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
- [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
- [ 0.69677236], [ 0.41673747], [ 0.564245 ], [ 0.37613432], [ 0.88805642],
- [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
- [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
- [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
- [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
- [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
- [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
- [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
- [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
- [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
- [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
- [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
- [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
- [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
- [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
- [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
- [ 0.72706009], [ 0.087196 ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
- [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
- [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
- [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
- [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
- [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
+ param = Configuration.fixedParam
+ # Set the parameters such as limits penalties for the Plifs
[h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
- # checked values till this position
-
# delete splicesite-score-information
if not self.ARGS.train_with_splicesitescoreinformation:
for i in range(len(Acceptors)):
if Donors[i] >-20:
Donors[i] = 1
+ # Initialize solver
#problem = SIQPSolver(numFeatures,numExamples,C,logfile)
+
# stores the number of alignments done for each example (best path, second-best path etc.)
num_path = [anzpath]*N
# stores the gap for each example
gap = [0.0]*N
qualityMatrix = zeros((self.numQualSuppPoints,1))
+
#############################################################################################
# Training
#############################################################################################
- self.plog('Starting training...')
-
-
self.plog('Starting training...\n')
iteration_nr = 1
currentPhi[126:] = qualityMatrix[:]
# Calculate w'phi(x,y) the total score of the alignment
- alignmentScore = currentW.T * currentPhi
+ trueAlignmentScore = currentW.T * currentPhi
+
################## Calculate wrong alignment(s) ######################
+
# Compute donor, acceptor with penalty_lookup_new
# returns two double lists
donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
acceptor = acceptor[1:]
acceptor.append(-inf)
- ####################### checked above values ##########################################
dna = str(dna)
est = str(est)
dna_len = len(dna)
est_len = len(est)
-
ps = h.convert2SWIG()
matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
a_len = len(acceptor)
acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
- remove_duplicate_scores = False
- print_matrix = False
-
currentAlignment = QPalmaDP.Alignment()
qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
print 'PYTHON: Calling myalign...'
- # returns [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest]
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
acceptor, a_len, remove_duplicate_scores, print_matrix)
-
- print 'after myalign call...'
+ print 'PYTHON: After myalign call...'
newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
newEstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
- #Berechne Gewichte der durch Alignment berechneten Pfade
+ spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+ weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+
+ for i in range(dna_len*num_path[exampleIdx]):
+ spliceAlign[i] = newSpliceAlign[i]
+
+ for i in range(mm_len*num_path[exampleIdx]):
+ weightMatch[i] = newWeightMatch[i]
+
+ spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
+ weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
+
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without any hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
true_map = zeros((1,1+num_path[exampleIdx]))
true_map[0] = 1
path_loss = [0]*num_path[exampleIdx]
- # for pathNr in range(num_path(exampleIdx)):
- # #dna_numbers = dnaest{1,pathNr}
- # #est_numbers = dnaest{2,pathNr}
- #
- # weightDon, weightAcc, weightIntron = compute_SpliceWeights(d, a, h, SpliceAlign[pathNr,:], don_supp, acc_supp)
+ AlignmentScores = [0.0]*num_path[exampleIdx]
+ Weights = zeros((num_path[exampleIdx],self.numFeatures))
+
+ for pathNr in range(num_path[exampleIdx]):
+ #dna_numbers = dnaest{1,pathNr}
+ #est_numbers = dnaest{2,pathNr}
- # # sum up positionwise loss between alignments
- # for alignPosIdx in range(len(SpliceAlign[pathNr,:])):
- # if SpliceAlign[pathNr,alignPosIdx] != true_spliceAlign[alignPosIdx]:
- # path_loss[pathNr] += 1
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
- # # Gewichte in restliche Zeilen der Matrix speichern
- # Weights[pathNr+1,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
+ # sum up positionwise loss between alignments
+ for alignPosIdx in range(len(spliceAlign[pathNr,:])):
+ if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+ path_loss[pathNr] += 1
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
+ wp = self.createParameterVector(weightIntron, weightDon, weightAcc, weightMatch[pathNr,:], qualityMatrix)
+ Weights[pathNr,:] = wp.T
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+
+ features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
- # AlignmentScores(pathNr+1) = Weights(pathNr+1,:) * [h.penalties ; d.penalties ; a.penalties ; mmatrix(:)]
+ AlignmentScores[pathNr] = (Weights[pathNr,:] * features)[0,0]
- # # Check wether scalar product + loss equals viterbi score
- # assert(abs(Gesamtscores(pathNr) - AlignmentScores(pathNr+1)) < 1e-6)
+ # Check wether scalar product + loss equals viterbi score
+ # assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr]) < 1e-6, 'Scalar prod + loss is not equal Viterbi score'
- # if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
- # true_map(pathNr+1)=1
+ # if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
+ # true_map(pathNr+1)=1
#
# # the true label sequence should not have a larger score than the
# # maximal one WHYYYYY?
# keyboard
# end ;
- # # %%set_param_palma should have done this right
- # for z = 1:num_path[exampleIdx]
- # assert(abs(Weights(z,:) * param(1:126) -AlignmentScores(z)) <= 1e-6)
- # end
-
# if all(true_map==1)
# num_path[exampleIdx]=num_path[exampleIdx]+1 ; %next iteration step: one alignment more
# end ;
# gap[exampleIdx] = 0 ;
# end ;
+ # LMM.py code:
+ # deltas = Weights(2,:)-Weights(1,:) zeros(1,126) -e
+ # const_added = solver.addConstraint(deltas, idx)
+ #
+ # objValue,w,self.slacks = solver.solve()
+
+ # sum_xis = 0
+ # for elem in self.slacks:
+ # sum_xis += elem
+
if exampleIdx== 10:
break
break
self.logfh.close()
-
- """
- const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
-
- objValue,w,self.slacks = solver.solve()
-
- sum_xis = 0
- for elem in self.slacks:
- sum_xis += elem
-
- """
print 'Training completed'
if __name__ == '__main__':