+ added feature calculation for the labels
[qpalma.git] / python / qpalma.py
index 217561b..4efc708 100644 (file)
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+###########################################################
+#
+# 
+#
+###########################################################
+
 import sys
 import subprocess
 import scipy.io
-from paths_load_data import *
 import pdb
-from numpy.matlib import mat,zeros,ones
-from set_param_palma import *
-from computeSpliceAlign import *
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
 
-from computeSpliceWeights import *
 import QPalmaDP
+
 from SIQP_CPX import SIQPSolver
 
+from paths_load_data import *
+from paths_load_data_pickle import *
+
+from computeSpliceWeights import *
+from set_param_palma import *
+from computeSpliceAlign import *
+from computeSpliceAlignWithQuality import *
 from penalty_lookup_new import *
 from compute_donacc import *
-
 from TrainingParam import Param
+from export_param import *
 
-"""
-A training method for the QPalma project
-
-Overall procedure:
-   1. Load the data -> via paths_load_data
-   2. Create a QP using SIQP (paths_create_qp)
-   3. Set initial params using set_param_palma -> palma_utils
-   4. computeSpliceAlign
-   5. compute_SpliceAlignLocal
-   6. computeSpliceWeights
-   7. myalign_local
-
-"""
-
-def plog(fh,string):
-   fh.write(string)
-
-def run():
-   ARGS = Param()
-
-   logfh = open('qpalma.log','w+')
-
-   gen_file= '%s/genome.config' % ARGS.basedir
-
-   cmd = ['']*4
-   cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-   cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-   cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-   cmd[3] = 'save genome_info.mat genome_info'  
-   full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-   obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-   out,err = obj.communicate()
-   assert err == '', 'An error occured!\n%s'%err
-
-   ginfo = scipy.io.loadmat('genome_info.mat')
-   genome_info = ginfo['genome_info']
-
-   plog(logfh,'genome_info.basedir is %s\n'%genome_info.basedir)
-
-   Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',genome_info,ARGS)
-
-   # number of training instances
-   N = len(Sequences) 
-   assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
-   and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
-
-   plog(logfh,'Number of training examples: %d\n'% N)
-
-   random_N = 100 ; # number of constraints to generate per iteration
-   iteration_steps = 200 ; #upper bound on iteration steps
-
-   remove_duplicate_scores = 0
-   anzpath = 2
-   print_matrix = 0
-
-   # param = numpy.matlib.rand(126,1)
-   param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
-    [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
-    [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
-    [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
-    [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-    [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-    [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-    [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-    [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-    [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-    [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-    [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-    [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-    [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-    [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-    [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-    [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-    [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-    [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-    [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-    [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-    [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-    [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-    [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-    [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
-    [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
-
-   [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
-
-   # checked values till this position
-
-   # delete splicesite-score-information
-   if not ARGS.train_with_splicesitescoreinformation:
-      for i in range(len(Acceptors)):
-         if Acceptors[i] > -20:
-            Acceptors[i] = 1
-         if Donors[i] >-20:
-            Donors[i] = 1
-
-   #############################################################################################
-   # Training
-   #############################################################################################
-   plog(logfh,'Starting training...')
-
-   numExamples = N
-   C=1.0
-
-   numDonSuppPoints     = 30
-   numAccSuppPoints     = 30
-   numLengthSuppPoints  = 30 
-   sizeMMatrix          = 36
-   numQualSuppPoints    = 16*0
-
-   numFeatures = numDonSuppPoints + numAccSuppPoints + + numLengthSuppPoints\
-               + sizeMMatrix + numQualSuppPoints
-
-   qualityMatrix = zeros((numQualSuppPoints,1))
-   
-   plog(logfh,'Initializing problem...\n')
-
-   #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
-   num_path = anzpath*ones((1,N)) ; # nr of alignments done (best path, second-best path etc.)
-   gap = zeros((1,N))
+import Configuration
 
-   plog(logfh,'Starting training...\n')
+from Plif import Plf
 
-   iteration_nr = 1
+def getQualityFeatureCounts(qualityPlifs):
+   """
+   Stack the penalties of all given quality Plifs into one array.
+
+   Starts with the penalties of the first Plif and appends the penalties
+   of every further Plif with numpy.vstack. If only one Plif is given its
+   penalties object is returned unchanged; an empty sequence raises an
+   IndexError.
+   """
+   stackedPenalties = qualityPlifs[0].penalties
+   for plifIdx in range(1, len(qualityPlifs)):
+      stackedPenalties = numpy.vstack([stackedPenalties, qualityPlifs[plifIdx].penalties])
 
-   while True:
-      print 'Iteration step: %d'% iteration_nr
+   return stackedPenalties
 
-      for exampleId in range(numExamples):
-         if exampleId % 1000 == 0:
-            print 'Current example nr %d' % exampleId
 
-         dna = Sequences[exampleId] 
-         est = Ests[exampleId] 
+def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
+   """
+   Build a list of numPlifs quality scoring functions (Plf objects).
+
+   Every Plf gets limits = linspace(min_svm_score, max_svm_score,
+   numSuppPoints) and numSuppPoints zero penalties, and is converted with
+   convert2SWIG() before it is stored in the result list. The Plfs with
+   index 0 and 1 additionally get fixed values written to their first three
+   penalties, so numSuppPoints has to be at least 3 whenever numPlifs > 0.
+
+   Returns the list of converted Plfs.
+   """
 
-         exons = Exons[exampleId] 
-         # NoiseMatrix = Noises[exampleId] 
-         don_supp = Donors[exampleId] 
-         acc_supp = Acceptors[exampleId] 
+   # NOTE(review): min_intron_len and max_intron_len are not used in this function
+   min_intron_len=20
+   max_intron_len=1000
+   min_svm_score=-5
+   max_svm_score=5
 
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+   qualityPlifs = [None]*numPlifs
 
-         # trueSpliceAlign is equal
-         # trueWeightMatch is equal
-         trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+   for idx in range(numPlifs):
 
-         #print d.limits
-         #print d.penalties
-         #print a.limits
-         #print a.penalties
-         #print h.limits
-         #print h.penalties
+      # same settings for every Plf: score range as limits, all penalties zero
+      curPlif = Plf()
+      curPlif.limits    = linspace(min_svm_score,max_svm_score,numSuppPoints) 
+      curPlif.penalties = [0]*numSuppPoints
+      curPlif.transform = '' 
+      curPlif.name      = '' 
+      curPlif.max_len   = 100 
+      curPlif.min_len   = -100 
+      curPlif.id        = 1 
+      curPlif.use_svm   = 0 
+      curPlif.next_id   = 0 
 
-         ####################### checked above values ##########################################
-      
-         # Calculate
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-
-         #print trueWeightDon.T
-         #print trueWeightAcc.T
-         #print trueWeightIntron.T
+      # hard-coded penalties for the first two Plfs
+      # NOTE(review): presumably recognizable test values - confirm before real training
+      if idx == 0:
+         curPlif.penalties[0] = 11
+         curPlif.penalties[1] = 22
+         curPlif.penalties[2] = 33
 
-         pdb.set_trace()
+      if idx == 1:
+         curPlif.penalties[0] = 99
+         curPlif.penalties[1] = 100
+         curPlif.penalties[2] = 101
 
-         # Reshape currentW param 
-         currentW = zeros((numFeatures,1))
-      
-         currentW[0:numDonSuppPoints,0] = trueWeightDon[:,0]
-         currentW[numDonSuppPoints:numDonSuppPoints+numAccSuppPoints,0] = trueWeightAcc[:,0]
-         currentW[numDonSuppPoints+numAccSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints,0] = trueWeightIntron[:,0]
-         currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix,0] = trueWeightMatch[:,0]
-         currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix+numQualSuppPoints,0] = qualityMatrix[:,0]
-
-         currentPhi = zeros((numFeatures,1))
-         currentPhi[0:30]     = d.penalties[:]
-         currentPhi[30:60]    = a.penalties[:]
-         currentPhi[60:90]    = h.penalties[:]
-         currentPhi[90:126]   = mmatrix[:]
-         currentPhi[126:]   = qualityMatrix[:]
-
-         # Calculate w'phi(x,y) the total score of the alignment
-         alignmentScore = currentW.T * currentPhi
+      # store the converted object, not the Python-side Plf
+      curPlif = curPlif.convert2SWIG()
+      qualityPlifs[idx] = curPlif
 
-         #
-         # Calculate wrong alignments
-         #
+   return qualityPlifs
 
-         # Compute donor, acceptor with penalty_lookup_new
-         # returns two double lists
-         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-       
-         #myalign wants the acceptor site on the g of the ag
-         #acceptor = [acceptor(2:end) -Inf] ;
-
-         nr_paths = 2
-         dna = 'acgtagct'
-         dna_len = len(dna)
-
-         est = 'acgtagct'
-         est_len = len(est)
-
-         matchmatrix = QPalmaDP.createDoubleArrayFromList([1.0]*36)
-         mm_len = 36
-         donor = QPalmaDP.createDoubleArrayFromList([1,2.0,4])
-         d_len = 3
-         acceptor = QPalmaDP.createDoubleArrayFromList([.1,2,4])
-         a_len = 3
-         remove_duplicate_scores = False
-         print_matrix = False
-
-         currentAlignment = QPalmaDP.Alignment()
-
-         qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
-         currentAlignment.setQualityMatrix(qualityMat,numQualSuppPoints)
-         ps = h.convert2SWIG()
-
-         currentAlignment.myalign( nr_paths, dna, dna_len,\
-         est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
-         acceptor, a_len, remove_duplicate_scores, print_matrix)
+class QPalma:
+   """
+   A training method for the QPalma project
+   """
+   
+   def __init__(self):
+      """
+      Set up the training parameters, the log file and the genome info.
+
+      Creates self.ARGS from Param(), opens 'qpalma.log' for writing as
+      self.logfh, obtains self.genome_info via fetch_genome_info and logs
+      the basedir of the genome info.
+      """
+      self.ARGS = Param()
 
-         print 'after myalign call...'
+      self.logfh = open('qpalma.log','w+')
+      # path of the genome configuration; not used further in this method
+      genomeConfigPath = '%s/genome.config' % self.ARGS.basedir
 
-         #  SpliceAlign = double(SpliceAlign') ; %column
-         #  weightMatch = double(weightMatch') ;
+      self.genome_info = fetch_genome_info('genome_info.pickle')
 
+      basedirMessage = 'genome_info.basedir is %s\n'%self.genome_info.basedir
+      self.plog(basedirMessage)
 
-      iteration_nr += 1
-      break
 
-   logfh.close()
+   def plog(self,string):
+      """Write string to the log file handle stored in self.logfh."""
+      logHandle = self.logfh
+      logHandle.write(string)
 
-   """
-       %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-       % Wrong Alignments
-       %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%   
-       %test
-       for pfadNo = 1:num_path(id)
-         assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
-         new_weightMatch = zeros(1,36) ;
-         for iii = 1:length(dnaest{1,pfadNo})
-           if dnaest{2,pfadNo}(iii) ~= 6
-             new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii) + 1) = new_weightMatch(dnaest{1,pfadNo}(iii)*6 + dnaest{2,pfadNo}(iii)+1) + 1 ;
-           end
-         end
-         assert(all(new_weightMatch == weightMatch(pfadNo,:))) ;
-         assert(sum(new_weightMatch(7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
-       end
-       
-       %assert that it is the right file 
-       assert(all(dna(find(SpliceAlign(1,:)==1)) == 'g')) ;
-       
-       %just some info
-       
-       %Berechne Gewichte der durch Alignment berechneten Pfade    
-       true_map = zeros(1,1+num_path(id)) ;
-       true_map(1) = 1 ;
-       path_loss=[] ;
-       path_loss(1) = 0 ;
-       for pfadNo = 1:num_path(id)
-         dna_numbers = dnaest{1,pfadNo} ;
-         est_numbers = dnaest{2,pfadNo} ;
-           
-         [weightDon, weightAcc, weightIntron] = ...
-             compute_SpliceWeights(d, a, h, SpliceAlign(pfadNo,:), don_supp, acc_supp) ;
-         
-         path_loss(pfadNo+1) = sum(double(SpliceAlign(pfadNo,:))~=true_SpliceAlign) ; %not too simple?
-         
-         %Gewichte in restliche Zeilen der Matrix speichern
-         Weights(pfadNo+1,:) = [weightIntron, weightDon, weightAcc, weightMatch(pfadNo,:)] ;
-         
-         AlignmentScores(pfadNo+1) = Weights(pfadNo+1,:) * [h.penalties' ; d.penalties' ; a.penalties' ; mmatrix(:)] ;
-         
-         %Test, ob Alignprogr. auch das richtige Ergebnis liefert:
-         assert(abs(Gesamtscores(pfadNo) - AlignmentScores(pfadNo+1)) < 1e-6) ;
-        
-         if norm(Weights(1,:)-Weights(pfadNo+1,:))<1e-5,
-           true_map(pfadNo+1)=1 ;
-         end 
-       end
-          
-       % the true label sequence should not have a larger score than the
-       % maximal one WHYYYYY?
-       if AlignmentScores(1) >= max(AlignmentScores(2:end))+1e-6,
-         AlignmentScores
-         warning('true score > max score\n') ;
-         keyboard
-       end ;
-       
-       %%%set_param_palma should have done this right
-       for z = 1:num_path(id)
-         assert(abs(Weights(z,:) * param(1:126)' -AlignmentScores(z)) <= 1e-6) ; %abs: absolute value
-       end
-       
-       if all(true_map==1)
-         num_path(id)=num_path(id)+1 ; %next iteration step: one alignment more
-       end ;
-      
-       %Choose true and first false alignment for extending A
-       Weights = Weights([1 min(find(true_map==0))], :) ; 
-       
-       %if there is a false alignment
-       if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis(id)<1+column_eps,
-         e=zeros(1,N) ; 
-         e(id) = 1 ;
-         A=[A;
-            Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
-         b=[b;
-            -1] ;
-         gap(id) = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis(id) ;
-       else
-         gap(id) = 0 ;
-       end ;
-     end       
-     fprintf('\n') ;
-      #################################################################################
-
-      const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
       
-      objValue,w,self.slacks = solver.solve()
+   def run(self):
+      """
+      Train the QPalma parameters on the 'training' data set.
+
+      Loads the examples with paths_load_data_pickle, initializes the scoring
+      functions from Configuration.fixedParam and then passes iteration_steps
+      times over all examples. For every example the true alignment is
+      compared with the num_path[exampleIdx] alignments decoded by QPalmaDP;
+      the first decoded alignment that differs from the true one yields a
+      constraint for the solver. The solver is only created and used when
+      __debug__ is False (i.e. python -O). At the end the parameters are
+      written with export_param to 'elegans.param' and the log file is closed.
+      """
+      # Load the whole dataset 
+      #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
+      Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+
+      # number of training instances
+      N = len(Sequences) 
+      self.numExamples = N
+      # every per-example list has to provide one entry per sequence
+      # (the Donors list was not checked before, Acceptors was checked twice)
+      assert N == len(Acceptors) and N == len(Donors) and N == len(Exons)\
+      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+      self.plog('Number of training examples: %d\n'% N)
+
+      #iteration_steps = 200 ; #upper bound on iteration steps
+      iteration_steps = 2 ; #upper bound on iteration steps
+
+      remove_duplicate_scores = False
+      print_matrix = False
+      anzpath = 2
+
+      # Initialize parameter vector
+      # param = numpy.matlib.rand(126,1)
+      param = Configuration.fixedParam 
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      # delete splicesite-score-information
+      if not self.ARGS.train_with_splicesitescoreinformation:
+         for i in range(len(Acceptors)):
+            if Acceptors[i] > -20:
+               Acceptors[i] = 1
+            if Donors[i] >-20:
+               Donors[i] = 1
+
+      # Initialize solver 
+      # (only when running with python -O; otherwise no solver exists and
+      # the solver calls further below are skipped as well)
+      if not __debug__:
+         self.plog('Initializing problem...\n')
+         solver = SIQPSolver(Configuration.numFeatures,Configuration.numExamples,Configuration.C,self.logfh)
+
+      # stores the number of alignments done for each example (best path, second-best path etc.)
+      num_path = [anzpath]*N 
+      # stores the gap for each example
+      gap      = [0.0]*N
+
+      #############################################################################################
+      # Training
+      #############################################################################################
+      self.plog('Starting training...\n')
+
+      donSP       = Configuration.numDonSuppPoints
+      accSP       = Configuration.numAccSuppPoints
+      lengthSP    = Configuration.numLengthSuppPoints
+      mmatrixSP   = Configuration.sizeMMatrix
+      totalQualSP = Configuration.totalQualSuppPoints
+
+      currentPhi = zeros((Configuration.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      #qualityMatrix = zeros((Configuration.numPlifSuppPoints*Configuration.numQualPlifs,1))
+
+      iteration_nr = 0
+
+      while True:
+         if iteration_nr == iteration_steps:
+            break
+
+         for exampleIdx in range(self.numExamples):
+            if (exampleIdx%10) == 0:
+               print 'Current example nr %d' % exampleIdx
+
+            dna = Sequences[exampleIdx] 
+            est = Ests[exampleIdx] 
+
+            # no real quality information yet: one zero per est position
+            quality = [0.0]*len(est)
+
+            exons = Exons[exampleIdx] 
+            # NoiseMatrix = Noises[exampleIdx] 
+            don_supp = Donors[exampleIdx] 
+            acc_supp = Acceptors[exampleIdx] 
+
+            # Compute the parameters of the true alignment (but with untrained d,a,h ...)
+            # trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
+            
+            # Calculate the weights
+            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+
+            trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            # NOTE(review): trueWeight is stacked as (intron, donor, acceptor, ...)
+            # whereas currentPhi is filled as (d, a, h, ...); the decoded paths
+            # below pair (intron, donor, acceptor) with (h, d, a). Confirm the
+            # intended order before relying on trueAlignmentScore.
+            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                       = totalQualityPenalties[:]
+
+            # Calculate w'phi(x,y) the total score of the alignment
+            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+            # The allWeights vector is supposed to store the weight parameter
+            # of the true alignment as well as the weight parameters of the
+            # num_path[exampleIdx] other alignments
+            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
+            allWeights[:,0] = trueWeight[:,0]
+
+            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+            AlignmentScores[0] = trueAlignmentScore
+
+            ################## Calculate wrong alignment(s) ######################
+
+            # Compute donor, acceptor with penalty_lookup_new
+            # returns two double lists
+            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+            #myalign wants the acceptor site on the g of the ag
+            acceptor = acceptor[1:]
+            acceptor.append(-inf)
+
+            dna = str(dna)
+            est = str(est)
+            dna_len = len(dna)
+            est_len = len(est)
+            ps = h.convert2SWIG()
+
+            prb = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+            mm_len = 36
+
+            d_len = len(donor)
+            donor = QPalmaDP.createDoubleArrayFromList(donor)
+            a_len = len(acceptor)
+            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+            currentAlignment = QPalmaDP.Alignment()
+            #qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
+            #currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
+
+            qualityPlifs = initializeQualityScoringFunctions(Configuration.numQualPlifs,Configuration.numQualSuppPoints)
+
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(qualityPlifs)
+
+            #print 'PYTHON: Calling myalign...'
+            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
+             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores, print_matrix)
+            #print 'PYTHON: After myalign call...'
+
+            # buffers that getAlignmentResults fills, one slice per decoded path
+            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+            c_AlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+            emptyPlif = Plf(30)
+            emptyPlif = emptyPlif.convert2SWIG()
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
+
+            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+            c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
+
+            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+            newEstAlign = zeros((est_len*num_path[exampleIdx],1))
+            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newQualityPlifs = [None]*num_path[exampleIdx]*Configuration.numQualPlifs
+
+            # copy the results element by element out of the SWIG arrays
+            #print 'newSpliceAlign'
+            for i in range(dna_len*num_path[exampleIdx]):
+               newSpliceAlign[i] = c_SpliceAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'newEstAlign'
+            for i in range(est_len*num_path[exampleIdx]):
+               newEstAlign[i] = c_EstAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'weightMatch'
+            for i in range(mm_len*num_path[exampleIdx]):
+               newWeightMatch[i] = c_WeightMatch[i]
+            #   print '%f' % (weightMatch[i])
+
+            #print 'AlignmentScores'
+            for i in range(num_path[exampleIdx]):
+               AlignmentScores[i+1] = c_AlignmentScores[i]
+
+            #print 'newQualityPlifs'
+            for i in range(num_path[exampleIdx]*Configuration.numQualPlifs):
+               newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
+
+            #print "Calling destructors"
+
+            del c_SpliceAlign
+            del c_EstAlign
+            del c_WeightMatch
+            del c_AlignmentScores
+            del c_qualityPlifs
+            del currentAlignment
+
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+            # Calculate weights of the respective alignments Note that we are
+            # calculating n-best alignments without hamming loss, so we
+            # have to keep track which of the n-best alignments correspond to
+            # the true one in order not to incorporate a true alignment in the
+            # constraints. To keep track of the true and false alignments we
+            # define an array true_map with a boolean indicating the
+            # equivalence to the true alignment for each decoded alignment.
+            true_map = [0]*(num_path[exampleIdx]+1)
+            true_map[0] = 1
+            path_loss = [0]*(num_path[exampleIdx]+1)
+
+            for pathNr in range(num_path[exampleIdx]):
+
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+
+               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+               qidx = 0
+
+               # flatten the penalties of all quality Plifs belonging to this path
+               for currentPlif in newQualityPlifs[Configuration.numQualPlifs*pathNr:Configuration.numQualPlifs*(pathNr+1)]:
+                  for tidx in range(currentPlif.len):
+                     #elem = currentPlif.penalties[tidx]
+                     elem = QPalmaDP.doubleFArray_getitem(currentPlif.penalties, tidx)
+                     #print '%f ' % elem, 
+                     print qidx
+                     decodedQualityFeatures[qidx] = elem
+                     qidx += 1
+                  #print
+
+               # sum up positionwise loss between alignments
+               for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
+                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+                     path_loss[pathNr+1] += 1
+
+               # Store the weights of this path in its column of allWeights
+               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
+               allWeights[:,pathNr+1] = wp
+
+               hpen = mat(h.penalties).reshape(len(h.penalties),1)
+               dpen = mat(d.penalties).reshape(len(d.penalties),1)
+               apen = mat(a.penalties).reshape(len(a.penalties),1)
+
+               features = numpy.vstack([hpen , dpen , apen , mmatrix[:], zeros((Configuration.totalQualSuppPoints,1))])
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+               # Check whether scalar product + loss equals viterbi score
+               #assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
+               #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
+               #(newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
+
+               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
+               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+                  true_map[pathNr+1] = 1
+
+            # The following steps need the complete true_map and allWeights,
+            # so they run once per example after all paths were evaluated.
+            # Running them inside the path loop could pick a column of
+            # allWeights that had not been filled yet.
+
+            # the true label sequence should not have a larger score than the maximal one WHYYYYY?
+
+            # this means that all n-best paths are too close to each other 
+            # we have to extend the n-best search to a (n+1)-best
+            if len([elem for elem in true_map if elem == 1]) == len(true_map):
+               num_path[exampleIdx] = num_path[exampleIdx]+1
+
+            # Choose true and first false alignment for extending A
+            firstFalseIdx = -1
+            for map_idx,elem in enumerate(true_map):
+               if elem == 0:
+                  firstFalseIdx = map_idx
+                  break
+
+            # if there is at least one useful false alignment add the
+            # corresponding constraints to the optimization problem
+            if firstFalseIdx != -1:
+               trueWeights        = allWeights[:,0]
+               firstFalseWeights  = allWeights[:,firstFalseIdx]
+
+               # LMM.py code:
+               deltas  = firstFalseWeights - trueWeights
+               if not __debug__:
+                  const_added = solver.addConstraint(deltas, exampleIdx)
+                  objValue,w,self.slacks = solver.solve()
+
+                  sum_xis = 0
+                  for elem in self.slacks:
+                     sum_xis +=  elem
+
+                  # take over the solution and rebuild the scoring functions from it
+                  for i in range(len(param)):
+                     param[i] = w[i]
+
+                  [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+            #
+            # end of one example processing 
+            #
+            #if exampleIdx == 100:
+            #   break
+         
+         #break
 
-      sum_xis = 0
-      for elem in self.slacks:
-         sum_xis +=  elem
-     
-   """
-   print 'Training completed'
+         #
+         # end of one iteration through all examples
+         #
+         iteration_nr += 1
+
+      #
+      # end of optimization 
+      #  
+      export_param('elegans.param',h,d,a,mmatrix)
+      self.logfh.close()
+      print 'Training completed'
+
+def fetch_genome_info(ginfo_filename, gen_file=None):
+   """
+   Return the genome_info structure, using a pickle file as cache.
+
+   If ginfo_filename exists, its unpickled content is returned. Otherwise
+   matlab is started to run init_genome on gen_file, the resulting
+   genome_info.mat is loaded, the contained genome_info is pickled to
+   ginfo_filename and returned.
+
+   ginfo_filename -- path of the pickle cache file
+   gen_file       -- path of the genome configuration passed to init_genome;
+                     defaults to '<basedir>/genome.config' with basedir taken
+                     from Param()
+
+   Raises an AssertionError if matlab wrote anything to stderr.
+   """
+   try:
+      import cPickle
+   except ImportError:
+      import pickle as cPickle
+
+   if os.path.exists(ginfo_filename):
+      # NOTE(review): unpickling can execute arbitrary code; only use cache
+      # files that were written by this function
+      cache_fh = open(ginfo_filename,'rb')
+      try:
+         return cPickle.load(cache_fh)
+      finally:
+         cache_fh.close()
+
+   if gen_file is None:
+      # same path the QPalma constructor builds from its parameters
+      gen_file = '%s/genome.config' % Param().basedir
+
+   cmd = ['']*4
+   cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+   cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+   cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+   cmd[3] = 'save genome_info.mat genome_info'  
+   full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+   obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+   out,err = obj.communicate()
+   assert err == '', 'An error occured!\n%s'%err
+
+   ginfo = scipy.io.loadmat('genome_info.mat')
+   genome_info = ginfo['genome_info']
+
+   # cache the result so that matlab is not needed on the next start
+   cache_fh = open(ginfo_filename,'wb')
+   try:
+      cPickle.dump(genome_info,cache_fh)
+   finally:
+      cache_fh.close()
+
+   return genome_info
 
 if __name__ == '__main__':
-   run()
+   # create the trainer (opens the log file and fetches the genome info) and start the training
+   qpalma = QPalma()
+   qpalma.run()