+ found bug in SWIG interface dna_len vs. dna_len-1 because of '\0' in matlab
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 7 Dec 2007 13:08:45 +0000 (13:08 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 7 Dec 2007 13:08:45 +0000 (13:08 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@6941 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/qpalma_dp.cpp
python/.computeSpliceAlign.py.swp
python/.qpalma.py.swp
python/qpalma.log
python/qpalma.py

index cb6cb8c..0474ded 100644 (file)
@@ -115,16 +115,16 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
 
   printf("before init of splice_align and rest...\n");
 
-  splice_align = new int[(dna_len-1)*nr_paths];
-  est_align = new int[(est_len-1)*nr_paths];
+  splice_align = new int[(dna_len)*nr_paths];
+  est_align = new int[(est_len)*nr_paths];
   mmatrix_param = new int[(mlen*mlen)*nr_paths];
   alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
 
   printf("Lengths are %d %d %d...\n",dna_len,est_len,mlen);
   printf("before memset...\n");
 
-  memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
-  memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
+  memset((char*)splice_align, -1, (dna_len)*nr_paths*sizeof(int)); // fills splice_align with zeros
+  memset((char*)est_align, -1, (est_len)*nr_paths*sizeof(int)); // fills est_align with zeros
   memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
   memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
   
@@ -133,8 +133,8 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   for (int z=0; z<nr_paths; z++) {      
     result_length = 0 ;
     
-    int* s_align = splice_align + (dna_len-1)*z;  //pointer
-    int* e_align = est_align + (est_len-1)*z ;    //pointer
+    int* s_align = splice_align + (dna_len)*z;  //pointer
+    int* e_align = est_align + (est_len)*z ;    //pointer
     int* mparam  = mmatrix_param + (mlen*mlen)*z; //pointer
     
     bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, s_align, e_align, mparam, alignmentscores, max_score_positions);
index 5191fdd..b0f7485 100644 (file)
Binary files a/python/.computeSpliceAlign.py.swp and b/python/.computeSpliceAlign.py.swp differ
index d0afb30..8deaa99 100644 (file)
Binary files a/python/.qpalma.py.swp and b/python/.qpalma.py.swp differ
index 725f6dc..9ee9b98 100644 (file)
@@ -1,4 +1,4 @@
 genome_info.basedir is /fml/ag-raetsch/share/projects/palma/elegans_uncut/
 Initializing problem...
 Number of training examples: 4604
-Starting training...Starting training...
+Starting training...
index d0af41a..b6333ad 100644 (file)
@@ -1,6 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+###########################################################
+#
+# This file containts the 
+#
+###########################################################
+
 import sys
 import subprocess
 import scipy.io
@@ -20,13 +26,12 @@ from penalty_lookup_new import *
 from compute_donacc import *
 from TrainingParam import Param
 
-"""
-A training method for the QPalma project
-"""
-
-
+import Configuration
 
 class QPalma:
+   """
+   A training method for the QPalma project
+   """
    
    def __init__(self):
       self.ARGS = Param()
@@ -88,6 +93,8 @@ class QPalma:
 
       
    def run(self):
+
+      # Load the whole dataset 
       Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
 
       # number of training instances
@@ -98,45 +105,19 @@ class QPalma:
 
       self.plog('Number of training examples: %d\n'% N)
 
-      random_N = 100 ; # number of constraints to generate per iteration
       iteration_steps = 200 ; #upper bound on iteration steps
 
-      remove_duplicate_scores = 0
+      remove_duplicate_scores = False
+      print_matrix = False
       anzpath = 2
-      print_matrix = 0
 
+      # Initialize parameter vector
       # param = numpy.matlib.rand(126,1)
-      param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
-       [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
-       [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
-       [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
-       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-       [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
-       [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
+      param = Configuration.fixedParam 
 
+      # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
-      # checked values till this position
-
       # delete splicesite-score-information
       if not self.ARGS.train_with_splicesitescoreinformation:
          for i in range(len(Acceptors)):
@@ -145,19 +126,19 @@ class QPalma:
             if Donors[i] >-20:
                Donors[i] = 1
 
+      # Initialize solver 
       #problem = SIQPSolver(numFeatures,numExamples,C,logfile)
+
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*N 
       # stores the gap for each example
       gap      = [0.0]*N
 
       qualityMatrix = zeros((self.numQualSuppPoints,1))
+
       #############################################################################################
       # Training
       #############################################################################################
-      self.plog('Starting training...')
-
-
       self.plog('Starting training...\n')
 
       iteration_nr = 1
@@ -196,10 +177,12 @@ class QPalma:
             currentPhi[126:]   = qualityMatrix[:]
 
             # Calculate w'phi(x,y) the total score of the alignment
-            alignmentScore = currentW.T * currentPhi
+            trueAlignmentScore = currentW.T * currentPhi
+
 
             ################## Calculate wrong alignment(s) ######################
 
+
             # Compute donor, acceptor with penalty_lookup_new
             # returns two double lists
             donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
@@ -208,12 +191,10 @@ class QPalma:
             acceptor = acceptor[1:]
             acceptor.append(-inf)
 
-            ####################### checked above values ##########################################
             dna = str(dna)
             est = str(est)
             dna_len = len(dna)
             est_len = len(est)
-
             ps = h.convert2SWIG()
 
             matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
@@ -224,20 +205,16 @@ class QPalma:
             a_len = len(acceptor)
             acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
-            remove_duplicate_scores = False
-            print_matrix = False
-
             currentAlignment = QPalmaDP.Alignment()
             qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
             currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
 
             print 'PYTHON: Calling myalign...'
-            # returns [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest]
+            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
             currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
             est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
             acceptor, a_len, remove_duplicate_scores, print_matrix)
-
-            print 'after myalign call...'
+            print 'PYTHON: After myalign call...'
 
             newSpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
             newEstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
@@ -246,32 +223,61 @@ class QPalma:
 
             currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
 
-            #Berechne Gewichte der durch Alignment berechneten Pfade
+            spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+            weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+
+            for i in range(dna_len*num_path[exampleIdx]):
+               spliceAlign[i] = newSpliceAlign[i]
+
+            for i in range(mm_len*num_path[exampleIdx]):
+               weightMatch[i] = newWeightMatch[i]
+               
+            spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
+            weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
+
+            # Calculate weights of the respective alignments Note that we are
+            # calculating n-best alignments without any hamming loss, so we
+            # have to keep track which of the n-best alignments correspond to
+            # the true one in order not to incorporate a true alignment in the
+            # constraints. To keep track of the true and false alignments we
+            # define an array true_map with a boolean indicating the
+            # equivalence to the true alignment for each decoded alignment.
             true_map = zeros((1,1+num_path[exampleIdx]))
             true_map[0] = 1
             path_loss = [0]*num_path[exampleIdx]
 
-            # for pathNr in range(num_path(exampleIdx)):
-            #    #dna_numbers = dnaest{1,pathNr}
-            #    #est_numbers = dnaest{2,pathNr}
-            #   
-            #    weightDon, weightAcc, weightIntron = compute_SpliceWeights(d, a, h, SpliceAlign[pathNr,:], don_supp, acc_supp)
+            AlignmentScores = [0.0]*num_path[exampleIdx]
+            Weights = zeros((num_path[exampleIdx],self.numFeatures))
+
+            for pathNr in range(num_path[exampleIdx]):
+               #dna_numbers = dnaest{1,pathNr}
+               #est_numbers = dnaest{2,pathNr}
 
-            #    # sum up positionwise loss between alignments
-            #    for alignPosIdx in range(len(SpliceAlign[pathNr,:])):
-            #       if SpliceAlign[pathNr,alignPosIdx] != true_spliceAlign[alignPosIdx]:
-            #          path_loss[pathNr] += 1
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
 
-            #    # Gewichte in restliche Zeilen der Matrix speichern
-            #    Weights[pathNr+1,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
+               # sum up positionwise loss between alignments
+               for alignPosIdx in range(len(spliceAlign[pathNr,:])):
+                  if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+                     path_loss[pathNr] += 1
+
+               # Gewichte in restliche Zeilen der Matrix speichern
+               # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
+               wp = self.createParameterVector(weightIntron, weightDon, weightAcc, weightMatch[pathNr,:], qualityMatrix)
+               Weights[pathNr,:] = wp.T
+
+               hpen = mat(h.penalties).reshape(len(h.penalties),1)
+               dpen = mat(d.penalties).reshape(len(d.penalties),1)
+               apen = mat(a.penalties).reshape(len(a.penalties),1)
+   
+               features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
 
-            #    AlignmentScores(pathNr+1) = Weights(pathNr+1,:) * [h.penalties ; d.penalties ; a.penalties ; mmatrix(:)]
+               AlignmentScores[pathNr] = (Weights[pathNr,:] * features)[0,0]
 
-               # Check wether scalar product + loss equals viterbi score
-            #    assert(abs(Gesamtscores(pathNr) - AlignmentScores(pathNr+1)) < 1e-6)
+               # Check wether scalar product + loss equals viterbi score
+               # assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr]) < 1e-6, 'Scalar prod + loss is not equal Viterbi score'
 
-            #    if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
-            #      true_map(pathNr+1)=1
+            #   if norm(Weights(1,:)-Weights(pathNr+1,:))<1e-5,
+            #     true_map(pathNr+1)=1
             #     
             #    # the true label sequence should not have a larger score than the
             #    # maximal one WHYYYYY?
@@ -281,11 +287,6 @@ class QPalma:
             #       keyboard
             #    end ;
 
-            #    # %%set_param_palma should have done this right
-            #    for z = 1:num_path[exampleIdx]
-            #       assert(abs(Weights(z,:) * param(1:126) -AlignmentScores(z)) <= 1e-6)
-            #    end
-
             #    if all(true_map==1)
             #       num_path[exampleIdx]=num_path[exampleIdx]+1 ; %next iteration step: one alignment more
             #    end ;
@@ -304,6 +305,16 @@ class QPalma:
             #       gap[exampleIdx] = 0 ;
             #    end ;
 
+            # LMM.py code:
+            # deltas  = Weights(2,:)-Weights(1,:) zeros(1,126) -e
+            # const_added = solver.addConstraint(deltas, idx)
+            # 
+            # objValue,w,self.slacks = solver.solve()
+
+            # sum_xis = 0
+            # for elem in self.slacks:
+            #    sum_xis +=  elem
+
             if exampleIdx== 10:
                break
 
@@ -311,17 +322,6 @@ class QPalma:
          break
 
       self.logfh.close()
-
-      """
-         const_added = solver.addConstraint(deltas, loss, pos, marginRescaling)
-         
-         objValue,w,self.slacks = solver.solve()
-
-         sum_xis = 0
-         for elem in self.slacks:
-            sum_xis +=  elem
-        
-      """
       print 'Training completed'
 
 if __name__ == '__main__':