+ fixed some index bugs
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 10 Dec 2007 12:14:34 +0000 (12:14 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 10 Dec 2007 12:14:34 +0000 (12:14 +0000)
+ added debugging output

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@6961 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/qpalma_dp.cpp
python/computeSpliceWeights.py
python/qpalma.py
tools/calculateAlignmentQuality.m

index 0474ded..4c33425 100644 (file)
@@ -6,11 +6,6 @@ using namespace std;
   myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ...
   print_matrix) */
 
-/* nlhs - Number of expected mxArrays (Left Hand Side)
-   plhs - Array of pointers to expected outputs
-   nrhs - Number of inputs (Right Hand Side)
-   prhs - Array of pointers to input data. */
-
 Alignment::Alignment() {
       len = 0;
       limits = 0;
@@ -45,36 +40,10 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
    printf("Entering myalign...\n");
 
    mlen = 6; // score matrix: length of 6 for "- A C G T N"
-   dna_len =  dna_len_p;
-   est_len =  est_len_p;
+   dna_len =  dna_len_p + 1 ;
+   est_len =  est_len_p + 1 ;
    nr_paths = nr_paths_p;
 
-#if 0
-  /***************************************************************************/
-  // 3. h (cell array)
-  /***************************************************************************/
-  mwSize anz_func = 1;
-  
-  functions = read_penalty_struct_from_cell(prhs[arg], anz_func);
-  //std::cout << " bla " << functions->max_len << " \n";
-  //std::cout << "lookup_penalty(5) = " << lookup_penalty(functions, 5, NULL, 0)<<std::endl ;
-  arg++; // arg -> 4
-  
-
-  /***************************************************************************/
-  // 4. match matrix (columnwise)
-  /***************************************************************************/
-  int nr_entries = mxGetN(prhs[arg]) * mxGetM(prhs[arg]);
-  //mexPrintf("\nNumber of entries in matchmatrix: %d\n", nr_entries) ;
-  double *matchmatrix;
-  matchmatrix = mxGetPr(prhs[arg]);
-  //mexPrintf("matchmatrix\n") ;
-  //for (int i=0; i< nr_entries; i++) {
-  //  mexPrintf("entry %d: <%f>\n", i, matchmatrix[i]);
-  //}
-  arg++; // arg -> 5
-#endif
-
   /***************************************************************************/ 
   // initialize alignment matrices and call fill_matrix()  
   /***************************************************************************/
@@ -115,16 +84,15 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
 
   printf("before init of splice_align and rest...\n");
 
-  splice_align = new int[(dna_len)*nr_paths];
-  est_align = new int[(est_len)*nr_paths];
+  splice_align = new int[(dna_len-1)*nr_paths];
+  est_align = new int[(est_len-1)*nr_paths];
   mmatrix_param = new int[(mlen*mlen)*nr_paths];
   alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
 
-  printf("Lengths are %d %d %d...\n",dna_len,est_len,mlen);
   printf("before memset...\n");
 
-  memset((char*)splice_align, -1, (dna_len)*nr_paths*sizeof(int)); // fills splice_align with zeros
-  memset((char*)est_align, -1, (est_len)*nr_paths*sizeof(int)); // fills est_align with zeros
+  memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
+  memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
   memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
   memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
   
@@ -133,8 +101,8 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   for (int z=0; z<nr_paths; z++) {      
     result_length = 0 ;
     
-    int* s_align = splice_align + (dna_len)*z;  //pointer
-    int* e_align = est_align + (est_len)*z ;    //pointer
+    int* s_align = splice_align + (dna_len-1)*z;  //pointer
+    int* e_align = est_align + (est_len-1)*z ;    //pointer
     int* mparam  = mmatrix_param + (mlen*mlen)*z; //pointer
     
     bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, s_align, e_align, mparam, alignmentscores, max_score_positions);
index fcddbb4..8ed8183 100644 (file)
@@ -3,11 +3,10 @@
 
 from numpy import inf
 from numpy.matlib import zeros
+import pdb
 
-# 3 * 30 supporting points: x_1 ... x_30
-# => 3 * 30 parameters (1 .. 90): y_1 ... y_30
+# 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30
 # piecewise linear function:
-# 
 # take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
 # these are our values x
 # 
@@ -27,9 +26,11 @@ def calculateWeights(plf, scores):
    for k in range(len(scores)):
       value = scores[k]
       Lower = len([elem for elem in plf.limits if elem <= value])
+      # because we count from 0 in python
+      Lower -= 1
       Upper = Lower+1 ; # x-werte bleiben fest
 
-      #print Lower,Upper,len(plf.limits)
+      print value,Lower,Upper
 
       if Lower == 0:
          currentWeight[0] += 1
@@ -41,8 +42,8 @@ def calculateWeights(plf, scores):
          currentWeight[Upper] = currentWeight[Upper] + weightup
          currentWeight[Lower] = currentWeight[Lower] + weightlow
 
-         #print value,plf.limits[Lower],plf.limits[Upper]
-         #print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
+         print plf.limits[Lower],plf.limits[Upper]
+         print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
 
    return currentWeight
 
index d974747..33ac32d 100644 (file)
@@ -71,33 +71,16 @@ class QPalma:
       self.plog('Initializing problem...\n')
 
 
-   def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix):
-      currentW = zeros((self.numFeatures,1))
-
-      donS = self.numDonSuppPoints
-      accS = self.numAccSuppPoints
-      lenS = self.numLengthSuppPoints
-      matS = self.sizeMMatrix
-      qualS = self.numQualSuppPoints
-
-      currentW[0:donS,0] = trueWeightDon[:,0]
-      currentW[donS:donS+accS,0] = trueWeightAcc[:,0]
-      currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0]
-      currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0]
-      currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0]
-
-      return currentW
-
-
    def plog(self,string):
       self.logfh.write(string)
 
       
    def run(self):
-
       # Load the whole dataset 
       Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
 
+      #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
+
       # number of training instances
       N = len(Sequences) 
       self.numExamples = N
@@ -128,7 +111,8 @@ class QPalma:
                Donors[i] = 1
 
       # Initialize solver 
-      solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
+      if not __debug__:
+         solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*N 
@@ -164,12 +148,8 @@ class QPalma:
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-
-            #trueWeight = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
 
-            #currentPhi = createFeatureVector()
-
             currentPhi = zeros((self.numFeatures,1))
             currentPhi[0:30]     = mat(d.penalties[:]).reshape(30,1)
             currentPhi[30:60]    = mat(a.penalties[:]).reshape(30,1)
@@ -229,28 +209,30 @@ class QPalma:
             newWeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
             newAlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
 
+            pdb.set_trace()
+
             currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
 
             spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
             weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
 
+            print 'spliceAlign'
             for i in range(dna_len*num_path[exampleIdx]):
                spliceAlign[i] = newSpliceAlign[i]
+               print '%f' % (spliceAlign[i])
 
+            print 'weightMatch'
             for i in range(mm_len*num_path[exampleIdx]):
                weightMatch[i] = newWeightMatch[i]
+               print '%f' % (weightMatch[i])
 
             for i in range(num_path[exampleIdx]):
                AlignmentScores[i+1] = newAlignmentScores[i]
+
+            print AlignmentScores
                
             spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
             weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
-             
-            #pdb.set_trace()
-            #print spliceAlign[:,:]
-            #print weightMatch[:,:]
-            #print AlignmentScores
-
             # Calculate weights of the respective alignments Note that we are
             # calculating n-best alignments without any hamming loss, so we
             # have to keep track which of the n-best alignments correspond to
@@ -274,20 +256,14 @@ class QPalma:
                      path_loss[pathNr+1] += 1
 
                # Gewichte in restliche Zeilen der Matrix speichern
-               # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
                wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
                allWeights[:,pathNr+1] = wp
 
-            #print allWeights[:,0]
-            #print allWeights[:,1]
-            #print allWeights[:,2]
-
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
 
                features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
-
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                # Check wether scalar product + loss equals viterbi score
@@ -299,11 +275,10 @@ class QPalma:
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
                   true_map[pathNr+1] = 1
 
-               #  # the true label sequence should not have a larger score than the
-               #  # maximal one WHYYYYY?
+               # the true label sequence should not have a larger score than the maximal one WHYYYYY?
 
-               #  # this means that all n-best paths are to close to each other 
-               #  # we have to extend the n-best search to a (n+1)-best
+               # this means that all n-best paths are to close to each other 
+               # we have to extend the n-best search to a (n+1)-best
                if len([elem for elem in true_map if elem == 1]) == len(true_map):
                   num_path[exampleIdx] = num_path[exampleIdx]+1
 
@@ -313,32 +288,22 @@ class QPalma:
                   if elem == 0:
                      firstFalseIdx = map_idx
                      break
-                 
+
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
                   trueWeights        = allWeights[:,0]
                   firstFalseWeights  = allWeights[:,firstFalseIdx]
 
-            #    if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps,
-            #       e=zeros(1,N) ; 
-            #       e[exampleIdx] = 1 ;
-            #       A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
-            #       b=[b; -1] ;
-            #       gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ;
-            #    else
-            #       gap[exampleIdx] = 0 ;
-            #    end ;
-
-               # LMM.py code:
-               deltas  = firstFalseWeights - trueWeights
-               const_added = solver.addConstraint(deltas, exampleIdx)
-
-               objValue,w,self.slacks = solver.solve()
-
-               sum_xis = 0
-               for elem in self.slacks:
-                  sum_xis +=  elem
+                  # LMM.py code:
+                  deltas  = firstFalseWeights - trueWeights
+                  if not __debug__:
+                     const_added = solver.addConstraint(deltas, exampleIdx)
+                     objValue,w,self.slacks = solver.solve()
+
+                     sum_xis = 0
+                     for elem in self.slacks:
+                        sum_xis +=  elem
 
             if exampleIdx==0:
                break
index 84bc4d4..0338a4b 100644 (file)
@@ -19,7 +19,7 @@ for i = 1:length(ground_truth)
    for j = 1:length(testrun)
       currentPred = testrun(j);
 
-      if strcmp(currentEntry.chr,currentPred.chr) && currentEntry.is_alt_spliced
+      if ~strcmp(currentEntry.chr,currentPred.chr) || currentEntry.is_alt_spliced
          continue;
       end
 
@@ -41,13 +41,11 @@ for i = 1:length(ground_truth)
          %disp(sprintf('estIdx %i ',estIdx));
          numberOfExons = size(currentExons{estIdx},1);
 
-         for exonIdx = 1:numberOfExons
-            %disp(sprintf('exonIdx %i ',exonIdx));
-            exonStart = currentExons{estIdx}(exonIdx,1);
-            exonStop  = currentExons{estIdx}(exonIdx,2);
+         for exonIdx = 1:(numberOfExons-1)
+            intronStart  = currentExons{estIdx}(exonIdx,2);
+            intronStop   = currentExons{estIdx}(exonIdx+1,1);
 
             for predEstIdx = 1:numberOfPredEsts
-            %disp(sprintf('predEstIdx %i ',predEstIdx));
             numberOfPredExons = size(currentPredExons{predEstIdx},1);
             currentESTName = currentPred.transcripts{predEstIdx};
 
@@ -58,17 +56,19 @@ for i = 1:length(ground_truth)
 
                   %disp('\n');
                   %%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx));
-
-                  if exonStart >= predExonStart && exonStart <= predExonStop
-                     %%disp('Overlapping');
-                     fprintf(fh,sprintf('%s before %d\n',currentESTName,exonStart-predExonStart))
+               
+                  % est is covering full intron
+                  if intronStart >= predExonStart && intronStop <= predExonStop
+                     fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop))
+                  % est is nested inside intron
+                  if intronStart < predExonStart && intronStop > predExonStop
+                     fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop))
                   % end of exonth is nested inside predExoniction
-                  elseif exonStop >= predExonStart && exonStop <= predExonStop
-                     %%disp('Overlapping');
-                     fprintf(fh,sprintf('%s after %d\n',currentESTName,predExonStop-exonStop))
+                  elseif intronStart >= predExonStart && predExonStop >= intronStart && intronStop >= predExonStop
+                     fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop))
                   % predExoniction is nested inside exonth
-                  elseif exonStart <= predExonStart && predExonStop <= exonStop
-                     %disp('Overlapping');
+                  elseif intronStart <= predExonStart && predExonStart <= intronStop && intronStop <= predExonStop
+                     fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop))
                      fprintf('%s %d %d',fh)
                   else
                      d=1;