+ fixed some index bugs in the c code
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jan 2008 16:30:14 +0000 (16:30 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 10 Jan 2008 16:30:14 +0000 (16:30 +0000)
+ fixed feature count calculation for the label features

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7284 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/fill_matrix.cpp
QPalmaDP/qpalma_dp.cpp
QPalmaDP/result_align.cpp
python/Configuration.py
python/Plif.py
python/computeSpliceAlignWithQuality.py
python/paths_load_data_solexa.py
python/qpalma.py
python/set_param_palma.py

index ff55671..f552121 100644 (file)
@@ -73,6 +73,7 @@ double getScore(struct penalty_struct* qualityScores, double* matchmatrix, int m
    double score = 0.0;
 
    if( scoringScheme == NORMAL ) {
+      //printf("Starting scoring / scheme is NORMAL\n");
       if( estChar == -1 )
          score = matchmatrix[mlen*dnaChar];
 
@@ -80,16 +81,31 @@ double getScore(struct penalty_struct* qualityScores, double* matchmatrix, int m
          score = matchmatrix[estChar];
 
       if( estChar != -1 && dnaChar != -1 )
-         score = matchmatrix[mlen* dnaChar +estChar];
+         score = matchmatrix[mlen*dnaChar+estChar];
    } else {
-         int estIdx = check_char(estChar) - 1;
-         int dnaIdx = check_char(dnaChar) - 1;
-         // assert 0 <= estIdx <= 3
-         int rowSize = 6;
-         struct penalty_struct currentPen = qualityScores[estIdx*rowSize+dnaIdx];
-         score = 0;//scoreMatch(dnaChar,estChar,baseScore);
-   }
+      //printf("Starting scoring / scheme is USE_QUALITY_SCORES\n");
+      int rowSize = 5;
+      //printf("estChar,rowSize,dnaChar are: %d,%d,%d",estChar,rowSize,dnaChar);
 
+      if( estChar == -1 )
+         score = matchmatrix[mlen*dnaChar];
+
+      if( dnaChar == -1 )
+         score = matchmatrix[estChar];
+
+      if( estChar != -1 && dnaChar != -1 ) {
+         struct penalty_struct currentPen = qualityScores[estChar*rowSize+dnaChar];
+
+         int sp_idx;
+         if(baseScore <= currentPen.limits[0]) {
+            score = currentPen.penalties[0];
+         } else {
+            for(sp_idx=0;sp_idx<currentPen.len;sp_idx++) {
+               if(currentPen.limits[sp_idx] < baseScore )
+                  score = currentPen.penalties[sp_idx];
+   }}}}
+
+   //printf("Current scores is %f\n",score);
    return score;
 }
 
index 2f7c42a..f48d7ed 100644 (file)
@@ -77,6 +77,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   // printf("calling fill_matrix...\n");
   //
    mode currentMode = NORMAL;
+   currentMode = USE_QUALITY_SCORES;
 
   fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, &h, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions,currentMode);
   
index 5f2a054..3b95634 100644 (file)
@@ -6,11 +6,23 @@ using namespace std;
 void increaseFeatureCount(penalty_struct* qparam, int dnanum, int estnum, double estprb) {
 
    if(dnanum == 0) {
+      //printf("Zero entry\n");
       return;
    }
 
-   penalty_struct currentStruct = qparam[(estnum-1)*6+(dnanum-1)];
-   //printf("check_char is %d\n",(estnum-1)*6+(dnanum-1));
+   penalty_struct currentStruct = qparam[(estnum-1)*5+(dnanum-1)];
+
+   //   printf("before\n");
+   //   int p_idx;
+   //   for(p_idx=0;p_idx<10;p_idx++) {
+   //      printf("%f ",currentStruct.limits[p_idx]);
+   //   }
+   //   printf("\n");
+   //
+   //   for(p_idx=0;p_idx<10;p_idx++) {
+   //      printf("%f ",currentStruct.penalties[p_idx]);
+   //   }
+   //   printf("\n");
 
    double value = estprb;
    int Lower = 0;
@@ -28,10 +40,21 @@ void increaseFeatureCount(penalty_struct* qparam, int dnanum, int estnum, double
 
    Lower -= 1;
    int Upper = Lower+1; // x-werte bleiben fest
-   double weightup  = 1.0*(value - currentStruct.limits[Lower]) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
-   double weightlow = 1.0*(currentStruct.limits[Upper] - value) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
-   currentStruct.penalties[Upper] = currentStruct.penalties[Upper] + weightup;
-   currentStruct.penalties[Lower] = currentStruct.penalties[Lower] + weightlow;
+   currentStruct.penalties[Upper] += 1; 
+   currentStruct.penalties[Lower] += 1; 
+
+   //printf("estprb/Lower/Upper %f %d %d\n",estprb,Lower,Upper);
+
+   qparam[(estnum-1)*5+(dnanum-1)] = currentStruct;
+
+   //printf("after\n");
+   //for(p_idx=0;p_idx<10;p_idx++) {
+   //   printf("%f ",currentStruct.limits[p_idx]);
+   //}
+   //printf("\n");
+   //for(p_idx=0;p_idx<10;p_idx++)
+   //   printf("%f ",currentStruct.penalties[p_idx]);
+   //printf("\n");
 }
 
 bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions, penalty_struct* qparam)
index 9303dcc..6457e37 100644 (file)
@@ -3,7 +3,8 @@
 
 import numpy.matlib
 
-fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
+fixedParam = numpy.matlib.mat(
+[[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
        [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
        [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
        [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
index 5acf385..a52594f 100644 (file)
@@ -19,7 +19,7 @@ class Plf:
          self.min_len = 0
       else:
          self.len = num
-         self.limits = [0]*num
+         self.limits = linspace(0,40,num) 
          self.penalties = [0]*num
          self.transform = ''
          self.name = ''
index d69b2eb..f5c7652 100644 (file)
@@ -6,7 +6,7 @@ import pdb
 from Plif import Plf,base_coord,linspace
 import Configuration
 
-def  computeSpliceAlignWithQuality(dna, exons, quality):
+def  computeSpliceAlignWithQuality(dna, exons, read, quality):
    """
    Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
    Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
@@ -68,6 +68,23 @@ def  computeSpliceAlignWithQuality(dna, exons, quality):
       currentPlif.limits = mat(linspace(0,40,10)).reshape(10,1)
       currentPlif.penalties = zeros((10,1))
 
+   for pos, elem in enumerate(read):
+      currentPlif = trueQualityPlifs[base_coord(elem,elem)]
+
+      value = quality[pos]
+      Lower = len([el for el in currentPlif.limits if el <= value])
+
+      if Lower == 0:
+         currentPlif.penalties[0] += 1
+      elif Lower == len(currentPlif.limits):
+         currentPlif.penalties[-1] += 1
+      else:
+         # because we count from 0 in python
+         Lower -= 1
+         Upper = Lower+1 ; # x-werte bleiben fest
+         currentPlif.penalties[Upper] += 1
+         currentPlif.penalties[Lower] += 1
+
    for pos, elem in enumerate(est):
       if elem == 'a':
          numChar[0] += 1
@@ -80,25 +97,12 @@ def  computeSpliceAlignWithQuality(dna, exons, quality):
       if elem == 'n':
          numChar[4] += 1
 
-      currentPlif = trueQualityPlifs[base_coord(elem,elem)]
-      currentQualityScore = quality[pos]
-
-      if currentQualityScore < currentPlif.limits[0]:
-            currentPlif.penalties[0] += 1
-
-      elif currentQualityScore >= currentPlif.limits[-1]:
-            currentPlif.penalties[-1] += 1
-
-      else:
-         for idx in range(1,currentPlif.len):
-            if currentPlif.limits[idx-1] <= currentQualityScore < currentPlif.limits[idx]:
-               currentPlif.penalties[idx] += 1
 
    totalNumChar = 0
    for idx in range(sizeMatchmatrix):
       totalNumChar += numChar[idx]
 
-   assert totalNumChar == len(est)
+   assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
 
    # writing in weight match matrix
    # matrix is saved columnwise
@@ -107,4 +111,3 @@ def  computeSpliceAlignWithQuality(dna, exons, quality):
       trueWeightMatch[(sizeMatchmatrix+1)*idx] = numChar[idx-1]
 
    return SpliceAlign, trueWeightMatch, trueQualityPlifs
-
index 6614fb2..797d70d 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from numpy.matlib import mat,zeros,ones,inf
 import cPickle
 import pdb
 
@@ -35,22 +36,31 @@ def paths_load_data_solexa(expt,genome_info,PAR):
       cal = [ord(elem)-64 for elem in cal]
       chastity = [ord(elem)+10 for elem in chastity]
 
+      assert len(prb) == len(seq)
+
       currentGene = allGenes[gene_id]
 
-      print "gene_id is %s " % gene_id
+      if len(currentGene.exons) != 3:
+         continue
 
       try:
-         Sequences.append(tair7_seq[gene_id+'.1']['sequence'])
-         print 'found corresponding sequence'
+         currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
+         Sequences.append(currentSeq)
       except:
          continue
          
       currentSize = len(Sequences[-1])
       #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
+      #Acceptors.append([-inf]*currentSize)
+      #Donors.append([-inf]*currentSize)
+      Acceptors.append([0]*currentSize)
+      Donors.append([0]*currentSize)
 
-      Acceptors.append([0.0]*currentSize)
-      Donors.append([0.0]*currentSize)
-      Exons.append(currentGene.exons)
+      currentExons = zeros((len(currentGene.exons),2))
+      for idx in range(len(currentGene.exons)):
+         currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
+         currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
+      Exons.append(currentExons)
       Ests.append(seq)
       Qualities.append(prb)
    
index 7d687d8..283ad9e 100644 (file)
@@ -59,6 +59,8 @@ class QPalma:
 
       self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
 
+      #self.ARGS.train_with_splicesitescoreinformation = False
+
    def plog(self,string):
       self.logfh.write(string)
 
@@ -77,6 +79,7 @@ class QPalma:
       assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
       and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
       self.plog('Number of training examples: %d\n'% N)
+      print 'Number of features: %d\n'% Configuration.numFeatures
 
       iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
       remove_duplicate_scores = Configuration.remove_duplicate_scores 
@@ -84,7 +87,8 @@ class QPalma:
       anzpath                 = Configuration.anzpath 
 
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      param = Configuration.fixedParam 
+      #param = Configuration.fixedParam 
+      param = cPickle.load(open('initial_param.pickle'))
 
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
@@ -132,14 +136,14 @@ class QPalma:
 
             dna = Sequences[exampleIdx] 
             est = Ests[exampleIdx] 
-            quality = [0.0]*len(est)
+            quality = Qualities[exampleIdx]
             exons = Exons[exampleIdx] 
             # NoiseMatrix = Noises[exampleIdx] 
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
+            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, est, quality)
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
@@ -147,6 +151,12 @@ class QPalma:
             trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
+            pdb.set_trace()
+
+            numq = Configuration.numQualSuppPoints
+            for pos,plif in enumerate(qualityPlifs):
+               totalQualityPenalties[pos*numq:(pos+1)*numq] = mat(plif.penalties).reshape(10,1)
+
             currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
             currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
             currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
@@ -207,12 +217,13 @@ class QPalma:
             c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
             c_AlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
 
-            emptyPlif = Plf(30)
+            emptyPlif = Plf(Configuration.numQualSuppPoints)
+            #pdb.set_trace()
             emptyPlif = emptyPlif.convert2SWIG()
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
+            c_qualityPlifsFeatures = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
 
             currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-            c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
+            c_WeightMatch, c_AlignmentScores, c_qualityPlifsFeatures)
 
             newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
             newEstAlign = zeros((est_len*num_path[exampleIdx],1))
@@ -240,7 +251,7 @@ class QPalma:
 
             #print 'newQualityPlifs'
             for i in range(num_path[exampleIdx]*Configuration.numQualPlifs):
-               newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
+               newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifsFeatures, i)
 
             #print "Calling destructors"
 
@@ -285,6 +296,8 @@ class QPalma:
                   if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
                      path_loss[pathNr+1] += 1
 
+               #pdb.set_trace()
+
                # Gewichte in restliche Zeilen der Matrix speichern
                wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
                allWeights[:,pathNr+1] = wp
@@ -297,9 +310,9 @@ class QPalma:
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                # Check wether scalar product + loss equals viterbi score
-               assert math.fabs(newAlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
-               'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
-               (newAlignmentScores[pathNr],AlignmentScores[pathNr+1])
+               #assert math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
+               #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \
+               #(AlignmentScores[pathNr],AlignmentScores[pathNr+1])
 
                #  # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
@@ -326,6 +339,8 @@ class QPalma:
 
                   # LMM.py code:
                   deltas  = firstFalseWeights - trueWeights
+                  pdb.set_trace()
+
                   if not __debug__:
                      const_added = solver.addConstraint(deltas, exampleIdx)
                      objValue,w,self.slacks = solver.solve()
@@ -338,7 +353,6 @@ class QPalma:
                         param[i] = w[i]
 
                      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
                #
                # end of one example processing 
                #
index fecbb99..71fe2c6 100644 (file)
@@ -69,7 +69,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Gapfunktion
    ####################
    h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
-   h.penalties = param[0:lengthSP].flatten().tolist()[0]
+   h.penalties = param[0:lengthSP].flatten().tolist()
    #h.transform = '+1' 
    h.transform = '' 
    h.name      = 'h' 
@@ -84,7 +84,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Donorfunktion
    ####################
    d.limits    = linspace(min_svm_score,max_svm_score,30) 
-   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
+   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()
    #d.transform = '+1' 
    d.transform = '' 
    d.name      = 'd' 
@@ -99,7 +99,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Acceptorfunktion
    ####################
    a.limits    = linspace(min_svm_score,max_svm_score,30) 
-   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
+   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()
    #a.transform = '+1' 
    a.transform = '' 
    a.name      = 'a' 
@@ -124,7 +124,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
       currentPlif.limits    = linspace(min_svm_score,max_svm_score,Configuration.numQualSuppPoints) 
       begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Configuration.numQualSuppPoints)
       end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Configuration.numQualSuppPoints)
-      currentPlif.penalties = param[begin:end].flatten().tolist()[0]
+      currentPlif.penalties = param[begin:end].flatten().tolist()
       currentPlif.transform = '' 
       currentPlif.name      = 'q' 
       currentPlif.max_len   = 100