+ fixed bug in the computeSpliceAlign... function
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 8 Feb 2008 20:33:39 +0000 (20:33 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 8 Feb 2008 20:33:39 +0000 (20:33 +0000)
+ added some more assertions and log messages

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7728 e1793c9e-67f9-0310-80fc-b846ff1f7b36

dyn_prog/fill_matrix.cpp
dyn_prog/result_align.cpp
qpalma/Configuration.py
qpalma/computeSpliceAlignWithQuality.py
scripts/compile_dataset.py
scripts/qpalma_main.py

index b494a2e..e10321b 100644 (file)
@@ -83,6 +83,9 @@ double getScore(struct penalty_struct* qualityScores, int mlen, int dnaChar, int
    int currentPos = (estChar-1)*6+dnaChar;
    struct penalty_struct currentPen = qualityScores[currentPos];
 
+   //if( estChar == 1 and dnaChar == 1)
+   //   printf("A-A : %f %f\n",currentPen.penalties[0],currentPen.penalties[1]);
+
        INT idx = 0 ;
        for (INT i=0; i<currentPen.len; i++)
                if (currentPen.limits[i] <= baseScore)
index d0c6944..2386360 100644 (file)
@@ -15,17 +15,19 @@ void increaseFeatureCount(penalty_struct* qparam, int dnaChar, int estChar, doub
 
    //printf("Current index %d est/dna: %d %d score %f\n",currentPos,estChar,dnaChar,estprb);
 
-   //printf("before\n");
-   //int p_idx;
-   //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
-   //   printf("%f ",currentStruct.limits[p_idx]);
+   //if( estChar == 1 and dnaChar == 1) {
+   //   printf("before\n");
+   //   int p_idx;
+   //   for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+   //      printf("%f ",currentStruct.limits[p_idx]);
+   //   }
+   //   printf("\n");
+
+   //   for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+   //      printf("%f ",currentStruct.penalties[p_idx]);
+   //   }
+   //   printf("\n");
    //}
-   //printf("\n");
-
-   //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
-   //   printf("%f ",currentStruct.penalties[p_idx]);
-   //}
-   //printf("\n");
 
    double value = estprb;
    int Lower = 0;
@@ -56,15 +58,18 @@ void increaseFeatureCount(penalty_struct* qparam, int dnaChar, int estChar, doub
    currentStruct.penalties[Upper] += weightup;
    currentStruct.penalties[Lower] += weightlow; 
 
-   //printf("estprb/Lower/Upper %f %d %d\n",estprb,Lower,Upper);
-   //printf("after\n");
-   //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
-   //   printf("%f ",currentStruct.limits[p_idx]);
+   //if( estChar == 1 and dnaChar == 1) {
+   //   printf("estprb/Lower/Upper %f %d %d\n",estprb,Lower,Upper);
+   //   int p_idx;
+   //   printf("after\n");
+   //   for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+   //      printf("%f ",currentStruct.limits[p_idx]);
+   //   }
+   //   printf("\n");
+   //   for(p_idx=0;p_idx<currentStruct.len;p_idx++)
+   //      printf("%f ",currentStruct.penalties[p_idx]);
+   //   printf("\n");
    //}
-   //printf("\n");
-   //for(p_idx=0;p_idx<currentStruct.len;p_idx++)
-   //   printf("%f ",currentStruct.penalties[p_idx]);
-   //printf("\n");
    qparam[currentPos] = currentStruct;
 }
 
index 43c39a1..110fcf6 100644 (file)
@@ -26,7 +26,7 @@ max_intron_len = 2000
 min_svm_score = 0.0 
 max_svm_score = 1.0
 
-numConstraintsPerRound = 1
+numConstraintsPerRound = 100
 
 ###############################################################################
 # 
@@ -84,12 +84,12 @@ mode = 'using_quality_scores'
 #numAccSuppPoints     = 20
 #numQualSuppPoints    =  8
 
-numLengthSuppPoints  = 2
-numDonSuppPoints     = 2
-numAccSuppPoints     = 2
-numQualSuppPoints    = 2
+numLengthSuppPoints  = 10
+numDonSuppPoints     = 10
+numAccSuppPoints     = 10
+numQualSuppPoints    = 10
 
-min_qual = -1
+min_qual = -5
 max_qual = 40
 
 USE_OPT = True
@@ -142,10 +142,10 @@ else:
 ###############################################################################
 
 training_begin    =    0
-training_end      =    2
+training_end      =    1000
 
-prediction_begin  =   10
-prediction_end    =   20
+prediction_begin  =   1000
+prediction_end    =   2000
 
 joinp = os.path.join
 
index 454a613..1ec72c9 100644 (file)
@@ -50,7 +50,8 @@ qualityPlifs=None):
    # number of matches: just have to look at the underlying est
    # est = dna(find(SpliceAlign==0)) # exon nucleotides
    exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
-   est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
+
+   #est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
 
    #length_est = sum(exon_sizes) ;
    totalESTLength = 0
@@ -89,6 +90,10 @@ qualityPlifs=None):
    dna_part = first_exon + second_exon
 
    assert len(dna_part) == len(est)
+   assert len(est) == sum(numChar)
+   
+   #print numChar
+   #print est
 
    map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
 
@@ -128,7 +133,7 @@ qualityPlifs=None):
                trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
                ctr += 1
 
-   assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
+   #assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
 
    totalNumChar = 0
    for idx in range(sizeMatchmatrix[0]):
index 3a9087d..4479d8b 100644 (file)
@@ -205,7 +205,7 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    # if we perform testing we cut a much wider region as we want to detect
    # how good we perform on a region
    import random
-   ADD_POS = random.randint(1,20)
+   ADD_POS = random.randint(1,1000)
 
    if test:
       up_cut = up_cut-ADD_POS
index 27aac5a..80c49f5 100644 (file)
@@ -86,7 +86,7 @@ class QPalma:
       self.logfh.write(string)
       self.logfh.flush()
 
-   def train(self):
+   def train(self,value):
       self.logfh = open('_qpalma_train.log','w+')
 
       beg = Conf.training_begin
@@ -130,9 +130,17 @@ class QPalma:
       i3 = range(lengthSP+donSP,lengthSP+donSP+accSP)
       i4 = range(lengthSP+donSP+accSP,lengthSP+donSP+accSP+mmatrixSP)
       i5 = range(lengthSP+donSP+accSP+mmatrixSP,lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
-      intervals = [] #i5,\ #i2,#i3,#i4,#i5]
+      
+      offset =lengthSP+donSP+accSP+mmatrixSP
+      #intervals = [[offset+2,offset+3]] #i5,\ #i2,#i3,#i4,#i5]
+      intervals = []
+
+      #param[offset+2] = 10.0
+      #param[offset+3] = 10.0
+
       zero_out(param,intervals)
-      pdb.set_trace()
+
+      #pdb.set_trace()
 
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
@@ -173,6 +181,7 @@ class QPalma:
             dna = Sequences[exampleIdx] 
             est = Ests[exampleIdx] 
             est = "".join(est)
+            est = est.lower()
 
             if Conf.mode == 'normal':
                quality = [40]*len(est)
@@ -180,10 +189,21 @@ class QPalma:
             if Conf.mode == 'using_quality_scores':
                quality = Qualities[exampleIdx]
 
+            #quality = [(int(math.fabs(e))) for e in quality]
+            #quality = [40]*len(est)
+
             exons = Exons[exampleIdx] 
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
+            #for idx,elem in enumerate(don_supp):
+            #   if elem != -inf:
+            #      don_supp[idx] = 0.0
+
+            #for idx,elem in enumerate(acc_supp):
+            #   if elem != -inf:
+            #      acc_supp[idx] = 0.0
+
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             if Conf.mode == 'using_quality_scores':
                trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
@@ -207,8 +227,6 @@ class QPalma:
             zero_out(currentPhi,intervals)
             zero_out(trueWeight,intervals)
 
-            #pdb.set_trace()
-
             # Calculate w'phi(x,y) the total score of the alignment
             trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
 
@@ -237,6 +255,7 @@ class QPalma:
 
             # check that splice site scores are at dna positions as expected by
             # the dynamic programming component
+
             for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
                assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
                or dna[d_pos+1] == 't'), pdb.set_trace()
@@ -351,10 +370,10 @@ class QPalma:
                zero_out(features,intervals)
                zero_out(allWeights[:,pathNr+1],intervals)
 
-               #pdb.set_trace()
-
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
+               #pdb.set_trace()
+
                distinct_scores = False
                if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
                   distinct_scores = True
@@ -363,24 +382,26 @@ class QPalma:
                if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
                   pdb.set_trace()
 
-               if exampleIdx == 1:
-                  self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
-                  self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+               self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
 
-               pdb.set_trace()
+               #pdb.set_trace()
 
                # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
                   true_map[pathNr+1] = 1
 
-               #pdb.set_trace()
 
                #assert AlignmentScores[0] <= max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
                if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+                  print "suboptimal_example %d\n" %exampleIdx
+                  trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+                  computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+
+                  pdb.set_trace()
                   suboptimal_example += 1
                   self.plog("suboptimal_example %d\n" %exampleIdx)
 
-
                # the true label sequence should not have a larger score than the maximal one WHYYYYY?
                # this means that all n-best paths are to close to each other 
                # we have to extend the n-best search to a (n+1)-best
@@ -394,9 +415,7 @@ class QPalma:
                      firstFalseIdx = map_idx
                      break
 
-               #if exampleIdx == 1:
                if False:
-
                   self.plog("Is considered as: %d\n" % true_map[1])
 
                   result_len = currentAlignment.getResultLength()
@@ -422,9 +441,8 @@ class QPalma:
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
-                  trueWeights       = trueWeight
                   firstFalseWeights = allWeights[:,firstFalseIdx]
-                  differenceVector  = trueWeights - firstFalseWeights
+                  differenceVector  = trueWeight - firstFalseWeights
                   #pdb.set_trace()
 
                   if Conf.USE_OPT:
@@ -452,6 +470,7 @@ class QPalma:
                   sum_xis +=  elem
 
                print 'sum of slacks is %f'% sum_xis 
+               self.plog('sum of slacks is %f\n'% sum_xis)
    
                for i in range(len(param)):
                   param[i] = w[i]
@@ -565,42 +584,52 @@ class QPalma:
          if Conf.mode == 'using_quality_scores':
             quality = Qualities[exampleIdx]
 
+         #quality = [40]*len(est)
+
          exons = Exons[exampleIdx] 
          # NoiseMatrix = Noises[exampleIdx] 
          don_supp = Donors[exampleIdx] 
          acc_supp = Acceptors[exampleIdx] 
 
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         if Conf.mode == 'using_quality_scores':
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
-            computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
-         else:
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-         
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+         # for idx,elem in enumerate(don_supp):
+         #    if elem != -inf:
+         #       don_supp[idx] = 0.0
+
+         # for idx,elem in enumerate(acc_supp):
+         #    if elem != -inf:
+         #       acc_supp[idx] = 0.0
+
+         ## Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+         #if Conf.mode == 'using_quality_scores':
+         #   trueSpliceAlign, trueWeightMatch, trueWeightQuality =\
+         #   computeSpliceAlignWithQuality(dna, exons, est, quality, qualityPlifs)
+         #else:
+         #   trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+         #
+         ## Calculate the weights
+         #trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+         #trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+         #if Conf.mode == 'using_quality_scores':
+         #   totalQualityPenalties = param[-totalQualSP:]
+         #   currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
 
          currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
          currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
          currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
          currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
 
-         if Conf.mode == 'using_quality_scores':
-            totalQualityPenalties = param[-totalQualSP:]
-            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
          # Calculate w'phi(x,y) the total score of the alignment
-         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+         #trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
 
          # The allWeights vector is supposed to store the weight parameter
          # of the true alignment as well as the weight parameters of the
          # 1 other alignments
-         allWeights = zeros((Conf.numFeatures,1+1))
-         allWeights[:,0] = trueWeight[:,0]
+         #allWeights = zeros((Conf.numFeatures,1+1))
+         #allWeights[:,0] = trueWeight[:,0]
 
-         AlignmentScores = [0.0]*(1+1)
-         AlignmentScores[0] = trueAlignmentScore
+         #AlignmentScores = [0.0]*(1+1)
+         #AlignmentScores[0] = trueAlignmentScore
 
          ################## Calculate wrong alignment(s) ######################
 
@@ -617,6 +646,7 @@ class QPalma:
          dna_len = len(dna)
          est_len = len(est)
 
+
          ps = h.convert2SWIG()
 
          prb = QPalmaDP.createDoubleArrayFromList(quality)
@@ -704,25 +734,21 @@ class QPalma:
          self.plog(line2+'\n')
          self.plog(line3+'\n')
 
-         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+         #weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
 
-         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-         for qidx in range(Conf.totalQualSuppPoints):
-            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+         #decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+         #for qidx in range(Conf.totalQualSuppPoints):
+         #   decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
 
          # Gewichte in restliche Zeilen der Matrix speichern
-         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
-         allWeights[:,pathNr+1] = wp
+         #wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+         #allWeights[:,pathNr+1] = wp
 
-         hpen = mat(h.penalties).reshape(len(h.penalties),1)
-         dpen = mat(d.penalties).reshape(len(d.penalties),1)
-         apen = mat(a.penalties).reshape(len(a.penalties),1)
-         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-         # if the pathNr-best alignment is very close to the true alignment consider it as true
-         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-            true_map[pathNr+1] = 1
+         #hpen = mat(h.penalties).reshape(len(h.penalties),1)
+         #dpen = mat(d.penalties).reshape(len(d.penalties),1)
+         #apen = mat(a.penalties).reshape(len(a.penalties),1)
+         #features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+         #AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
          e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
 
@@ -976,15 +1002,13 @@ def zero_out(vec,intervals):
 if __name__ == '__main__':
    qpalma = QPalma()
 
-   if len(sys.argv) == 2:
-      mode = sys.argv[1]
-      assert mode == 'train'
-      qpalma.train()
+   mode = sys.argv[1]
+
+   if len(sys.argv) == 3 and mode == 'train':
+      qpalma.train(int(sys.argv[2]))
 
-   elif len(sys.argv) == 3:
-      mode = sys.argv[1]
+   elif len(sys.argv) == 3 and mode == 'predict':
       param_filename = sys.argv[2]
-      assert mode == 'predict' 
       assert os.path.exists(param_filename)
       qpalma.evaluate(param_filename)
    else: