+ added part of the feature calculation for the quality score values
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sun, 6 Jan 2008 16:12:07 +0000 (16:12 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sun, 6 Jan 2008 16:12:07 +0000 (16:12 +0000)
+ rearranged some code to eliminate redundancies -> Plif class
+ computeSpliceAlignWithQuality is responsible for the extraction of the feature
counts. this method could be combined with the "normal" computeSpliceAlign later
on.
TODO:
- fix GC problem during parameter returning of the alignemnt
- enable the use of quality scores in the result_align function

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7235 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/penalty_info.h
QPalmaDP/qpalma_dp.cpp
QPalmaDP/result_align.cpp
python/Plif.py [new file with mode: 0644]
python/computeSpliceAlignWithQuality.py [new file with mode: 0644]
python/qpalma.py
python/set_param_palma.py

index a5725b0..30de4aa 100644 (file)
@@ -40,8 +40,6 @@ void copy_penalty_struct(struct penalty_struct *old, struct penalty_struct *newp
 REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value, 
                                        REAL* svm_values, bool follow_next=true) ;
 
-extern struct penalty_struct *qualityScores;
-
 enum mode { NORMAL, USE_QUALITY_SCORES};
 
 #endif
index 3339376..1fa3a4f 100644 (file)
@@ -104,6 +104,11 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   mmatrix_param = new int[(mlen*mlen)*nr_paths];
   alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
   qualityScoresAllPaths = new penalty_struct[24*nr_paths];
+  int qidx;
+  for(qidx=0;qidx<24*nr_paths;qidx++) {
+      penalty_struct p;
+      qualityScoresAllPaths[qidx] = p;
+  }
 
   // printf("before memset...\n");
 
@@ -124,8 +129,12 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     int* e_align = est_align + (est_len-1)*z ;    //pointer
     int* mparam  = mmatrix_param + (mlen*mlen)*z; //pointer
     penalty_struct* qparam = qualityScoresAllPaths + (24*z);
+
     
+    printf("before call to result_align...\n");
+  
     bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qparam);
+    printf("after call to result_align...\n");
       
       if(DNA_ARRAY != 0) {
        delete[] DNA_ARRAY;
@@ -187,27 +196,28 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
 void Alignment::getAlignmentResults(int* s_align, int* e_align,
       int* mmatrix_p, double* alignscores, penalty_struct* qScores) {
 
-   // printf("Entering getAlignmentResults...\n");
+   printf("Entering getAlignmentResults...\n");
    uint splice_align_size = (dna_len-1)*nr_paths;
    uint est_align_size = (est_len-1)*nr_paths;
    uint mmatrix_param_size = (mlen*mlen)*nr_paths;
    uint alignmentscores_size = nr_paths; //alignment score for each path/matrix
+   uint qScores_size = 24*nr_paths; //alignment score for each path/matrix
 
    int idx;
    for(idx=0; idx<splice_align_size; idx++)
       s_align[idx] = splice_align[idx];
 
-   for(idx=0; idx<splice_align_size; idx++)
+   for(idx=0; idx<est_align_size; idx++)
       e_align[idx] =  est_align[idx];
 
-   for(idx=0; idx<splice_align_size; idx++)
+   for(idx=0; idx<mmatrix_param_size; idx++)
       mmatrix_p[idx] = mmatrix_param[idx];
 
    for(idx=0; idx<splice_align_size; idx++)
       alignscores[idx] = alignmentscores[idx];
    
-   for(idx=0; idx<splice_align_size; idx++)
+   for(idx=0; idx<qScores_size; idx++)
       qScores[idx] = qualityScoresAllPaths[idx];
 
-   // printf("Leaving getAlignmentResults...\n");
+   printf("Leaving getAlignmentResults...\n");
 }
index 1bfe6b1..99b3238 100644 (file)
@@ -5,26 +5,28 @@ using namespace std;
 
 void increaseFeatureCount(penalty_struct* qparam, int dnanum, int estnum, double estprb) {
 
+   penalty_struct currentStruct = qparam[estnum*6+dnanum];
+
    double value = estprb;
    int Lower = 0;
    int idx;
-   for (idx=0;idx<qparam->len;idx++) {
-      if (qparam->limits[idx] <= value)
+   for (idx=0;idx<currentStruct.len;idx++) {
+      if (currentStruct.limits[idx] <= value)
          Lower++;
    }
 
    if (Lower == 0)
-         qparam->penalties[0] += 1;
+         currentStruct.penalties[0] += 1;
 
-   if (Lower == qparam->len)
-         qparam->penalties[qparam->len-1] += 1;
+   if (Lower == currentStruct.len)
+         currentStruct.penalties[currentStruct.len-1] += 1;
 
    Lower -= 1;
    int Upper = Lower+1; // x-werte bleiben fest
-   double weightup  = 1.0*(value - qparam->limits[Lower]) / (qparam->limits[Upper] - qparam->limits[Lower]);
-   double weightlow = 1.0*(qparam->limits[Upper] - value) / (qparam->limits[Upper] - qparam->limits[Lower]);
-   qparam->penalties[Upper] = qparam->penalties[Upper] + weightup;
-   qparam->penalties[Lower] = qparam->penalties[Lower] + weightlow;
+   double weightup  = 1.0*(value - currentStruct.limits[Lower]) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
+   double weightlow = 1.0*(currentStruct.limits[Upper] - value) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
+   currentStruct.penalties[Upper] = currentStruct.penalties[Upper] + weightup;
+   currentStruct.penalties[Lower] = currentStruct.penalties[Lower] + weightlow;
 }
 
 bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions, penalty_struct* qparam)
@@ -104,7 +106,7 @@ bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* r
 
       prbnum = prb[i-1];
       chastitynum = chastity[i-1];
-      increaseFeatureCount(qparam,dnanum,estnum,prbnum);
+      //increaseFeatureCount(qparam,dnanum,estnum,prbnum);
 
       mparam[mlen*dnanum +estnum] ++ ;
     }
@@ -129,7 +131,7 @@ bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* r
 
       prbnum = prb[i-1];
       chastitynum = chastity[i-1];
-      increaseFeatureCount(qparam,dnanum,estnum,prbnum);
+      //increaseFeatureCount(qparam,dnanum,estnum,prbnum);
 
       mparam[mlen*dnanum +estnum] ++ ;
     }
diff --git a/python/Plif.py b/python/Plif.py
new file mode 100644 (file)
index 0000000..433c3d4
--- /dev/null
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import QPalmaDP
+
+class Plf: 
+   """
+   means piecewise linear function
+   """
+
+   def __init__(self,num=None):
+      if num == None:
+         self.len = 0
+         self.limits = []
+         self.penalties = []
+         self.transform = ''
+         self.name = ''
+         self.max_len = 0
+         self.min_len = 0
+      else:
+         self.len = num
+         self.limits = [0]*num
+         self.penalties = [0]*num
+         self.transform = ''
+         self.name = ''
+         self.max_len = 0
+         self.min_len = 0
+         
+
+   def convert2SWIG(self):
+      ps = QPalmaDP.penalty_struct()
+
+      ps.len = len(self.limits)
+   
+      ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
+      ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
+
+      ps.max_len = self.max_len
+      ps.min_len = self.min_len
+      ps.transform = 0
+      ps.name = self.name
+
+      return ps
+
diff --git a/python/computeSpliceAlignWithQuality.py b/python/computeSpliceAlignWithQuality.py
new file mode 100644 (file)
index 0000000..f53098a
--- /dev/null
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from numpy.matlib import zeros
+import pdb
+from Plif import *
+
+def  computeSpliceAlignWithQuality(dna, exons, quality):
+   """
+   Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
+   Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
+   (ausser dem letzten und ersten)
+    und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
+   bsp: 
+   cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
+   cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
+   """
+
+   numberOfExons = (exons.shape)[0] # how many rows ?
+   exonSizes = [-1]*numberOfExons
+
+   assert numberOfExons == 3
+
+   for idx in range(numberOfExons):
+      exonSizes[idx] = exons[idx,1] - exons[idx,0]
+
+   sizeMatchmatrix = 6 # -acgtn
+
+   # SpliceAlign vector describes alignment: 
+   # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
+   SpliceAlign = []
+
+   if exons[0,0] > 0:
+      SpliceAlign.extend([4]*(exons[0,0]))
+
+   for idx in range(numberOfExons):
+      exonLength = exonSizes[idx]
+      SpliceAlign.extend([0]*exonLength)
+      
+      if idx < numberOfExons-1:
+         intronLength = exons[idx+1,0] - exons[idx,1]
+         SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
+
+   if len(dna) > exons[2,1]:
+      SpliceAlign.extend([4]*(len(dna)-exons[2,1]))
+
+   assert len(SpliceAlign) == len(dna), pdb.set_trace()
+
+   # number of matches: just have to look at the underlying est
+   # est = dna(find(SpliceAlign==0)) # exon nucleotides
+   exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
+   est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
+
+   #length_est = sum(exon_sizes) ;
+   totalESTLength = 0
+   for elem in exonSizes:
+      totalESTLength += elem
+
+   assert totalESTLength == len(est) 
+
+   # counts the occurences of a,c,g,t,n in this order
+   numChar = [0]*sizeMatchmatrix
+
+   trueQualityPlifs =  [Plf()] * 24
+
+   for elem in est:
+      if elem == 'a':
+         numChar[0] += 1
+      if elem == 'c':
+         numChar[1] += 1
+      if elem == 'g':
+         numChar[2] += 1
+      if elem == 't':
+         numChar[3] += 1
+      if elem == 'n':
+         numChar[4] += 1
+
+   totalNumChar = 0
+   for idx in range(sizeMatchmatrix):
+      totalNumChar += numChar[idx]
+
+   assert totalNumChar == len(est)
+
+   # writing in weight match matrix
+   # matrix is saved columnwise
+   trueWeightMatch = zeros((sizeMatchmatrix*sizeMatchmatrix,1)) # Scorematrix fuer Wahrheit
+   for idx in range(1,sizeMatchmatrix):
+      trueWeightMatch[(sizeMatchmatrix+1)*idx] = numChar[idx-1]
+
+   return SpliceAlign, trueWeightMatch, trueQualityPlifs
+
index 8d7af91..d615b6a 100644 (file)
@@ -11,6 +11,7 @@ import sys
 import subprocess
 import scipy.io
 import pdb
+import os.path
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
@@ -25,6 +26,7 @@ from paths_load_data_pickle import *
 from computeSpliceWeights import *
 from set_param_palma import *
 from computeSpliceAlign import *
+from computeSpliceAlignWithQuality import *
 from penalty_lookup_new import *
 from compute_donacc import *
 from TrainingParam import Param
@@ -32,6 +34,7 @@ from export_param import *
 
 import Configuration
 
+from Plif import Plf
 
 
 def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
@@ -82,19 +85,26 @@ class QPalma:
       self.logfh = open('qpalma.log','w+')
       gen_file= '%s/genome.config' % self.ARGS.basedir
 
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+      ginfo_filename = 'genome_info.pickle'
 
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
+      if not os.path.exists(ginfo_filename):
+         cmd = ['']*4
+         cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+         cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+         cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+         cmd[3] = 'save genome_info.mat genome_info'  
+         full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
 
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      self.genome_info = ginfo['genome_info']
+         obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+         out,err = obj.communicate()
+         assert err == '', 'An error occured!\n%s'%err
+
+         ginfo = scipy.io.loadmat('genome_info.mat')
+         self.genome_info = ginfo['genome_info']
+         cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+
+      else:
+         self.genome_info = cPickle.load(open(ginfo_filename))
 
       self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
 
@@ -208,6 +218,8 @@ class QPalma:
             dna = Sequences[exampleIdx] 
             est = Ests[exampleIdx] 
 
+            quality = [0.0]*len(est)
+
             exons = Exons[exampleIdx] 
             # NoiseMatrix = Noises[exampleIdx] 
             don_supp = Donors[exampleIdx] 
@@ -215,7 +227,7 @@ class QPalma:
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
-            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, qualityPlifs)
+            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
@@ -285,40 +297,58 @@ class QPalma:
             c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
             c_AlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
 
-            emptyPlif = Plf()
-            emptyPlif = curPlif.convert2SWIG()
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(emptyPlif*(24*num_path[exampleIdx]))
+            emptyPlif = Plf(30)
+            emptyPlif = emptyPlif.convert2SWIG()
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(24*num_path[exampleIdx]))
+
+            print "Trying to fetch results from c-code"
 
             currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
             c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
-            del currentAlignment
+
+            print "Fetched results from c-code"
 
             newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
             newEstAlign = zeros((est_len*num_path[exampleIdx],1))
             newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newQualityPlifs = [None]*num_path[exampleIdx]*24
 
-            #print 'newSpliceAlign'
+            print 'newSpliceAlign'
             for i in range(dna_len*num_path[exampleIdx]):
                newSpliceAlign[i] = c_SpliceAlign[i]
             #   print '%f' % (spliceAlign[i])
 
-            #print 'newEstAlign'
+            print 'newEstAlign'
             for i in range(est_len*num_path[exampleIdx]):
                newEstAlign[i] = c_EstAlign[i]
             #   print '%f' % (spliceAlign[i])
 
-            #print 'weightMatch'
+            print 'weightMatch'
             for i in range(mm_len*num_path[exampleIdx]):
                newWeightMatch[i] = c_WeightMatch[i]
             #   print '%f' % (weightMatch[i])
 
+            print 'AlignmentScores'
             for i in range(num_path[exampleIdx]):
                AlignmentScores[i+1] = c_AlignmentScores[i]
 
+            print 'newQualityPlifs'
+            for i in range(num_path[exampleIdx]*24):
+               newQualityPlifs[i+1] = c_qualityPlifs[i]
+
+            print "Calling destructors"
+
+            del c_SpliceAlign
+            del c_EstAlign
+            del c_WeightMatch
+            del c_AlignmentScores
+            del c_qualityPlifs
+            del currentAlignment
+
             newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
             newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
             # Calculate weights of the respective alignments Note that we are
-            # calculating n-best alignments without any hamming loss, so we
+            # calculating n-best alignments without hamming loss, so we
             # have to keep track which of the n-best alignments correspond to
             # the true one in order not to incorporate a true alignment in the
             # constraints. To keep track of the true and false alignments we
index 25746ce..c98e270 100644 (file)
@@ -5,6 +5,7 @@ import math
 import numpy.matlib
 import QPalmaDP
 import pdb
+from Plif import *
 
 def linspace(a,b,n):
    intervalLength = b-a
@@ -34,31 +35,6 @@ def logspace(a,b,n):
 def log10(x):
    return math.log(x)/math.log(10)
 
-class Plf: #means piecewise linear function
-
-   def __init_(self):
-      self.len = 0
-      self.limits = []
-      self.penalties = []
-      self.transform = ''
-      self.name = ''
-      self.max_len = 0
-      self.min_len = 0
-
-   def convert2SWIG(self):
-      ps = QPalmaDP.penalty_struct()
-
-      ps.len = len(self.limits)
-   
-      ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
-      ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
-
-      ps.max_len = self.max_len
-      ps.min_len = self.min_len
-      ps.transform = 0
-      ps.name = self.name
-
-      return ps
 
 
 def set_params_pa():