+ modifications and bugfixes
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 11 Jan 2008 18:17:53 +0000 (18:17 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 11 Jan 2008 18:17:53 +0000 (18:17 +0000)
+ added some checks for correct dimensions
+ replaced magic numbers by Configuration.varname
TODO
- check features / parameters matrices for correct values

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7294 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/fill_matrix.cpp
QPalmaDP/qpalma_dp.cpp
QPalmaDP/qpalma_dp.h
python/Configuration.py
python/computeSpliceAlignWithQuality.py
python/paths_load_data_solexa.py
python/penalty_lookup_new.py
python/qpalma.py
python/set_param_palma.py

index f552121..773f229 100644 (file)
@@ -111,7 +111,7 @@ double getScore(struct penalty_struct* qualityScores, double* matchmatrix, int m
 
 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode)
 {      
-  // printf("Entering fill_matrix...\n");
+  //printf("Entering fill_matrix...\n");
 
   /*********************************************************************************************/
   // variables
@@ -119,15 +119,14 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
   const int mlen = 6; // length of matchmatrix
   int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
 
-  printf("1st plif first three elems %f %f %f\n",qualityScores[0].penalties[0],qualityScores[0].penalties[1],qualityScores[0].penalties[2]);
-  printf("2nd plif first three elems %f %f %f\n",qualityScores[1].penalties[0],qualityScores[1].penalties[1],qualityScores[1].penalties[2]);
+  //printf("1st plif first three elems %f %f %f\n",qualityScores[0].penalties[0],qualityScores[0].penalties[1],qualityScores[0].penalties[2]);
+  //printf("2nd plif first three elems %f %f %f\n",qualityScores[1].penalties[0],qualityScores[1].penalties[1],qualityScores[1].penalties[2]);
 
   double baseScore;
   double *est_scores = new double[est_len];
   for(int i=0;i<est_len;i++)
    est_scores[i] = 0.0;
 
-
   double prevValue ;
   double tempValue;
   int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
@@ -220,6 +219,7 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
   /*********************************************************************************************/      
 
 
+   //printf("after initialization\n");
   
   /*********************************************************************************************/
   // all other entries for all matrices, last column and row treated seperately
@@ -239,6 +239,8 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
       baseScore = est_scores[i-1];
 
       
+      //printf("at iteration %d %d\n",i,j);
+
       //if introns of length greater than max_len possible
       if (j > max_len)
       {
@@ -482,8 +484,6 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
   delete[] array;
   delete[] scores;
 
-  // printf("Leaving fill_matrix...\n");
+  //printf("Leaving fill_matrix...\n");
   return;
 }
-
-
index d35d434..cae6d15 100644 (file)
@@ -6,7 +6,7 @@ using namespace std;
   myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ...
   print_matrix) */
 
-Alignment::Alignment() {
+Alignment::Alignment(int numq) {
       len = 0;
       limits = 0;
       penalties = 0;
@@ -23,6 +23,7 @@ Alignment::Alignment() {
       mmatrix_param = 0;
       alignmentscores = 0;
       qualityScoresAllPaths = 0;
+      numQualSuppPoints = numq;
 }
 
 void Alignment::setQualityMatrix(double* qMat, int length){
@@ -106,7 +107,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
   for(qidx=0;qidx<25*nr_paths;qidx++) {
       penalty_struct p;
       init_penalty_struct(p);
-      p.len = 10;
+      p.len = numQualSuppPoints;
       p.limits = (REAL*) calloc(p.len,sizeof(REAL));
       p.penalties = (REAL*) calloc(p.len,sizeof(REAL));
       qualityScoresAllPaths[qidx] = p;
@@ -143,7 +144,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
    DNA_ARRAY = new double[result_length];
    EST_ARRAY = new double[result_length];
 
-    //converting result
+    //backtracking
     int i = max_score_positions[2*z] ; //i (est)
     int j = max_score_positions[2*z +1] ; //j (dna)
     int prev_i = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_i; 
index 80173a7..efb2b5c 100644 (file)
@@ -76,6 +76,8 @@ class Alignment {
       int mlen;
       int nr_paths;
 
+      int numQualSuppPoints;
+
       uint splice_align_size ;
       uint est_align_size ;
       uint mmatrix_param_size ;
@@ -96,7 +98,7 @@ class Alignment {
       INT use_svm;
 
    public:
-      Alignment();
+      Alignment(int numq);
       ~Alignment() {
          if(splice_align != 0)
             delete[] splice_align;
index 6457e37..419b0be 100644 (file)
@@ -9,6 +9,47 @@ fixedParam = numpy.matlib.mat(
        [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
        [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
        [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
        [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
        [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
        [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
@@ -29,7 +70,8 @@ fixedParam = numpy.matlib.mat(
        [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
        [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
        [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
-       [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
+       [ 0.4177651 ], [ 0.01360547], [ 0.29069319]
+       ])
 
 ###########################################################
 #
@@ -59,7 +101,7 @@ mode = 'using_quality_scores'
 numDonSuppPoints     = 30
 numAccSuppPoints     = 30
 numLengthSuppPoints  = 30 
-numQualSuppPoints    = 10
+numQualSuppPoints    = 4
 
 if mode == 'normal':
    sizeMMatrix          = 36
@@ -79,3 +121,6 @@ iter_steps = 2
 remove_duplicate_scores = False
 print_matrix            = False
 anzpath                 = 2
+
+fixedParam = fixedParam[:numFeatures]
+
index f5c7652..885eb08 100644 (file)
@@ -62,28 +62,17 @@ def  computeSpliceAlignWithQuality(dna, exons, read, quality):
    # counts the occurences of a,c,g,t,n in this order
    numChar = [0]*sizeMatchmatrix
 
-   # counts the occurrnces of a,c,g,t,n with their quality scores
+   # counts the occurrences of a,c,g,t,n with their quality scores
    trueQualityPlifs =  [Plf(Configuration.numQualSuppPoints)] * Configuration.numQualPlifs
    for currentPlif in trueQualityPlifs:
-      currentPlif.limits = mat(linspace(0,40,10)).reshape(10,1)
-      currentPlif.penalties = zeros((10,1))
+      currentPlif.limits = mat(linspace(0,40,Configuration.numQualSuppPoints)).reshape(Configuration.numQualSuppPoints,1)
+      currentPlif.penalties = zeros((Configuration.numQualSuppPoints,1))
 
-   for pos, elem in enumerate(read):
+   for elem in ['a','c','g','t','n']
       currentPlif = trueQualityPlifs[base_coord(elem,elem)]
 
-      value = quality[pos]
-      Lower = len([el for el in currentPlif.limits if el <= value])
-
-      if Lower == 0:
-         currentPlif.penalties[0] += 1
-      elif Lower == len(currentPlif.limits):
-         currentPlif.penalties[-1] += 1
-      else:
-         # because we count from 0 in python
-         Lower -= 1
-         Upper = Lower+1 ; # x-werte bleiben fest
-         currentPlif.penalties[Upper] += 1
-         currentPlif.penalties[Lower] += 1
+      for idx in range(Configuration.numQualSuppPoints):
+         currentPlif.penalties[idx] = 1
 
    for pos, elem in enumerate(est):
       if elem == 'a':
@@ -97,7 +86,6 @@ def  computeSpliceAlignWithQuality(dna, exons, read, quality):
       if elem == 'n':
          numChar[4] += 1
 
-
    totalNumChar = 0
    for idx in range(sizeMatchmatrix):
       totalNumChar += numChar[idx]
index 797d70d..33e6e25 100644 (file)
@@ -40,6 +40,8 @@ def paths_load_data_solexa(expt,genome_info,PAR):
 
       currentGene = allGenes[gene_id]
 
+      seq = seq.lower().replace('n','a')
+
       if len(currentGene.exons) != 3:
          continue
 
index a0f5d1c..9837dd5 100644 (file)
@@ -35,9 +35,9 @@ def penalty_lookup_new(penalty_struct, value):
    if count == 0:
       pen = penalties[0]
    elif count == len(limits):
-     pen=penalties[-1]
+      pen=penalties[-1]
    else:
-     pen = (penalties[count]*(value-limits[count-1]) + penalties[count-1]\
+      pen = (penalties[count]*(value-limits[count-1]) + penalties[count-1]\
          *(limits[count]-value)) / (limits[count]-limits[count-1])
 
    return pen
index 283ad9e..1fd0924 100644 (file)
@@ -63,6 +63,7 @@ class QPalma:
 
    def plog(self,string):
       self.logfh.write(string)
+      self.logfh.flush()
 
    def run(self):
       # Load the whole dataset 
@@ -87,8 +88,8 @@ class QPalma:
       anzpath                 = Configuration.anzpath 
 
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      #param = Configuration.fixedParam 
-      param = cPickle.load(open('initial_param.pickle'))
+      param = Configuration.fixedParam 
+      #param = cPickle.load(open('initial_param.pickle'))
 
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
@@ -120,6 +121,7 @@ class QPalma:
       accSP       = Configuration.numAccSuppPoints
       lengthSP    = Configuration.numLengthSuppPoints
       mmatrixSP   = Configuration.sizeMMatrix
+      numq        = Configuration.numQualSuppPoints
       totalQualSP = Configuration.totalQualSuppPoints
 
       currentPhi = zeros((Configuration.numFeatures,1))
@@ -130,6 +132,7 @@ class QPalma:
          if iteration_nr == iteration_steps:
             break
 
+         #for exampleIdx in range(59,self.numExamples):
          for exampleIdx in range(self.numExamples):
             if (exampleIdx%10) == 0:
                print 'Current example nr %d' % exampleIdx
@@ -151,11 +154,8 @@ class QPalma:
             trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
-            pdb.set_trace()
-
-            numq = Configuration.numQualSuppPoints
             for pos,plif in enumerate(qualityPlifs):
-               totalQualityPenalties[pos*numq:(pos+1)*numq] = mat(plif.penalties).reshape(10,1)
+               totalQualityPenalties[pos*numq:(pos+1)*numq] = mat(plif.penalties).reshape(numq,1)
 
             currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
             currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
@@ -191,6 +191,8 @@ class QPalma:
             est_len = len(est)
             ps = h.convert2SWIG()
 
+            #pdb.set_trace()
+
             prb = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
             chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
 
@@ -202,9 +204,8 @@ class QPalma:
             a_len = len(acceptor)
             acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
-            currentAlignment = QPalmaDP.Alignment()
+            currentAlignment = QPalmaDP.Alignment(Configuration.numQualSuppPoints )
             c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
             #print 'PYTHON: Calling myalign...'
             # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
             currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
@@ -334,29 +335,32 @@ class QPalma:
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
-                  trueWeights        = allWeights[:,0]
-                  firstFalseWeights  = allWeights[:,firstFalseIdx]
-
-                  # LMM.py code:
-                  deltas  = firstFalseWeights - trueWeights
+                  trueWeights       = allWeights[:,0]
+                  firstFalseWeights = allWeights[:,firstFalseIdx]
+                  differenceVector  = firstFalseWeights - trueWeights
                   pdb.set_trace()
 
                   if not __debug__:
-                     const_added = solver.addConstraint(deltas, exampleIdx)
-                     objValue,w,self.slacks = solver.solve()
-
-                     sum_xis = 0
-                     for elem in self.slacks:
-                        sum_xis +=  elem
+                     const_added = solver.addConstraint(differenceVector, exampleIdx)
 
-                     for i in range(len(param)):
-                        param[i] = w[i]
-
-                     [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
                #
                # end of one example processing 
                #
-            #if exampleIdx == 100:
+
+            # call solver every nth step
+            if exampleIdx != 0 and exampleIdx % 3 == 0 and not __debug__:
+               objValue,w,self.slacks = solver.solve()
+      
+               print "objValue is %f" % objValue
+
+               sum_xis = 0
+               for elem in self.slacks:
+                  sum_xis +=  elem
+   
+               for i in range(len(param)):
+                  param[i] = w[i]
+
+               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
             #   break
          
          #break
index 71fe2c6..2859d2d 100644 (file)
@@ -57,7 +57,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    h = Plf()
    d = Plf()
    a = Plf()
-   qualityPlifs = [Plf()]*Configuration.numQualPlifs
+   qualityPlifs = [None]*Configuration.numQualPlifs
 
    donSP       = Configuration.numDonSuppPoints
    accSP       = Configuration.numAccSuppPoints
@@ -69,7 +69,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Gapfunktion
    ####################
    h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
-   h.penalties = param[0:lengthSP].flatten().tolist()
+   h.penalties = param[0:lengthSP].flatten().tolist()[0]
    #h.transform = '+1' 
    h.transform = '' 
    h.name      = 'h' 
@@ -84,7 +84,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Donorfunktion
    ####################
    d.limits    = linspace(min_svm_score,max_svm_score,30) 
-   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()
+   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
    #d.transform = '+1' 
    d.transform = '' 
    d.name      = 'd' 
@@ -99,7 +99,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Acceptorfunktion
    ####################
    a.limits    = linspace(min_svm_score,max_svm_score,30) 
-   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()
+   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
    #a.transform = '+1' 
    a.transform = '' 
    a.name      = 'a' 
@@ -109,7 +109,6 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    a.use_svm   = 0 
    a.next_id   = 0 
 
-
    ####################
    # Matchmatrix
    ####################
@@ -120,11 +119,11 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Quality Plifs
    ####################
    for idx in range(Configuration.numQualPlifs):
-      currentPlif = qualityPlifs[idx]
+      currentPlif = Plf()
       currentPlif.limits    = linspace(min_svm_score,max_svm_score,Configuration.numQualSuppPoints) 
       begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Configuration.numQualSuppPoints)
       end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Configuration.numQualSuppPoints)
-      currentPlif.penalties = param[begin:end].flatten().tolist()
+      currentPlif.penalties = param[begin:end].flatten().tolist()[0]
       currentPlif.transform = '' 
       currentPlif.name      = 'q' 
       currentPlif.max_len   = 100  
@@ -135,10 +134,6 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    return h,d,a,mmatrix,qualityPlifs
 
 if __name__ == '__main__':
-   #min_intron_len=20
-   #max_intron_len=1000
-   #print logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
-
    min_svm_score=-5
    max_svm_score=5
    print linspace(min_svm_score,max_svm_score,30)