+ fixed a bug in the C++ interface
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 12 Dec 2007 16:32:22 +0000 (16:32 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 12 Dec 2007 16:32:22 +0000 (16:32 +0000)
+ removed some debugging printfs
+ added first version of an evaluation script
+ added standalone code to svn

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7048 e1793c9e-67f9-0310-80fc-b846ff1f7b36

17 files changed:
QPalmaDP/fill_matrix.cpp
QPalmaDP/qpalma_dp.cpp
QPalmaDP/qpalma_dp.h
python/TrainingParam.py
python/qpalma.py
python/set_param_palma.py
standalone/palma/__init__.py [new file with mode: 0644]
standalone/palma/alignment_tool.py [new file with mode: 0644]
standalone/palma/data/elegans_WS150_ss=1_il=1.param.bz2 [new file with mode: 0644]
standalone/palma/genomic.py [new file with mode: 0644]
standalone/palma/model.py [new file with mode: 0644]
standalone/palma/output_formating.py [new file with mode: 0644]
standalone/palma/palma_utils.py [new file with mode: 0644]
standalone/palma/seqdict.py [new file with mode: 0644]
standalone/palma/signal_detectors.py [new file with mode: 0644]
standalone/setup.py [new file with mode: 0755]
tools/evaluatePrediction.py [new file with mode: 0644]

index 0c40eee..6104f55 100644 (file)
@@ -69,7 +69,7 @@ int check_char(char base)
 
 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
 {      
-  printf("Entering fill_matrix...\n");
+  // printf("Entering fill_matrix...\n");
 
   /*********************************************************************************************/
   // variables
@@ -79,7 +79,7 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
   double prevValue ;
   double tempValue;
   int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
-  printf("max_len: %d\n", max_len) ;
+  //printf("max_len: %d\n", max_len) ;
 
   int num_elem ; // counter variable
 
@@ -416,7 +416,7 @@ void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var:
   delete[] array;
   delete[] scores;
 
-  printf("Leaving fill_matrix...\n");
+  // printf("Leaving fill_matrix...\n");
   return;
 }
 
index 4c33425..a81652d 100644 (file)
@@ -17,12 +17,18 @@ Alignment::Alignment() {
       id = 0;
       name = 0;
       use_svm = 0;
-}
 
-Alignment::~Alignment() {}
+      splice_align = 0;
+      est_align = 0;
+      mmatrix_param = 0;
+      alignmentscores = 0;
+      qualityMatrix = 0;
+}
 
 void Alignment::setQualityMatrix(double* qMat, int length){
-   qualityMatrix = new double[length];
+   if(qualityMatrix == 0)
+      qualityMatrix = new double[length];
+
    for(int i=0; i<length; i++)
       qualityMatrix[i] = qMat[i];
 }
@@ -37,7 +43,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       double* donor, int d_len, double* acceptor, int a_len,
       bool remove_duplicate_scores, bool print_matrix) {
 
-   printf("Entering myalign...\n");
+   // printf("Entering myalign...\n");
 
    mlen = 6; // score matrix: length of 6 for "- A C G T N"
    dna_len =  dna_len_p + 1 ;
@@ -72,31 +78,34 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       matrices[z] = new Pre_score[dna_len * est_len];
   }
   
-  printf("calling fill_matrix...\n");
+  // printf("calling fill_matrix...\n");
 
   fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, &h, matchmatrix, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions);
   
-  printf("after call to fill_matrix...\n");
+  // printf("after call to fill_matrix...\n");
   /***************************************************************************/ 
   // return arguments etc.
   /***************************************************************************/
   int result_length; //Eine Variable fuer alle Matrizen
 
-  printf("before init of splice_align and rest...\n");
+  // printf("before init of splice_align and rest...\n");
 
   splice_align = new int[(dna_len-1)*nr_paths];
   est_align = new int[(est_len-1)*nr_paths];
   mmatrix_param = new int[(mlen*mlen)*nr_paths];
   alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
 
-  printf("before memset...\n");
+  // printf("before memset...\n");
 
   memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
   memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
   memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
   memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
   
-  printf("after memset...\n");
+  // printf("after memset...\n");
+    // dnaest
+   double *DNA_ARRAY = 0;
+   double *EST_ARRAY = 0;
 
   for (int z=0; z<nr_paths; z++) {      
     result_length = 0 ;
@@ -107,10 +116,14 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     
     bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, s_align, e_align, mparam, alignmentscores, max_score_positions);
       
-    // dnaest
-    double *DNA = new double[result_length];
-    double *EST = new double[result_length];
-    
+      if(DNA_ARRAY != 0) {
+       delete[] DNA_ARRAY;
+       delete[] EST_ARRAY;
+    }
+
+   DNA_ARRAY = new double[result_length];
+   EST_ARRAY = new double[result_length];
+
     //backtracking
     int i = max_score_positions[2*z] ; //i (est)
     int j = max_score_positions[2*z +1] ; //j (dna)
@@ -120,21 +133,21 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     
     for (int ii=result_length; ii>0; ii--) {
       if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
-       DNA[ii-1] = check_char(dna[j-1]) ;
-       EST[ii-1] = check_char(est[i-1]) ;
+       DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+       EST_ARRAY[ii-1] = check_char(est[i-1]) ;
       }
-      else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST
-       DNA[ii-1] = check_char(dna[j-1]) ;
-       EST[ii-1] = 0 ; //gap
+      else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
+       DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+       EST_ARRAY[ii-1] = 0 ; //gap
       }
-      else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA
-       DNA[ii-1] = 0 ; //gap
-       EST[ii-1] = check_char(est[i-1]) ;
+      else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
+       DNA_ARRAY[ii-1] = 0 ; //gap
+       EST_ARRAY[ii-1] = check_char(est[i-1]) ;
       }
       else {// splice site
        for (j; j > prev_j; j--) {
-         DNA[ii-1] = check_char(dna[j-1]) ;
-         EST[ii-1] = 6 ; //intron
+         DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+         EST_ARRAY[ii-1] = 6 ; //intron
          ii-- ;
        }
        ii++ ; // last ii-- too much (because done in for-loop) 
@@ -149,13 +162,21 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
     
   } //end of z
 
-  printf("Leaving myalign...\n");
+   if(DNA_ARRAY != 0) {
+    delete[] DNA_ARRAY;
+    delete[] EST_ARRAY;
+    }
+
+   for (int i=nr_paths-1;i>=0; i--)
+      delete[] matrices[i];
+
+   //printf("Leaving myalign...\n");
 }
 
 void Alignment::getAlignmentResults(int* s_align, int* e_align,
       int* mmatrix_p, double* alignscores) {
 
-   printf("Entering getAlignmentResults...\n");
+   // printf("Entering getAlignmentResults...\n");
    uint splice_align_size = (dna_len-1)*nr_paths;
    uint est_align_size = (est_len-1)*nr_paths;
    uint mmatrix_param_size = (mlen*mlen)*nr_paths;
@@ -173,6 +194,6 @@ void Alignment::getAlignmentResults(int* s_align, int* e_align,
    for(int idx=0; idx<alignmentscores_size; idx++)
       alignscores[idx] = alignmentscores[idx];
    
-   printf("Leaving getAlignmentResults...\n");
+   // printf("Leaving getAlignmentResults...\n");
 }
 
index c00788d..134a22e 100644 (file)
@@ -63,7 +63,6 @@ class Alignment {
       int* est_align;
       int* mmatrix_param;
       double* alignmentscores;
-
       double* qualityMatrix;
 
       int dna_len;
@@ -84,7 +83,22 @@ class Alignment {
 
    public:
       Alignment();
-      ~Alignment();
+      ~Alignment() {
+         if(splice_align != 0)
+            delete[] splice_align;
+
+         if(est_align != 0)
+            delete[] est_align;
+
+         if(mmatrix_param != 0)
+            delete[] mmatrix_param;
+
+         if(alignmentscores != 0)
+            delete[] alignmentscores;
+
+         if(qualityMatrix != 0)
+            delete[] qualityMatrix;
+      }
 
 void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       int est_len_p, struct penalty_struct h, double* matchmatrix, int mm_len,
index 038a007..2d5b28b 100644 (file)
@@ -6,8 +6,8 @@ class Param:
    def __init__(self):
       """ default parameters """
 
-      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
-      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
+      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma2'
+      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
       self.MAX_MEM = 31000;
       self.LOCAL_ALIGN = 1;
       self.init_param = 1;
@@ -16,7 +16,7 @@ class Param:
       self.C = 0.001;
       self.microexon = 0;
       self.prob = 0;
-      self.organism = 'zebrafish'
+      self.organism = 'elegans'
       self.expt = 'training'
          
       self.insertion_prob  = self.prob/3 ;
index 4b5af6a..271bde9 100644 (file)
@@ -20,6 +20,8 @@ import QPalmaDP
 from SIQP_CPX import SIQPSolver
 
 from paths_load_data import *
+from paths_load_data_pickle import *
+
 from computeSpliceWeights import *
 from set_param_palma import *
 from computeSpliceAlign import *
@@ -78,10 +80,8 @@ class QPalma:
       
    def run(self):
       # Load the whole dataset 
-      Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
-
-      #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
-
+      #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
+      Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
 
       # number of training instances
       N = len(Sequences) 
@@ -90,7 +90,7 @@ class QPalma:
       and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
       self.plog('Number of training examples: %d\n'% N)
 
-      iteration_steps = 10 ; #upper bound on iteration steps
+      iteration_steps = 200 ; #upper bound on iteration steps
 
       remove_duplicate_scores = False
       print_matrix = False
@@ -103,7 +103,6 @@ class QPalma:
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
-
       # delete splicesite-score-information
       if not self.ARGS.train_with_splicesitescoreinformation:
          for i in range(len(Acceptors)):
@@ -135,7 +134,7 @@ class QPalma:
             break
 
          for exampleIdx in range(self.numExamples):
-            if exampleIdx% 1000 == 0:
+            if (exampleIdx%10) == 0:
                print 'Current example nr %d' % exampleIdx
 
             dna = Sequences[exampleIdx] 
@@ -203,8 +202,8 @@ class QPalma:
             #print 'PYTHON: Calling myalign...'
             # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
             currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
-            est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
-            acceptor, a_len, remove_duplicate_scores, print_matrix)
+             est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
+             acceptor, a_len, remove_duplicate_scores, print_matrix)
             #print 'PYTHON: After myalign call...'
 
             newSpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
@@ -213,6 +212,7 @@ class QPalma:
             newAlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
 
             currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
+            del currentAlignment
 
             spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
             weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
@@ -306,17 +306,28 @@ class QPalma:
                      for elem in self.slacks:
                         sum_xis +=  elem
 
-                  for i in range(len(param)):
-                     param[i] = w[i]
+                     for i in range(len(param)):
+                        param[i] = w[i]
+
+                     [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
-                  [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-                  
-            if exampleIdx==10:
+               #
+               # end of one example processing 
+               #
+            if exampleIdx == 10:
                break
+         
+         break
 
+         #
+         # end of one iteration through all examples
+         #
          iteration_nr += 1
 
-      export_param('test_params',h,d,a,mmatrix)
+      #
+      # end of optimization 
+      #  
+      export_param('elegans.param',h,d,a,mmatrix)
       self.logfh.close()
       print 'Training completed'
 
index e4befad..25746ce 100644 (file)
@@ -47,6 +47,8 @@ class Plf: #means piecewise linear function
 
    def convert2SWIG(self):
       ps = QPalmaDP.penalty_struct()
+
+      ps.len = len(self.limits)
    
       ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
       ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
diff --git a/standalone/palma/__init__.py b/standalone/palma/__init__.py
new file mode 100644 (file)
index 0000000..23ab6dd
--- /dev/null
@@ -0,0 +1,307 @@
+""" --- palma ---
+       Some useful functions and classes used by palma.py
+"""
+
+__version__ = '0.3.3'
+
+# Copyright notice string
+__copyright__ = """\
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+Written (W) 2006 Uta Schulze
+Copyright (C) 2006 Max-Planck-Society
+"""
+
+
+import sys
+import numpy
+import palma_utils as pu
+import alignment_tool as at
+
+
+class alignment_info: 
+       nr_paths_found = -1
+       score = []
+       match = []
+       mism = []
+       qGapb = []
+       tGapb = []
+       qStart = []
+       qEnd = []
+       tStart = []
+       tEnd = []
+       num_exons = []
+       qExonSizes = []
+       qStarts = []
+       qEnds = []
+       tExonSizes = []
+       tStarts = []
+       tEnds = []
+       exon_length = []
+       identity = []
+       comparison = []
+       qIDX = []
+       tIDX = []
+       dna_letters = []
+       est_letters = []
+       
+def do_alignment(dna, dna_len, est, est_len, reduce_region, model, t_strand,\
+                translation, comparison_char, don_support_values, acc_support_values):
+       import time
+
+       nr_paths = 1 
+       mlen = 6 #length of match matrix, const
+
+       align_info = alignment_info()
+       align_info.score = []
+       align_info.match = []
+       align_info.mism = []
+       align_info.qGapb = []
+       align_info.tGapb = []
+       align_info.qStart = []
+       align_info.qEnd = []
+       align_info.tStart = []
+       align_info.tEnd = []
+       align_info.num_exons = []
+       align_info.qExonSizes = []
+       align_info.qStarts = []
+       align_info.qEnds = []
+       align_info.tExonSizes = []
+       align_info.tStarts = []
+       align_info.tEnds = []
+       align_info.exon_length = []
+       align_info.identity = []
+       align_info.comparison = []
+       align_info.qIDX = []
+       align_info.tIDX = []
+       align_info.dna_letters = []
+       align_info.est_letters = []
+       align_info.splice_align = [] 
+
+       t0 = time.time() 
+       #<------------------------------------------------------------------------------>
+       # assigns parameter [h(30),d(30),a(30),mmatrix(36)]
+       # computes scores for donor/acceptor sites using p.l. function
+       #<------------------------------------------------------------------------------>
+       h,d,a,mmatrix = pu.set_param(model)
+       donor   = don_support_values
+       acceptor = acc_support_values
+       for cnt in range(dna_len):
+               if numpy.isfinite(don_support_values[cnt]):
+                       donor[cnt] = pu.penalty_lookup(d,don_support_values[cnt])
+               if numpy.isfinite(acc_support_values[cnt]):
+                       acceptor[cnt] = pu.penalty_lookup(a,acc_support_values[cnt])
+       del don_support_values
+       del acc_support_values
+       #<------------------------------------------------------------------------------>
+
+       #print time.time() - t0
+       
+       #<------------------------------------------------------------------------------>
+       # convert list to double-Array (py -> c++), 
+       #<------------------------------------------------------------------------------>
+       mmatrix_array  = at.createDoubleArrayFromList(mmatrix)
+       donor_array     = at.createDoubleArrayFromList(donor)
+       acceptor_array = at.createDoubleArrayFromList(acceptor)
+       h_struct = at.penalty()
+       h_struct.len = h.len 
+       h_struct.limits = at.createDoubleArrayFromList(h.limits)
+       h_struct.penalties = at.createDoubleArrayFromList(h.penalties)
+       h_struct.name = h.name 
+       h_struct.max_len = h.max_len 
+       h_struct.min_len = h.min_len 
+       h_struct.transform = h.transform 
+       #<------------------------------------------------------------------------------>
+
+       #print time.time() - t0
+
+       #<------------------------------------------------------------------------------>
+       # call myalign
+       #<------------------------------------------------------------------------------>
+       BLA = at.align_info() ; #not necessary
+       try:
+               BLA = at.myalign(reduce_region, mlen, dna_len, dna, est_len, est, h_struct,\
+                                                mmatrix_array, donor_array, acceptor_array)
+       except MemoryError:
+               raise(MemoryError)
+
+       #print time.time() - t0
+
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# myalign Output - convert arrays to list (c++ -> py)
+#<------------------------------------------------------------------------------>
+       splice_align_array  = at.createListFromIntArray(BLA.splice_align, dna_len*nr_paths)
+       est_align_array  = at.createListFromIntArray(BLA.est_align, est_len*nr_paths)
+       mmatrix_param_array = at.createListFromIntArray(BLA.mmatrix_param, mlen*mlen*nr_paths)
+       alignment_scores        = at.createListFromDoubleArray(BLA.alignment_scores,nr_paths)
+       alignment_length        = at.createListFromIntArray(BLA.alignment_length, nr_paths)
+       #print "before crash on intel mac"      
+       aligned_dna_array   = at.createListFromIntArray(BLA.aligned_dna, alignment_length[0])
+       aligned_est_array   = at.createListFromIntArray(BLA.aligned_est, alignment_length[0])
+       #print "after crash on intel mac"       
+#<------------------------------------------------------------------------------>
+
+       #print alignment_scores
+
+#<------------------------------------------------------------------------------>
+# free memory
+#<------------------------------------------------------------------------------>
+       at.delete_intArray(BLA.splice_align)
+       at.delete_intArray(BLA.est_align)
+       at.delete_intArray(BLA.mmatrix_param)
+       at.delete_intArray(BLA.aligned_dna)
+       at.delete_intArray(BLA.aligned_est)
+       at.delete_doubleArray(BLA.alignment_scores)
+       at.delete_intArray(BLA.alignment_length)
+    
+       del BLA
+       del h_struct
+       at.delete_doubleArray(mmatrix_array)
+       at.delete_doubleArray(donor_array)
+       at.delete_doubleArray(acceptor_array)
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# myalign output: creating lists of information
+# i.th entry in list is alignment for i.th path (nr_path many)
+#<------------------------------------------------------------------------------>
+       mmatrix_param = []
+       splice_align  = []
+       est_align        = []
+       dna_numbers   = []
+       est_numbers   = []
+       dna_letters   = []
+       est_letters   = []
+
+       for i in range(nr_paths):
+               result_length = alignment_length[i] #length of i.th alignment
+               if result_length == 0:
+                       #no alignment found
+                       nr_paths = i-1+1 #+1 because index starts with zero
+                       alignment_scores = alignment_scores[0:nr_paths]
+                       alignment_length = alignment_length[0:nr_paths]
+                       assert(len(mmatrix_param)==nr_paths)
+                       assert(len(splice_align)==nr_paths)
+                       assert(len(est_align)==nr_paths)
+                       assert(len(dna_numbers)==nr_paths)
+                       assert(len(est_numbers)==nr_paths)
+                       assert(len(dna_letters)==nr_paths)
+                       assert(len(est_letters)==nr_paths)
+                       break
+
+               #match matrix for the i.th path (numbers of matches, mismatches and gaps):
+               mmatrix_param.append(mmatrix_param_array[i*mlen*mlen:(i+1)*mlen*mlen])
+       
+               #splice_align and est_align for the i.th path (show dangling ends and exons/introns):
+               splice_align.append(splice_align_array[i*dna_len:(i+1)*dna_len])
+               est_align.append(est_align_array[i*est_len:(i+1)*est_len])              
+       
+       dna_numbers.append(aligned_dna_array[0:result_length]) #alignment in numbers (see translation)
+       est_numbers.append(aligned_est_array[0:result_length]) #alignment in numbers (see translation)
+       aligned_dna_array = aligned_dna_array[result_length:]  #shorten aligned_dna_array
+       aligned_est_array = aligned_est_array[result_length:]  #shorten aligned_est_array
+       
+       dna_letters.append(reduce(lambda x,y: x+y, map(lambda x: translation[int(x)],dna_numbers[i])))
+       est_letters.append(reduce(lambda x,y: x+y, map(lambda x: translation[int(x)],est_numbers[i])))
+       
+       if nr_paths==0:
+               #absolutely no alignment found
+               align_info.nr_paths_found = nr_paths
+               return align_info
+
+       assert(len(aligned_dna_array)==0)
+       assert(len(aligned_est_array)==0)
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# analysing alignments (n-best)
+#<------------------------------------------------------------------------------>
+       if 0:
+               print alignment_scores
+               print alignment_length
+               print mmatrix_param
+               print splice_align
+               print est_align
+               print dna_numbers
+               print est_numbers
+               print dna_letters
+               print est_letters 
+
+       align_info.nr_paths_found = nr_paths
+
+       ai = [] ;
+       for i in range(nr_paths):
+               (match, mism, qGapb, tGapb) = pu.evaluate_mmatrix(mmatrix_param[i])
+               if (match +mism +qGapb +tGapb) == 0:
+                       #what to do then ...?, don't think, this happpens
+                       align_info.nr_paths_found -= 1
+                       continue
+
+               (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
+                                pu.get_splice_info(splice_align[i], est_align[i])
+               #print tStarts
+               
+               score = alignment_scores[i]     
+               
+               (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
+                                         pu.comp_identity(dna_letters[i], est_letters[i], t_strand, qStart, tStart, translation, comparison_char)
+
+               first_identity = None
+               last_identity = None 
+               for i in range(len(comparison)):
+                 if comparison[i] == '|' and first_identity is None: first_identity = i 
+                 #print i,comparison[i],first_identity,qIDX[i],tIDX[i]
+                 if comparison[-i] == '|' and last_identity is None: last_identity = len(comparison) - i - 1
+               #print first_identity, last_identity, len(tIDX), len(qIDX)  
+               #print tIDX[first_identity], tIDX[last_identity], qIDX[first_identity], qIDX[last_identity]
+               #print tStart, tEnd, qStart, qEnd
+               #exon_length[0] = exon_length[0] - tIDX[first_identity] - tStart + 2
+               #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
+               tStart = tIDX[first_identity]
+               tStarts[0] = tIDX[first_identity]
+               tEnd = tIDX[last_identity]
+               tEnds[-1] = tIDX[last_identity]
+               qStart = qIDX[first_identity]
+               qStarts[0] = qIDX[first_identity]
+               qEnd = qIDX[last_identity]
+               qEnds[-1] = qIDX[last_identity]
+
+               align_info.score = score 
+               align_info.match = match
+               align_info.mism = mism
+               align_info.qGapb = qGapb
+               align_info.tGapb = tGapb
+               align_info.qStart = qStart
+               align_info.qEnd = qEnd
+               align_info.tStart = tStart
+               align_info.tEnd = tEnd
+               align_info.num_exons = num_exons
+               align_info.qExonSizes = qExonSizes
+               align_info.qStarts = qStarts
+               align_info.qEnds = qEnds
+               align_info.tExonSizes =tExonSizes
+               align_info.tStarts = tStarts
+               align_info.tEnds = tEnds
+               align_info.exon_length = exon_length
+               align_info.identity = identity
+               align_info.ssQuality = ssQuality
+               align_info.exonQuality = exonQuality
+               align_info.comparison = comparison
+               align_info.qIDX = qIDX
+               align_info.tIDX = tIDX 
+               align_info.dna_letters = dna_letters
+               align_info.est_letters = est_letters
+
+               ai.append(align_info)
+
+#<------------------------------------------------------------------------------>
+
+       return ai
diff --git a/standalone/palma/alignment_tool.py b/standalone/palma/alignment_tool.py
new file mode 100644 (file)
index 0000000..ed404ed
--- /dev/null
@@ -0,0 +1,167 @@
+# This file was automatically generated by SWIG (http://www.swig.org).
+# Version 1.3.31
+#
+# Don't modify this file, modify the SWIG interface instead.
+# This file is compatible with both classic and new-style classes.
+
+import _alignment_tool
+import new
+new_instancemethod = new.instancemethod
+try:
+    _swig_property = property
+except NameError:
+    pass # Python < 2.2 doesn't have 'property'.
+def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
+    if (name == "thisown"): return self.this.own(value)
+    if (name == "this"):
+        if type(value).__name__ == 'PySwigObject':
+            self.__dict__[name] = value
+            return
+    method = class_type.__swig_setmethods__.get(name,None)
+    if method: return method(self,value)
+    if (not static) or hasattr(self,name):
+        self.__dict__[name] = value
+    else:
+        raise AttributeError("You cannot add attributes to %s" % self)
+
+def _swig_setattr(self,class_type,name,value):
+    return _swig_setattr_nondynamic(self,class_type,name,value,0)
+
+def _swig_getattr(self,class_type,name):
+    if (name == "thisown"): return self.this.own()
+    method = class_type.__swig_getmethods__.get(name,None)
+    if method: return method(self)
+    raise AttributeError,name
+
+def _swig_repr(self):
+    try: strthis = "proxy of " + self.this.__repr__()
+    except: strthis = ""
+    return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
+
+import types
+try:
+    _object = types.ObjectType
+    _newclass = 1
+except AttributeError:
+    class _object : pass
+    _newclass = 0
+del types
+
+
+new_intArray = _alignment_tool.new_intArray
+delete_intArray = _alignment_tool.delete_intArray
+intArray_getitem = _alignment_tool.intArray_getitem
+intArray_setitem = _alignment_tool.intArray_setitem
+new_doubleArray = _alignment_tool.new_doubleArray
+delete_doubleArray = _alignment_tool.delete_doubleArray
+doubleArray_getitem = _alignment_tool.doubleArray_getitem
+doubleArray_setitem = _alignment_tool.doubleArray_setitem
+class MyAlignException(_object):
+    __swig_setmethods__ = {}
+    __setattr__ = lambda self, name, value: _swig_setattr(self, MyAlignException, name, value)
+    __swig_getmethods__ = {}
+    __getattr__ = lambda self, name: _swig_getattr(self, MyAlignException, name)
+    __repr__ = _swig_repr
+    def __init__(self, *args): 
+        this = _alignment_tool.new_MyAlignException(*args)
+        try: self.this.append(this)
+        except: self.this = this
+    __swig_destroy__ = _alignment_tool.delete_MyAlignException
+    __del__ = lambda self : None;
+MyAlignException_swigregister = _alignment_tool.MyAlignException_swigregister
+MyAlignException_swigregister(MyAlignException)
+
+class penalty(_object):
+    __swig_setmethods__ = {}
+    __setattr__ = lambda self, name, value: _swig_setattr(self, penalty, name, value)
+    __swig_getmethods__ = {}
+    __getattr__ = lambda self, name: _swig_getattr(self, penalty, name)
+    __repr__ = _swig_repr
+    __swig_setmethods__["len"] = _alignment_tool.penalty_len_set
+    __swig_getmethods__["len"] = _alignment_tool.penalty_len_get
+    if _newclass:len = _swig_property(_alignment_tool.penalty_len_get, _alignment_tool.penalty_len_set)
+    __swig_setmethods__["limits"] = _alignment_tool.penalty_limits_set
+    __swig_getmethods__["limits"] = _alignment_tool.penalty_limits_get
+    if _newclass:limits = _swig_property(_alignment_tool.penalty_limits_get, _alignment_tool.penalty_limits_set)
+    __swig_setmethods__["penalties"] = _alignment_tool.penalty_penalties_set
+    __swig_getmethods__["penalties"] = _alignment_tool.penalty_penalties_get
+    if _newclass:penalties = _swig_property(_alignment_tool.penalty_penalties_get, _alignment_tool.penalty_penalties_set)
+    __swig_setmethods__["name"] = _alignment_tool.penalty_name_set
+    __swig_getmethods__["name"] = _alignment_tool.penalty_name_get
+    if _newclass:name = _swig_property(_alignment_tool.penalty_name_get, _alignment_tool.penalty_name_set)
+    __swig_setmethods__["max_len"] = _alignment_tool.penalty_max_len_set
+    __swig_getmethods__["max_len"] = _alignment_tool.penalty_max_len_get
+    if _newclass:max_len = _swig_property(_alignment_tool.penalty_max_len_get, _alignment_tool.penalty_max_len_set)
+    __swig_setmethods__["min_len"] = _alignment_tool.penalty_min_len_set
+    __swig_getmethods__["min_len"] = _alignment_tool.penalty_min_len_get
+    if _newclass:min_len = _swig_property(_alignment_tool.penalty_min_len_get, _alignment_tool.penalty_min_len_set)
+    __swig_setmethods__["transform"] = _alignment_tool.penalty_transform_set
+    __swig_getmethods__["transform"] = _alignment_tool.penalty_transform_get
+    if _newclass:transform = _swig_property(_alignment_tool.penalty_transform_get, _alignment_tool.penalty_transform_set)
+    def __init__(self, *args): 
+        this = _alignment_tool.new_penalty(*args)
+        try: self.this.append(this)
+        except: self.this = this
+    __swig_destroy__ = _alignment_tool.delete_penalty
+    __del__ = lambda self : None;
+penalty_swigregister = _alignment_tool.penalty_swigregister
+penalty_swigregister(penalty)
+
+class align_info(_object):
+    __swig_setmethods__ = {}
+    __setattr__ = lambda self, name, value: _swig_setattr(self, align_info, name, value)
+    __swig_getmethods__ = {}
+    __getattr__ = lambda self, name: _swig_getattr(self, align_info, name)
+    __repr__ = _swig_repr
+    __swig_setmethods__["splice_align"] = _alignment_tool.align_info_splice_align_set
+    __swig_getmethods__["splice_align"] = _alignment_tool.align_info_splice_align_get
+    if _newclass:splice_align = _swig_property(_alignment_tool.align_info_splice_align_get, _alignment_tool.align_info_splice_align_set)
+    __swig_setmethods__["est_align"] = _alignment_tool.align_info_est_align_set
+    __swig_getmethods__["est_align"] = _alignment_tool.align_info_est_align_get
+    if _newclass:est_align = _swig_property(_alignment_tool.align_info_est_align_get, _alignment_tool.align_info_est_align_set)
+    __swig_setmethods__["mmatrix_param"] = _alignment_tool.align_info_mmatrix_param_set
+    __swig_getmethods__["mmatrix_param"] = _alignment_tool.align_info_mmatrix_param_get
+    if _newclass:mmatrix_param = _swig_property(_alignment_tool.align_info_mmatrix_param_get, _alignment_tool.align_info_mmatrix_param_set)
+    __swig_setmethods__["alignment_scores"] = _alignment_tool.align_info_alignment_scores_set
+    __swig_getmethods__["alignment_scores"] = _alignment_tool.align_info_alignment_scores_get
+    if _newclass:alignment_scores = _swig_property(_alignment_tool.align_info_alignment_scores_get, _alignment_tool.align_info_alignment_scores_set)
+    __swig_setmethods__["alignment_length"] = _alignment_tool.align_info_alignment_length_set
+    __swig_getmethods__["alignment_length"] = _alignment_tool.align_info_alignment_length_get
+    if _newclass:alignment_length = _swig_property(_alignment_tool.align_info_alignment_length_get, _alignment_tool.align_info_alignment_length_set)
+    __swig_setmethods__["aligned_dna"] = _alignment_tool.align_info_aligned_dna_set
+    __swig_getmethods__["aligned_dna"] = _alignment_tool.align_info_aligned_dna_get
+    if _newclass:aligned_dna = _swig_property(_alignment_tool.align_info_aligned_dna_get, _alignment_tool.align_info_aligned_dna_set)
+    __swig_setmethods__["aligned_est"] = _alignment_tool.align_info_aligned_est_set
+    __swig_getmethods__["aligned_est"] = _alignment_tool.align_info_aligned_est_get
+    if _newclass:aligned_est = _swig_property(_alignment_tool.align_info_aligned_est_get, _alignment_tool.align_info_aligned_est_set)
+    def __init__(self, *args): 
+        this = _alignment_tool.new_align_info(*args)
+        try: self.this.append(this)
+        except: self.this = this
+    __swig_destroy__ = _alignment_tool.delete_align_info
+    __del__ = lambda self : None;
+align_info_swigregister = _alignment_tool.align_info_swigregister
+align_info_swigregister(align_info)
+
+myalign = _alignment_tool.myalign
+def createDoubleArrayFromList(list):
+    array = new_doubleArray(len(list))
+    for i in range(len(list)):
+       doubleArray_setitem(array, i, list[i])
+    return array
+
+def createListFromIntArray(array, array_len):
+    list = [0]*array_len
+    for i in range(array_len):
+       list[i] = intArray_getitem(array,i)
+    return list
+
+def createListFromDoubleArray(array, array_len):
+    list = [0]*array_len
+    for i in range(array_len):
+       list[i] = doubleArray_getitem(array,i)
+    return list
+
+
+
+
diff --git a/standalone/palma/data/elegans_WS150_ss=1_il=1.param.bz2 b/standalone/palma/data/elegans_WS150_ss=1_il=1.param.bz2
new file mode 100644 (file)
index 0000000..7cee26e
Binary files /dev/null and b/standalone/palma/data/elegans_WS150_ss=1_il=1.param.bz2 differ
diff --git a/standalone/palma/genomic.py b/standalone/palma/genomic.py
new file mode 100644 (file)
index 0000000..a15cf55
--- /dev/null
@@ -0,0 +1,168 @@
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2006-2007 Mikio Braun
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+# 
+
+import time
+from string import maketrans
+
+""" this function is 100% compatible with the matlab function; thus it is one-based (!)
+       use one_based=False if needed; the interval is then [start,stop) (excluding stop)
+"""
+def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
+       fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome[3:] + '.flat'
+       f=file(fname)
+       if one_based:
+               f.seek(start-1)
+               str=f.read(stop-start+1)
+       else:
+               f.seek(start)
+               str=f.read(stop-start)
+
+       if strand=='-':
+               return reverse_complement(str)
+       elif strand=='+':
+               return str
+       else:
+               print 'strand must be + or -'
+               raise KeyError
+
+""" read a table browser ascii output file (http://genome.ucsc.edu/cgi-bin/hgTables) """
+def read_table_browser(f):
+       table=dict();
+       for l in f.readlines():
+               if not l.startswith('#'):
+                       (name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,alignID)=l.split('\t')
+                       exonStarts=[ int(i) for i in exonStarts.split(',')[:-1] ]
+                       exonEnds=[ int(i) for i in exonEnds.split(',')[:-1] ]
+
+                       table[name]={ 'chrom': chrom, 'strand': strand, 'txStart': int(txStart), 'txEnd': int(txEnd), 
+                       'cdsStart': int(cdsStart), 'cdsEnd': int(cdsEnd), 'exonCount': int(exonCount), 'exonStarts': exonStarts,
+                       'exonEnds': exonEnds, 'proteinID': proteinID, 'alignID': alignID[:-1] }
+
+       return table
+
+""" get promoter region """
+def get_promoter_region(chromosome, strand, gene_start, gene_end, genome, length):
+
+       if strand == '+':
+               return load_genomic(chromosome, strand, gene_start, gene_start+length, genome, one_based=False)
+       elif strand == '-':
+               return load_genomic(chromosome, strand, gene_end, gene_end+length, genome, one_based=False)
+       else:
+               print 'unknown strand'
+               return None
+
+""" reverse-complement a DNA sequence (only the letters ACGT/acgt are translated!)
+       FIXME: IUPAC ambiguity codes (e.g. Y, N) are not handled """
+def reverse_complement(str):
+       t=maketrans('acgtACGT','tgcaTGCA')
+       return str[len(str)::-1].translate(t)
+
+""" works only with .fa files that contain a single entry """
+def read_single_fasta(fname):
+       str=file(fname).read()
+       str=str[str.index('\n')+1:].replace('\n','')
+       return str
+
+""" writes only single-entry .fa files """
+def write_single_fasta(fname, name, str, linelen=60):
+       header= '>' + name + '\n'
+       f=file(fname,'a')
+       f.write(header)
+       for i in xrange(0,len(str),linelen):
+               f.write(str[i:i+linelen]+'\n')
+       f.close()
+
+""" read fasta as dictionary """
+def read_fasta(f, fasta=dict(), id_only=True):
+       import numpy
+       str=f.read()
+       idx = str.find('>')
+       #if idx==-1:
+       if 0: # no support for contig list files -> would need extra blastn workaround
+               # file name list?
+               files = str.split('\n') ;
+               for s in files:
+                       if len(s)==0: continue
+                       print s.strip() + '\n'
+                       fasta = read_fasta(file(s.strip()), fasta) 
+       else:
+               # real fasta file?
+               sequences = str.split('>') 
+               for s in sequences:
+                       if len(s)==0: continue
+                       header = s[0:s.index('\n')]
+                       if id_only:
+                               header = header.split(' ')[0] ;
+                               header = header.split('\t')[0] ;
+                       seq = s[s.index('\n')+1:].replace('\n','').upper()
+                       #print 'has_key', fasta.has_key(header),header
+                       assert(not fasta.has_key(header))
+                       fasta[header]=seq ;
+
+       return fasta
+
+""" write dictionary fasta """
+def write_fasta(f, d, linelen=60):
+    for k in sorted(d):
+        f.write('>%s\n' % k);
+        s = d[k]
+        for i in xrange(0, len(s), linelen):
+            f.write(s[i:i+linelen] + '\n')
+
+def write_gff(f, (source, version), (seqtype, seqname), descrlist, skipheader=False):
+       """ writes a gff version 2 file
+               descrlist is a list of dictionaries, each of which contain these fields:
+               <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+       """
+
+       if not skipheader:
+               f.write('##gff-version 2\n')
+               f.write('##source-version %s %s\n' % (source, version) )
+
+               t=time.localtime()
+               f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6])
+
+       f.write('##Type %s %s\n' % (seqtype, seqname) )
+
+       for d in descrlist:
+               f.write('%s\t%s\t%s\t%d\t%d\t%f\t%s\t%d' % (d['seqname'], d['source'], 
+                                                                                       d['feature'], d['start'], d['end'], 
+                                                                                       d['score'], d['strand'], d['frame']))
+               if d.has_key('attributes'):
+                       f.write('\t' + d['attributes'])
+                       if d.has_key('comments'):
+                               f.write('\t' + d['comments'])
+               f.write('\n')
+
+
+if __name__ == '__main__':
+       import sys,os
+
+       table=read_table_browser(file('/fml/ag-raetsch/home/sonne/addnet/tfbs/share/data/wt1_bibliosphere_table_browser_hg17.txt'))
+       print table.keys()
+       print table[table.keys()[0]]
+       d = { 'ahoernchen' : 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT',
+                 'bhoernchen' : 'GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA' }
+
+       write_fasta(sys.stdout, d)
+       write_fasta(file('/tmp/test.fa','w'), d)
+
+       d2 = read_fasta(file('/tmp/test.fa'))
+       os.unlink('/tmp/test.fa')
+
+       print d
+       print d2
+       print d == d2
+
+       p=load_genomic('chr5', '+', 100000, 100100,'hg17')
+       n=load_genomic('chr1', '-', 3000000, 3001000,'mm7')
+       write_single_fasta('bla.fa','bla', 'ACGT')
+       n2=read_single_fasta('bla.fa')
diff --git a/standalone/palma/model.py b/standalone/palma/model.py
new file mode 100644 (file)
index 0000000..2801526
--- /dev/null
@@ -0,0 +1,238 @@
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2007 Gunnar Raetsch
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+# 
+
+import sys
+import numpy
+from numpy import mat,array,chararray,inf
+
+class model(object):
+    #acceptor
+    acceptor_bins=None
+    acceptor_limits=None
+    acceptor_penalties=None
+
+    acceptor_splice_b=None
+    acceptor_splice_order=None
+    acceptor_splice_window_left=None
+    acceptor_splice_window_right=None
+    acceptor_splice_alphas=None
+    acceptor_splice_svs=None
+
+    #donor
+    donor_bins=None
+    donor_limits=None
+    donor_penalties=None
+    
+    donor_splice_b=None
+    donor_splice_order=None
+    #donor_splice_use_gc=None
+    donor_splice_window_left=None
+    donor_splice_window_right=None
+    donor_splice_alphas=None
+    donor_splice_svs=None
+
+    # intron length
+    intron_len_bins=None
+    intron_len_limits=None
+    intron_len_penalties=None
+    intron_len_min=None
+    intron_len_max=None
+    intron_len_transform=None
+
+    gene_len_max = None
+    
+    # substitution matrix
+    substitution_matrix=None 
+
+def parse_file(file):
+    m=model()
+
+    sys.stdout.write("loading model file"); sys.stdout.flush()
+
+    l=file.readline();
+
+    if l != '%palma definition file version: 1.0\n':
+        sys.stderr.write("\nfile not a palma definition file\n")
+        print l
+        return None
+
+    while l:
+        if not ( l.startswith('%') or l.startswith('\n') ): # comment
+            #print l[0:100]
+            
+            #acceptor
+            if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
+            if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, file, 'acceptor_limits')
+            if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, file, 'acceptor_penalties') #DEAD
+
+            if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
+            if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
+            if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
+            if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
+            if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, file, 'acceptor_splice_alphas')
+            if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, file, 'acceptor_splice_svs')
+
+            #donor
+            if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
+            if m.donor_limits is None: m.donor_limits=parse_vector(l, file, 'donor_limits')
+            if m.donor_penalties is None: m.donor_penalties=parse_vector(l, file, 'donor_penalties') #DEAD
+
+            if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
+            if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
+            #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
+            if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
+            if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
+            if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, file, 'donor_splice_alphas')
+            if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, file, 'donor_splice_svs')
+
+
+            # intron length
+            if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
+            if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, file, 'intron_len_limits')
+            if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, file, 'intron_len_penalties')
+            if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
+            if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
+            if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
+
+            # gene length
+            if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
+
+            if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, file, 'substitution_matrix')
+
+        l=file.readline()
+
+    sys.stderr.write('done\n')
+    return m
+
+def parse_value(line, name):
+    if (line.startswith(name)):
+        #print 'found ' + name
+        sys.stdout.write('.'); sys.stdout.flush()
+        str = line[line.find('=')+1:-1] ;
+        if str[0] == "'":
+            return str[1:-1]
+        else:
+            #print float(str)
+            return float(str)
+    else:
+        return None
+
+def parse_vector(line, file, name):
+    mat = parse_matrix(line, file, name)
+    if mat is None:
+     return mat
+    else:
+     mat = numpy.array(mat).flatten()
+     return mat 
+
+def parse_matrix(line, file, name):
+    if (line.startswith(name)):
+        sys.stdout.write('.'); sys.stdout.flush()
+        if line.find(']') < 0:
+            l=''
+            while l is not None and l.find(']') < 0:
+                line+=l
+                l=file.readline()
+            if l is not None and l.find(']') >= 0:
+                line+=l
+
+        if line.find(']') < 0:
+            sys.stderr.write("matrix `" + name + "' ended without ']'\n")
+            return None
+        else:
+            return mat(line[line.find('['):line.find(']')+1])
+    else:
+        return None
+
+def parse_string(line, file, name):
+    if (line.startswith(name)):
+        sys.stdout.write('.'); sys.stdout.flush()
+        l=''
+        lines=[]
+        while l is not None and l.find(']') < 0:
+            if l:
+                lines+=[list(l[:-1])]
+            l=file.readline()
+
+        if l.find(']') < 0:
+            sys.stderr.write("string ended without ']'\n")
+            return None
+        else:
+            #seqlen=len(lines[0])
+            num=len(lines)
+            #trdat = chararray((seqlen,num),1,order='FORTRAN')
+            #for i in xrange(num):
+            #    trdat[:,i]=lines[i]
+            trdat = num*[[]]
+            for idx,example in enumerate(lines):
+                trdat[idx] = ''.join(example)
+            return trdat
+    else:
+        return None
+
+if __name__ == '__main__':
+    import bz2
+    import sys
+    #import hotshot, hotshot.stats
+
+    def load():
+        #f=bz2.BZ2File('../../splicing/msplicer/standalone/python/data/msplicer_elegansWS120_gc=1_orf=0.dat.bz2');
+        filename='test.dat.bz2' 
+        if filename.endswith('.bz2'):
+            f=bz2.BZ2File(filename);
+        else:
+            f=file(filename);
+            
+        m=parse_file(f);
+
+        print m.acceptor_bins is None
+        print m.acceptor_limits is None
+        print m.acceptor_penalties is None
+
+        print m.acceptor_splice_b is None
+        print m.acceptor_splice_order is None
+        print m.acceptor_splice_window_left is None
+        print m.acceptor_splice_window_right is None
+        print m.acceptor_splice_alphas is None
+        print m.acceptor_splice_svs is None
+
+        print m.donor_bins is None
+        print m.donor_limits is None
+        print m.donor_penalties is None
+
+        print m.donor_splice_b is None
+        print m.donor_splice_order is None
+        #print m.donor_splice_use_gc is None
+        print m.donor_splice_window_left is None
+        print m.donor_splice_window_right is None
+        print m.donor_splice_alphas is None
+        print m.donor_splice_svs is None
+        print m.intron_len_bins is None
+        print m.intron_len_limits is None
+        print m.intron_len_penalties is None
+        print m.intron_len_min is None
+        print m.intron_len_max is None
+        print m.intron_len_transform is None
+
+        print m.substitution_matrix is None
+
+        print m.intron_len_transform
+        print m.substitution_matrix
+
+    load()
+
+    #prof = hotshot.Profile("model.prof")
+    #benchtime = prof.runcall(load)
+    #prof.close()
+    #stats = hotshot.stats.load("model.prof")
+    #stats.strip_dirs()
+    #stats.sort_stats('time', 'calls')
+    #stats.print_stats(20)
diff --git a/standalone/palma/output_formating.py b/standalone/palma/output_formating.py
new file mode 100644 (file)
index 0000000..d137aa0
--- /dev/null
@@ -0,0 +1,309 @@
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# Written (W) 2006-2007 Uta Schulze, Gunnar Raetsch
+# Copyright (C) 2006-2007 Max-Planck-Society
+# 
+
+import pdb
+
+def print_line(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart, qEnd,\
+                          tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                          tExonSizes, tStarts, tEnds, exon_length, identity, out_file):
+    """Produces a tab separated line in the out_file which describes the alignment.
+    
+    1. align. score: palma alignment score
+    2. match:        number of matches in the local alignment
+    3. mismatch:     number of mismatches in the local alignment 
+    4. Q gap bases:  number of gaps on the target sequence (insert in query acc. to BLAT) 
+    5. T gap bases:  number of gaps on the query sequence (insert in target acc. to BLAT) 
+                    (introns, which are basically gaps on the query, are not included!) 
+    6. strand:       '+' or '-' for query strand (second strand for target if provided)
+                    if you align two "short" sequences rather than searching a whole
+                    genome, the strand is "+" by default (and by logic).
+    7. Q name:       first word in header of the entry in the query-file.
+    8. Q size:       length of the sequence of the entry in the query-file
+    9. Q start:      alignment start position on query sequence
+   10. Q end:        alignment end position on query sequence
+   11. T name:       first word in header of the entry in the target-file.
+   12. T size:       length of the sequence of the entry in the target-file
+   13. T start:      alignment start position on target sequence
+   14. T end:        alignment end position on target sequence
+   15. exon count:   number of exons in the alignment. Different to BLAT, where each block is
+                    defined by the surrounding gaps, exons are defined by adjacent introns.
+                    Exons can contain gaps. The length of the without gaps on both target
+                    and query will most likely be different.
+   16. Q bound:      Exact beginning and end of each exon on the query, without showing the gaps.
+                    E.g. "ex1_start-ex1_end,ex2_start-ex2_end,..,"
+                    Beginning of first exon most likely equals Q start, except the alignment
+                    begins with an intron. Analog with Q end for the end of the last exon.
+   17. T bound:      Exact beginning and end of each exon on the target, without showing the gaps.
+                    Beginning of first exon most likely equals T start, except the alignment
+                    begins with an intron. Analog with T end for the end of the last exon.
+   18. exon lengths: These are actually the length of the exons in the alignment, where gaps are
+                    included.
+   19. identity:     Number of matches divided by length of exon on alignment times 100.
+                    If the exon is perfect, and there are no gaps or mismatches, it's 100%.
+    """
+
+    qExonSizes_string = ''
+    for i in qExonSizes:
+        qExonSizes_string = qExonSizes_string + ('%d,' %i)
+        tExonSizes_string = ''
+    for i in tExonSizes:
+        tExonSizes_string = tExonSizes_string + ('%d,' %i)
+        qBounds_string = ''
+    for (i,j) in zip(qStarts, qEnds):
+        qBounds_string = qBounds_string + ('%d-%d,' %(i,j))
+        tBounds_string = ''
+    for (i,j) in zip(tStarts, tEnds):
+        tBounds_string = tBounds_string + ('%d-%d,' %(i,j))
+        exon_length_string = ''        
+    for i in exon_length:
+        exon_length_string = exon_length_string + ('%d,' %i)
+        identity_string = ''
+    for i in identity:
+        identity_string = identity_string + ('%1.1f,' %(i*100))
+
+    if tStrand == '+':
+        out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
+                                   %(score, match, mism, qGapb, tGapb, strand, qName, qSize, qStart,\
+                                         qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
+                                         tBounds_string, exon_length_string, identity_string))
+    if tStrand == '-':
+        out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
+               %(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart,\
+                                         qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
+                                         tBounds_string, exon_length_string, identity_string))
+    return
+#<------------------------------------------------------------------------------>
+
+#<------------------------------------------------------------------------------>
+# prints a line in the output_file which describes the alignment, in blat psl format
+#<------------------------------------------------------------------------------>
+def print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+                          tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                          tExonSizes, tStarts, tEnds, exon_length, identity, ssQuality, out_file):
+    """
+    matches int unsigned ,           # Number of bases that match (in BLAT this excludes repeats, not here)
+    misMatches int unsigned ,        # Number of bases that don't match
+    repMatches int unsigned ,        # Number of bases that match but are part of repeats (always zero here)
+    nCount int unsigned ,            # Number of 'N' bases (always zero here)
+    qNumInsert int unsigned ,        # Number of inserts in query (length of tStarts)
+    qBaseInsert int unsigned ,       # Number of bases inserted in query
+    tNumInsert int unsigned ,        # Number of inserts in target (length of qStarts)
+    tBaseInsert int unsigned ,       # Number of bases inserted in target
+    strand char(2) ,                 # + or - for query strand, optionally followed by + or - for target strand
+    qName varchar(255) ,             # Query sequence name
+    qSize int unsigned ,             # Query sequence size
+    qStart int unsigned ,            # Alignment start position in query
+    qEnd int unsigned ,              # Alignment end position in query
+    tName varchar(255) ,             # Target sequence name
+    tSize int unsigned ,             # Target sequence size
+    tStart int unsigned ,            # Alignment start position in target
+    tEnd int unsigned ,              # Alignment end position in target
+    blockCount int unsigned ,        # Number of blocks in alignment. A block contains no gaps.
+    blockSizes longblob ,            # Size of each block in a comma separated list
+    qStarts longblob ,               # Start of each block in query in a comma separated list
+    tStarts longblob ,               # Start of each block in target in a comma separated list
+    """
+
+    if tStrand == '-':
+        exon_length.reverse()
+        qEnds.reverse()
+        qStarts = map(lambda x: qSize - x + 1, qEnds)
+        tStarts.reverse()
+        tEnds.reverse()
+    #if strand == '-':
+    #    qStarts.reverse()
+    #    qEnds.reverse()
+    if True:
+        for i in range(len(qStarts)):
+            if i>0: qStarts[i]=qStarts[i-1]+exon_length[i-1]
+
+    repMatches = 0
+    nCount = 0
+    qStarts_string = ''
+    for i in qStarts:
+        assert(i>0)
+        qStarts_string = qStarts_string + ('%d,' %(i-1))
+    tStarts_string = ''
+    #if len(tStarts)>0:
+    #    tStarts_string = tStarts_string + ('%d,' %(tStarts[0]))
+    for i in tStarts:
+        assert(i>0)
+        tStarts_string = tStarts_string + ('%d,' %(i-1))
+    exon_length_string = ''    
+    for i in exon_length:
+        exon_length_string = exon_length_string + ('%d,' %i)
+    ssQuality_string = ''      
+    for i in ssQuality:
+        ssQuality_string = ssQuality_string + ('%1.1f,' %(i*100))
+    identity_string = '' 
+    for i in identity:
+        identity_string = identity_string + ('%1.1f,' %(i*100))
+
+    out_file.write('%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t'\
+                   %(match, mism, repMatches, nCount, len(tStarts), qGapb, len(qStarts), tGapb))
+    #out_file.write('%c%c\t' %(tStrand,qStrand))
+    out_file.write('%c\t' %(tStrand))
+        
+    out_file.write('%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n'\
+                   %(qName, qSize, qStart-1, qEnd, tName, tSize, tStart-1, tEnd, \
+                   num_exons, exon_length_string, qStarts_string, tStarts_string, identity_string, ssQuality_string))
+        
+    return
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# prints the query/target alignment side by side in blocks of `width` columns
+# len(DNA) == len(EST)
+#<------------------------------------------------------------------------------>
+def print_align(DNA, EST, comparison, qIDX, tIDX, out_file, width=60):
+    import numpy
+    idx = []
+    idx_old = []
+    for i in range(1,len(DNA),width): #e.g. 1, 61, 121 ...
+        if i == 1:
+            idx = [i,numpy.minimum(i+width-1,len(DNA))]
+            out_file.write("Q: %9i  %s %9i\n" %(qIDX[idx[0]-1], EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
+            out_file.write("              %s     \n" %(comparison[idx[0]-1:idx[1]]))
+            out_file.write("T: %9i  %s %9i\n\n\n" %(tIDX[idx[0]-1], DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
+        else:
+            idx_old = idx
+            idx = [i,numpy.minimum(i+width-1,len(DNA))]
+            out_file.write("Q: %9i  %s %9i\n" %(min(qIDX[idx_old[1]-1]+1,qIDX[idx[1]-1]), EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
+            out_file.write("              %s     \n" %(comparison[idx[0]-1:idx[1]]))
+            out_file.write("T: %9i  %s %9i\n\n\n" %(min(tIDX[idx_old[1]-1]+1,tIDX[idx[1]-1]), DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
+    return
+
+
+def print_header(out_file):
+    """prints some lines in output_file that describe what each column means
+
+    Can be turned off via --noHead.
+    """
+    out_file.write("align.\tmatch\tmis-\tQ gap\tT gap\tstrand\tQ\tQ\tQ\tQ\tT\tT\tT\tT\texon\tQ\tT\texon\tidentity\n")
+    out_file.write("score\t\tmatch\tbases\tbases\t\tname\tsize\tstart\tend\tname\tsize\tstart\tend\tcount\tbound\tbound\tlengths\n")
+    out_file.write("--------------------------------------------------------------------------------------------------------------------------------------------------------\n")
+    return
+
+def print_blat_header(out_file):
+    """prints blat header lines in output_file that describe what each column means
+
+    Copied from blat output. Can be turned off via --noHead.
+    """
+    out_file.write('psLayout version 3\n\n')
+    out_file.write("match\tmis-\trep.\tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ\t\tQ\tQ\tQ\tT\t\tT\tT\tT\tblock\tblockSizes\tqStarts\t tStarts\n")
+    out_file.write('\tmatch\tmatch\t\tcount\tbases\tcount\tbases\t\tname\t\tsize\tstart\tend\tname\t\tsize\tstart\tend\tcount\n')
+    out_file.write('---------------------------------------------------------------------------------------------------------------------------------------------------------------\n')
+
+def print_bed_header(bed_file):
+    """prints some lines in bed_file that describe what each column means
+
+    Can be turned off via --noHead.
+    """
+    bed_file.write("chrom\tchrom\tchrom\tname\tscore\tstrand\tblock\tblock\tblock\n")
+    bed_file.write("\tStart\tEnd\t\t\t\tCount\tSizes\tStart\n")
+    bed_file.write("---------------------------------------------------------------------\n")
+    return
+
+
+
+#<------------------------------------------------------------------------------>
+# prints a line in the output bed_file which describes the alignment
+#<------------------------------------------------------------------------------>
+def print_bed_line(tName, tStart, tEnd, qName, score, tStrand,\
+                  num_exons, tStarts, tEnds, bed_file):
+
+    tStarts_string = ''
+    tExonSizes_string = ''
+    for (i,j) in zip(tStarts, tEnds):
+        tStarts_string = tStarts_string + ('%d,' %(i))
+       tExonSizes_string = tExonSizes_string + ('%d,' %(j-i+1))    
+
+    bed_file.write("%s\t%d\t%d\t%s\t%4.3f\t%s\t%d\t%s\t%s\n"\
+               %(tName, tStart, tEnd, qName, score, tStrand,\
+                 num_exons, tExonSizes_string, tStarts_string))
+    return
+#<------------------------------------------------------------------------------>
+
+
+def print_results(align_info, out_file, bed_file, tName, tSize, qName, qSize, qStrand, tStrand, parameters):
+    """printing results in output_file"""
+
+    score = align_info.score
+    match = align_info.match
+    mism = align_info.mism
+    qGapb = align_info.qGapb
+    tGapb = align_info.tGapb
+
+    tStart = align_info.tStart
+    tEnd = align_info.tEnd
+    tStarts = align_info.tStarts
+    tEnds = align_info.tEnds
+    tIDX = align_info.tIDX
+    
+    qStart = align_info.qStart
+    qEnd = align_info.qEnd
+    qStarts = align_info.qStarts
+    qEnds = align_info.qEnds
+    qIDX = align_info.qIDX
+    #if tStrand=='+':
+    #    tStart = align_info.tStart
+    #    tEnd = align_info.tEnd
+    #    tStarts = align_info.tStarts
+    #    tEnds = align_info.tEnds
+    #    tIDX = align_info.tIDX
+    #else:
+    #    tEnd = tSize - align_info.tStart + 1
+    #    tStart = tSize - align_info.tEnd + 1
+    #    tStarts = map(lambda x: tSize - x + 1, align_info.tStarts)
+    #    tEnds = map(lambda x: tSize - x + 1, align_info.tEnds)
+    #    tIDX = map(lambda x: tSize - x + 1, align_info.tIDX)
+
+    #if qStrand=='+':
+    #    qStart = align_info.qStart
+    #    qEnd = align_info.qEnd
+    #    qStarts = align_info.qStarts
+    #    qEnds = align_info.qEnds
+    #    qIDX = align_info.qIDX
+    #else:
+    #    qEnd = qSize - align_info.qStart + 1
+    #    qStart = qSize - align_info.qEnd + 1
+    #    qStarts = map(lambda x: qSize - x + 1, align_info.qStarts)
+    #    qEnds = map(lambda x: qSize - x + 1, align_info.qEnds)
+    #    qIDX = map(lambda x: qSize - x + 1, align_info.qIDX)
+
+    num_exons = align_info.num_exons
+    qExonSizes = align_info.qExonSizes
+    tExonSizes = align_info.tExonSizes
+    exon_length = align_info.exon_length
+    identity = align_info.identity
+    ssQuality = align_info.ssQuality 
+    exonQuality = align_info.exonQuality 
+    comparison = align_info.comparison
+    dna_letters = align_info.dna_letters
+    est_letters = align_info.est_letters
+
+    if parameters['print_psl2']:
+        print_line(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+                   tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                   tExonSizes, tStarts, tEnds, exon_length, identity, ssQuality, out_file)
+    else:
+        print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+                   tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                   tExonSizes, tStarts, tEnds, exon_length, exonQuality, ssQuality, out_file)
+
+    if parameters['print_sequences']:
+        out_file.write("\n")
+        print_align(dna_letters[0], est_letters[0], comparison, qIDX, tIDX, out_file)
+
+    if print_header and bed_file is not None:
+        print_bed_header(bed_file)
+        print_bed_line(tName, tStart, tEnd, qName, score, tStrand, num_exons, tStarts, tEnds, bed_file)
+
diff --git a/standalone/palma/palma_utils.py b/standalone/palma/palma_utils.py
new file mode 100644 (file)
index 0000000..62a2d70
--- /dev/null
@@ -0,0 +1,581 @@
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# Written (W) 2006-2007 Uta Schulze, Gunnar Raetsch
+# Copyright (C) 2006-2007 Max-Planck-Society
+# 
+
+def set_param(model):
+    import numpy
+
+    class plf: #means piecewise linear function
+       len = 0
+       limits = []
+       penalties = []
+       transform = ''
+       name = ''
+       max_len = 0
+       min_len = 0
+
+    h = plf()
+    h.len = int(model.intron_len_bins) ;
+    h.limits = model.intron_len_limits ;
+    h.penalties = model.intron_len_penalties ;
+    h.name = 'h'
+    h.max_len   = int(model.intron_len_max)
+    h.min_len   = int(model.intron_len_min)
+    h.transform = model.intron_len_transform
+
+    d = plf()
+    d.len = int(model.donor_bins)
+    d.limits = model.donor_limits
+    d.penalties = model.donor_penalties 
+    d.name = 'd'
+    d.max_len   = 100
+    d.min_len   = -100
+
+    a = plf()
+    a.len = int(model.acceptor_bins)
+    a.limits = model.acceptor_limits 
+    a.penalties = model.acceptor_penalties 
+    a.name = 'a'
+    a.max_len   = 100
+    a.min_len   = -100
+
+    mmatrix = model.substitution_matrix 
+    return h,d,a,mmatrix
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# gives back y-value acc. to "support"-value (piecewise linear function)
+#<------------------------------------------------------------------------------>
+def penalty_lookup(plf, supp_value):
+       import numpy
+       limits    = plf.limits
+       penalties = plf.penalties
+       transform = plf.transform
+
+       if transform == '':
+               supp_value = supp_value
+       elif transform == 'log':
+               print 'log'
+               supp_value = numpy.log(supp_value)
+       elif transform == 'log(+3)':
+               print 'log(+3)'
+               supp_value = numpy.log(supp_value+3)
+       elif transform == 'log(+1)':
+               print 'log(+1)'
+               supp_value = numpy.log(supp_value+1)
+       elif transform == '(+3)':
+               print '(+3)'
+               supp_value = supp_value+3
+       elif transform == 'mod3':
+               print 'mod3'
+               supp_value = numpy.mod(supp_value,3)
+       else:
+               raise NameError, 'unknown penalty transform'
+
+       #print 'supp_value = %i\n' %supp_value
+       
+       if supp_value <= limits[0]:
+               score = penalties[0]
+               #print 'supp_value <= limits[0]'
+       elif supp_value >= limits[plf.len-1]:
+               score = penalties[plf.len-1]
+               #print 'supp_value >= limits[len(limits)-1]'
+       else:
+               for cnt in range(plf.len):
+                       if supp_value <= limits[cnt]:
+                               upper_idx = cnt
+                               break
+               lower_idx = upper_idx-1
+               #print '%1.3f <= %1.3f <= %1.3f\n' %(limits[lower_idx], supp_value, limits[upper_idx])
+               score = ((supp_value-limits[lower_idx])*penalties[upper_idx] + (limits[upper_idx]-supp_value)*penalties[lower_idx])/(limits[upper_idx]-limits[lower_idx])     
+
+       return score
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# counts matches, mismatches, gaps on DNA (means bases inserted in query, qGapb)
+# and gaps on EST (means bases inserted in transcript, tGapb)
+#
+#   matchmatrix (columnwise)
+#     - A C G T N (dna)
+#   - x x x x x x
+#   A x x x x x x      
+#   C x x x x x x
+#   G x x x x x x
+#   T x x x x x x
+#   N x x x x x x
+# (est)
+#<------------------------------------------------------------------------------>
+def evaluate_mmatrix(mmatrix):
+    import numpy
+    #7: [a,a], 14:[c,c], 21:[g,g], 28:[t,t], 35:[n,n]
+    match = mmatrix[7] + mmatrix[14] + mmatrix[21] + mmatrix[28] + mmatrix[35]
+    qGapb = numpy.sum(mmatrix[1:6]) #gaps on DNA
+    tGapb = mmatrix[6] + mmatrix[12] + mmatrix[18] + mmatrix[24] + mmatrix[30] #gaps on EST
+    mism  = numpy.sum(mmatrix) - (match + qGapb + tGapb)
+
+    if 0:
+       print 'matches: %d' %match
+       print 'mismatches: %d' %mism
+       print 'qGapb (gaps on DNA): %d' %qGapb
+       print 'tGapb (gaps on EST): %d' %tGapb
+    
+    return match, mism, qGapb, tGapb
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# qStart, qEnd, tStart, tEnd, blocks, qStarts, tStarts
+#<------------------------------------------------------------------------------>
+def get_splice_info(splice_align, est_align):
+    import sys
+    #problem: because of the dangling ends, alignment can start with an intron
+    
+    #print splice_align
+    #print est_align   
+
+    dna_len = len(splice_align)
+    est_len = len(est_align)
+    qStart = 1 #(default)
+    qEnd   = est_len #(default)
+    tStart = 1 #(default)
+    tEnd   = dna_len #(default)
+    qStarts = []
+    qEnds   = []
+    qExonSizes = []
+    tStarts = []
+    tEnds   = []
+    tExonSizes = []
+
+    #<---------------------------------------------------------->
+    #query: dangling start
+    if est_align[0]==4:
+       for i in range(est_len):
+           if est_align[i]!=4:
+               qStart = i+1 #+1 bc index starts with zero
+               break
+
+    #query: dangling end
+    if est_align[est_len-1]==4:
+       list_idx = range(est_len)
+       list_idx.reverse()
+       for i in list_idx:
+           if est_align[i]!=4:
+               qEnd = i+1 #last "4", +1 bc index starts with zero
+               break
+    #qStarts: list of exon starting points on EST
+    #qEnds:   list of exon ending points on EST
+    #(2 in est_align means: first nucleotide of new exon)
+   
+    index_count = 0
+    exon_label = est_align[qStart-1]
+    qStarts.append(qStart)
+    for i in range(qStart-1,qEnd):
+       if (est_align[i]==1) and (exon_label==2): #new exon labeled "1"
+           exon_label = 1
+           qStarts.append(i+1) #+1 bc index starts with zero, start of this exon saved
+           qEnds.append(i-1+1) #end of last exon labeled "2" saved
+           qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of  last exon
+           index_count+=1
+       elif (est_align[i]==2) and (exon_label==1): #new exon labeled "2"
+           exon_label = 2
+           qStarts.append(i+1) #+1 bc index starts with zero, start of this exon saved
+           qEnds.append(i-1+1) #end of last exon labeled "2" saved
+           qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of  last exon
+           index_count+=1
+       if (i == qEnd-1): #end of last exon
+           qEnds.append(i+1) #end of last exon saved
+           qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of  last exon          
+                  
+    assert(len(qStarts)==len(qEnds))
+    
+    if 0:
+       print 'qStart: %d' %qStart
+       print 'qEnd:   %d' %qEnd
+       print qStarts
+       print qEnds
+       print qExonSizes
+    #<---------------------------------------------------------->
+    
+    #<---------------------------------------------------------->          
+    #transcript: dangling start
+    if splice_align[0] == 4:
+       for i in range(dna_len):
+           if splice_align[i]!=4:
+               tStart = i+1 #+1 bc index starts with zero
+               break
+    #if splice_align[tStart-1]==1: #alignment starts with intron
+       #print 'alignment starts with intron'
+
+    #transcript: dangling end
+    if splice_align[dna_len-1]==4:
+       list_idx = range(dna_len)
+       list_idx.reverse()
+       for i in list_idx:
+           if splice_align[i]!=4:
+               tEnd = i+1 #+1 bc index starts with zero
+               break   
+    #if splice_align[tEnd-1]==2: #alignment ends with intron
+       #print 'alignment ends with intron'
+
+    #tStarts: list of exon starting points on DNA
+    #tEnds:   list of exon ending points on DNA
+   
+    index_count = 0
+    act_state = 3 #3 means intron, 0 means exon
+    for i in range(tStart-1,tEnd):
+       if (splice_align[i]==0) and (act_state==3): #first base of exon
+           tStarts.append(i+1) #+1 bc index starts with zero
+           act_state=0 #exon
+       elif (splice_align[i]==1) and (act_state==0): #exon ended one base ago
+           tEnds.append(i-1+1) #+1 bc index starts with zero
+           act_state=3 #intron
+           tExonSizes.append(tEnds[index_count] - tStarts[index_count] +1) #length
+           index_count+=1
+       if (i==tEnd-1) and (act_state==0): #end of alignment reached and still in exon
+           tEnds.append(i+1) #+1 bc index starts with zero
+           tExonSizes.append(tEnds[index_count] - tStarts[index_count] +1) #length
+
+    #<---------------------------------------------------------->
+
+           
+
+    if (len(tStarts)!=len(tEnds)):
+       print len(tStarts)
+       print len(tEnds)
+       print 'tStart: %d' %tStart
+       print 'tEnd:   %d' %tEnd 
+       print tStarts
+       print tEnds
+       sys.exit(2)
+
+    #print 'exons on query: %d, exons on target: %d' %(len(qStarts),len(tStarts))
+    num_exons = len(tStarts)
+    #print num_exons
+    #print len(qStarts)
+
+    if 0:
+       print 'tStart: %d' %tStart
+       print 'tEnd:   %d' %tEnd 
+       print tStarts
+       print tEnds
+       print tExonSizes
+
+   # assert(num_exons==len(qStarts))
+
+    return qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds
+#<------------------------------------------------------------------------------>
+
+""" read fasta as dictionary """
+def read_fasta(f):
+       import sys
+       fasta=dict()
+       for s in f.readlines():
+               if s.startswith('>'):
+                       key=s[1:-1]
+                       print key
+                       if fasta.has_key(key):
+                               sys.stderr.write("ERROR: duplicated sequence name '%s'\n" %(key))
+                               sys.exit(2)
+                       fasta[key]=""
+               else:
+                       fasta[key]+=s[:-1]
+
+       return fasta
+
+#<------------------------------------------------------------------------------>
+#
+# DNA and EST with gaps and est with intron
+# len(dna) == len(est)
+# identity: list of length exon_numbers: #matches / length(exon_with_gaps)
+#<------------------------------------------------------------------------------>
+def comp_identity(DNA, EST, strand, qStart, tStart, translation, comparison_char):
+    def set_idx(i,qNum,tNum):
+       qIDX[i] = qNum
+       tIDX[i] = tNum
+       return qIDX, tIDX
+    
+    gap_char = translation[0] #e.g. "-"
+    in_char = translation[6] #e.g. "*", intron char
+    equ_char = comparison_char[0] #match char e.g. "|"
+    comp_in_char = comparison_char[1] #intron char when comparing 
+    comparison = [' '] *len(EST)
+    qIDX = [-1] *len(EST)
+    tIDX = [-1] *len(DNA)
+    exon_idx = 0
+    matches = []
+    gaps = []
+    exon_length = []
+    identity = []
+    exonQuality = [] # matches - 2*gaps
+    ssQuality = []
+    state = 3 #intron:3, exon:0
+    EST_started = 0
+    
+    for i in range(len(DNA)):
+      if (i>0 and i<len(DNA)-1):
+       if (EST[i] == in_char): #intron
+        (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+        comparison[i] = comp_in_char
+        if (state==0): #exon finished
+            identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+            exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+            exon_idx+=1 #next exon starts sooner or later
+            state = 3 #now intron
+            last_ssQuality = 0.0 ;
+            last_ssLen = 0 ;
+            for k in range(5):
+                if i-k-1<0: break 
+                last_ssLen+=1
+                if DNA[i-k-1]==EST[i-k-1]: last_ssQuality+=1.0 
+                else:
+                    if (DNA[i-k-1]=='N' and EST[i-k-1]!=gap_char) or (DNA[i-k-1]!=gap_char and EST[i-k-1]=='N'): last_ssQuality+=0.5
+                #print k,DNA[i-k-1],EST[i-k-1],last_ssQuality
+       else: #exon
+        if (state==0): #still in same exon
+            if EST_started and DNA[i]!=gap_char: # only count DNA length
+                exon_length[exon_idx] += 1 #one bp added
+            if EST[i]!=gap_char: EST_started=1 ;
+            #print i,EST[i],EST_started,exon_length[exon_idx]
+            if (DNA[i]==EST[i]):
+                matches[exon_idx]+=1 #one match added
+            else:
+                if (DNA[i]=='N' and EST[i]!=gap_char) or (EST[i]=='N' and DNA[i]!=gap_char): matches[exon_idx]+=0.5
+        else: #new exon
+            for k in range(5):
+                if i+k>=len(DNA) or i+k>len(EST): break 
+                last_ssLen+=1 ;
+                if DNA[i+k]==EST[i+k]: last_ssQuality+=1.0 ;
+                else:
+                    if (DNA[i+k]=='N' and EST[i+k]!=gap_char) or (DNA[i+k]!=gap_char and EST[i+k]=='N'): last_ssQuality+=0.5
+                #print k,DNA[i+k],EST[i+k],last_ssQuality
+            ssQuality.append(last_ssQuality/last_ssLen) 
+            if DNA[i]!=gap_char: exon_length.append(1)
+            else: exon_length.append(0)
+            state = 0 #now exon
+            if (DNA[i]==EST[i]):
+             gaps.append(0)
+             matches.append(1)
+            else:
+             if (DNA[i]=='N' and EST[i]!=gap_char) or (EST[i]=='N' and DNA[i]!=gap_char): matches.append(0.5); gaps.append(0)
+             else: matches.append(0); gaps.append(1) 
+
+        if (DNA[i]==EST[i]) and (DNA[i]!='-'): #match
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+            comparison[i] = equ_char    
+        elif (DNA[i]==EST[i]) and (DNA[i]=='-'): #strange match
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1])
+        elif (EST[i]==gap_char): #gap on est
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+        elif (DNA[i]==gap_char): #gap on dna
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+        else: #mismatch
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+
+      elif (i==0):
+       if (EST[i] == in_char): #alignment starts with intron
+        (qIDX, tIDX) = set_idx(i,0,1)
+        comparison[i] = comp_in_char
+       else: #alignment starts with exon
+        state = 0 #go into exon_state
+        if DNA[i]!=gap_char: exon_length.append(1)
+        else: exon_length.append(0)
+        if (DNA[i]==EST[i]) and (DNA[i]!='-'): #match
+            #(qIDX, tIDX) = set_idx(i,1,1)
+            EST_started = 1 
+            (qIDX, tIDX) = set_idx(i,qStart,tStart)
+            matches.append(1)
+            gaps.append(0)
+            comparison[i] = equ_char
+        elif (DNA[i]==EST[i]) and (DNA[i]=='-'): #strange match
+            (qIDX, tIDX) = set_idx(i,qStart,tStart)
+            matches.append(0)
+            gaps.append(1)
+        elif (EST[i]==gap_char): #gap on est
+            #(qIDX, tIDX) = set_idx(i,0,1)
+            (qIDX, tIDX) = set_idx(i,qStart,tStart)
+            matches.append(0)
+            gaps.append(1)
+        elif (DNA[i]==gap_char): #gap on dna
+            #(qIDX, tIDX) = set_idx(i,1,0)
+            (qIDX, tIDX) = set_idx(i,qStart,tStart)
+            matches.append(0)
+            gaps.append(1)
+        else: #mismatch
+            #(qIDX, tIDX) = set_idx(i,1,1)
+            (qIDX, tIDX) = set_idx(i,qStart,tStart)
+            gaps.append(0)
+            if (DNA[i]=='N' or EST[i]=='N'): matches.append(0.5)
+            else: matches.append(0.5)
+      elif (i==len(DNA)-1):
+       if (EST[i] == in_char): #alignment ends with intron, with exons everything ok
+        assert(state==3) #ok, because intron longer than 4
+        comparison[i] = comp_in_char
+        (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+       else: #alignment ends with exon, exon finished
+        if (state==0): #finish exon
+           if DNA[i]!=gap_char: exon_length[exon_idx] += 1 #one bp added
+           else: exon_length[exon_idx] += 1 #one bp added
+           if (DNA[i]==EST[i]) and (DNA[i]!=gap_char): #match
+            matches[exon_idx]+=1 #one match added
+            comparison[i] = equ_char
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+           elif (DNA[i]==EST[i]) and (DNA[i]==gap_char): #strange match
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1])
+           elif (EST[i]==gap_char): #gap on est
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+           elif (DNA[i]==gap_char): #gap on dna
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+           else: #mismatch
+            if (DNA[i]=='N' or EST[i]=='N'): matches[exon_idx]+=0.5
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+           identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+           exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+        else: #last exon has length 1
+           if DNA[i]!=gap_char: exon_length.append(1)
+           else: exon_length.append(1)
+           if (DNA[i]==EST[i]): #match
+            matches.append(1) #one match added
+            gaps.append(0) 
+            comparison[i] = equ_char
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+           elif (EST[i]==gap_char): #gap on est
+            gaps.append(1) 
+            matches.append(0)
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+           elif (DNA[i]==gap_char): #gap on dna
+            gaps.append(1) 
+            matches.append(0)
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+           else: #mismatch
+            gaps.append(0) 
+            if (DNA[i]=='N' or EST[i]=='N'): matches.append(0.5)
+            else: matches.append(0)
+            (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+           identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+           exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+      else:
+        print 'this "else" should not happen'
+
+    comparison = reduce(lambda x,y: x+y, comparison)
+    #print qIDX
+    #print tIDX
+    #print exon_length
+    #print ssQuality, exonQuality, identity
+
+    return exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# 
+#<------------------------------------------------------------------------------>
+def reverse_complement(DNA):
+    import string
+    import sys
+    dict = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
+    
+    DNA = DNA.upper()
+    reverse_DNA = ''
+    cnt_list = range(len(DNA))
+    cnt_list.reverse()
+    for i in cnt_list:
+      if not dict.has_key(DNA[i]):
+        print 'unknown letter', DNA[i], '-> translating to N'
+        reverse_DNA = reverse_DNA + 'N' ;
+      else:
+        reverse_DNA = reverse_DNA + dict[DNA[i]]
+    return reverse_DNA
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+#
+#<------------------------------------------------------------------------------>
+def parse_wublast(output, e_value_bound):
+    import numpy
+    import sys
+    
+    strand_matrix='- +'
+
+    class wublast_info:
+        score = []
+        q_name = []
+        t_name = []
+        q_strand = []
+        t_strand = []
+        q_start = []
+        q_end = []
+        t_start = []
+        t_end = []
+
+    est_entry = 0
+    still_in_group = 0
+    
+    est_wbi = wublast_info()
+    for line in output:
+        fields = line.strip().split('\t')
+        #print line, still_in_group
+        if float(fields[2]) <= e_value_bound:
+            if len(est_wbi.q_name)>0:
+                flag = (fields[0] == est_wbi.q_name[len(est_wbi.t_name)-1]) and (fields[1] == est_wbi.t_name[len(est_wbi.t_name)-1]) and (int(fields[3]) == group_size)
+            else:
+                flag = True 
+
+            if (still_in_group == 0) or (not flag):
+                if (not still_in_group == 0):
+                    sys.stderr.write('blastn output format problem near line:\n'+line)
+
+                # start a new group
+                est_wbi.score.append(float(fields[4]))
+                est_wbi.q_name.append(fields[0])
+                est_wbi.t_name.append(fields[1])
+                est_wbi.q_strand.append(strand_matrix[int(fields[16])+1])
+                est_wbi.t_strand.append(strand_matrix[int(fields[19])+1])
+                if strand_matrix[int(fields[16])+1] == '+':
+                    est_wbi.q_start.append(int(fields[17]))
+                    est_wbi.q_end.append(int(fields[18]))
+                else:
+                    est_wbi.q_start.append(int(fields[18]))
+                    est_wbi.q_end.append(int(fields[17]))
+                est_wbi.t_start.append(int(fields[20]))
+                est_wbi.t_end.append(int(fields[21]))
+                still_in_group = int(fields[3])-1
+                #print 'still_in_group', still_in_group
+                group_size = int(fields[3])
+            else:
+                # extend a group
+                assert(fields[0] == est_wbi.q_name[len(est_wbi.t_name)-1]) 
+                assert(fields[1] == est_wbi.t_name[len(est_wbi.t_name)-1]) 
+                assert(int(fields[3]) == group_size) 
+                est_wbi.score[len(est_wbi.score)-1] += float(fields[4])
+                if strand_matrix[int(fields[16])+1] == '+':
+                    est_wbi.q_start[len(est_wbi.q_start)-1] = numpy.minimum(est_wbi.q_start[len(est_wbi.q_start)-1],int(fields[17]))
+                    est_wbi.q_end[len(est_wbi.q_end)-1] =numpy.maximum(est_wbi.q_end[len(est_wbi.q_end)-1],int(fields[18]))
+                else:
+                    est_wbi.q_start[len(est_wbi.q_start)-1] =numpy.minimum(est_wbi.q_start[len(est_wbi.q_start)-1],int(fields[18]))
+                    est_wbi.q_end[len(est_wbi.q_end)-1] =numpy.maximum(est_wbi.q_end[len(est_wbi.q_end)-1],int(fields[17]))
+                est_wbi.t_start[len(est_wbi.t_start)-1] =numpy.minimum(est_wbi.t_start[len(est_wbi.t_start)-1],int(fields[20]))
+                est_wbi.t_end[len(est_wbi.t_end)-1] =numpy.maximum(est_wbi.t_end[len(est_wbi.t_end)-1],int(fields[21]))
+                still_in_group -= 1
+
+    if 0:
+        print est_wbi.q_name
+        print est_wbi.t_name
+        print est_wbi.q_strand
+        print est_wbi.t_strand
+        print est_wbi.q_start
+        print est_wbi.q_end
+        print est_wbi.t_start
+        print est_wbi.t_end
+
+    return est_wbi
+#<------------------------------------------------------------------------------>
diff --git a/standalone/palma/seqdict.py b/standalone/palma/seqdict.py
new file mode 100644 (file)
index 0000000..ee9087a
--- /dev/null
@@ -0,0 +1,68 @@
+import string
+
+class predictions(object):
+    def __init__(self, positions=None, scores=None):
+        self.positions=positions
+        self.scores=scores
+
+    def set_positions(self, positions):
+        self.positions=positions;
+    def get_positions(self):
+        return self.positions
+
+    def set_scores(self, scores):
+        self.scores=scores
+    def get_scores(self):
+        return self.scores
+
+    def __str__(self):
+        return 'positions: ' + `self.positions` + 'scores: ' + `self.scores`
+    def __repr__(self):
+        return self.__str__()
+
+class sequence(object):
+    def __init__(self, name, seq, (start,end)):
+        assert(start<end<=len(seq))
+        self.start=start
+        self.end=end
+        self.name=name
+        self.seq=seq
+        self.preds=dict()
+        self.preds['acceptor']=predictions()
+        self.preds['donor']=predictions()
+
+    def __str__(self):
+        s="start:" + `self.start` 
+        s+=" end:" + `self.end`
+        s+=" name:" + `self.name`
+        s+=" sequence:" + `self.seq[0:10]`
+        s+="... preds:" + `self.preds`
+        return s
+    def __repr__(self):
+        return self.__str__()
+
+def seqdict(dic):
+    """ takes a fasta dict as input and
+    generates a list of sequence objects from it """
+    sequences=list()
+
+    #translate string to ACGT / all non ACGT letters are mapped to A
+    tab=''
+    for i in xrange(256):
+        if chr(i).upper() in 'ACGT':
+            tab+=chr(i).upper()
+        else:
+            tab+='A'
+
+    for seqname in dic:
+        seq=string.translate(dic[seqname], tab) 
+        seq=seq.upper()
+        #if end<0:
+        #    stop=len(seq)+end
+        #else:
+        #    stop=end
+
+        sequences.append(sequence(seqname, seq, (0,len(seq))))
+        #sequences.append(seq)
+        
+    return sequences
diff --git a/standalone/palma/signal_detectors.py b/standalone/palma/signal_detectors.py
new file mode 100644 (file)
index 0000000..cfa0eb1
--- /dev/null
@@ -0,0 +1,216 @@
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2007 Gunnar Raetsch
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+# 
+
+import sys
+import numpy
+import seqdict
+import model
+
+from shogun.Classifier import SVM
+from shogun.Features import StringCharFeatures,DNA
+# from shogun.Kernel import WeightedDegreeCharKernel
+# In the svn version of shogun, this has been renamed.
+from shogun.Kernel import WeightedDegreeStringKernel
+
+class svm_splice_model(object):
+    def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
+
+        f=StringCharFeatures(DNA)
+        f.set_string_features(traindat)
+                # In the svn version of shogun, this has been renamed.
+        wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
+        # wd_kernel = WeightedDegreeCharKernel(f,f, int(order))
+        wd_kernel.io.set_target_to_stderr()
+
+        self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
+        self.svm.io.set_target_to_stderr()
+        self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
+        #self.svm.set_linadd_enabled(False)
+        #self.svm.set_batch_computation_enabled(true)
+
+        self.window_left=int(window_left)
+        self.window_right=int(window_right)
+
+        self.consensus=consensus
+        self.wd_kernel=wd_kernel
+        self.traindat=f
+        self.offset = offset ;
+
+    def get_positions(self, sequence):
+        """DEPRECATED: Please use get_positions_from_seqdict"""
+        print "DEPRECATED: Please use get_positions_from_seqdict"
+        positions=list()
+
+        for cons in self.consensus:
+            l=sequence.find(cons)
+            while l>-1:
+                #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+                positions.append(l+self.offset)
+                l=sequence.find(cons, l+1)
+
+        positions.sort()
+        return positions
+
+    def get_predictions(self, sequence, positions):
+        """DEPRECATED: Please use get_predictions_from_seqdict"""
+        print "DEPRECATED: Please use get_predictions_from_seqdict"
+        seqlen=self.window_right+self.window_left+2
+        num=len(positions)
+
+        #testdat = numpy.chararray((seqlen,num),1,order='FORTRAN')
+        testdat = num*[[]]
+        
+        for j in xrange(num):
+            i=positions[j] - self.offset ;
+            start = i-self.window_left
+            if start<0:
+                s_start='A'*(-start)
+                start =  0;
+            else:
+                s_start = ''
+            stop = i+self.window_right+2
+            if stop>len(sequence):
+                s_stop = 'A'*(stop-len(sequence))
+                stop=len(sequence) - 1 ;
+            else:
+                s_stop = '' ;
+            s= s_start + sequence[start:stop] + s_stop
+            #testdat[:,j]=list(s)  # leftover from the chararray version; raises TypeError on a list
+            testdat[j]=s
+
+        t=StringCharFeatures(DNA)
+        t.set_string_features(testdat)
+
+        self.wd_kernel.init(self.traindat, t)
+        l=self.svm.classify().get_labels()
+        sys.stderr.write("\n...done...\n")
+        return l
+
+
+    def get_predictions_from_seqdict(self, seqdic, site):
+        """ we need to generate a huge test features object 
+            containing all locations found in each seqdict-sequence
+            and each location (this is necessary to efficiently
+            (==fast,low memory) compute the splice outputs
+        """
+
+        #seqlen=self.window_right+self.window_left+2
+
+        num=0
+        for s in seqdic:
+            num+= len(s.preds[site].positions)
+
+        #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
+        testdat = num*[[]]
+
+        k=0
+        si = 0 ;
+        for s in seqdic:
+            sequence=s.seq
+            positions=s.preds[site].positions
+            si += 1 
+            for j in xrange(len(positions)):
+                if len(positions)>50000 and j%10000==0:
+                    sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
+                i=positions[j] - self.offset
+                start = i-self.window_left
+                if start<0:
+                    s_start='A'*(-start)
+                    start =  0;
+                else:
+                    s_start = ''
+                stop = i+self.window_right+2
+                if stop>len(sequence):
+                    s_stop = 'A'*(stop-len(sequence) +1)
+                    stop=len(sequence) - 1 ;
+                else:
+                    s_stop = '' ;
+                s= s_start + sequence[start:stop] + s_stop
+                #print len(s)
+                #s=sequence[i-self.window_left:i+self.window_right+2]
+                #testdat[:,k]=list(s)
+                testdat[k]=s
+                k+=1
+                
+        if len(positions)>50000:
+            sys.stderr.write('\n')
+
+        t=StringCharFeatures(DNA)
+        t.set_string_features(testdat)
+
+        self.wd_kernel.init(self.traindat, t)
+        l=self.svm.classify().get_labels()
+        sys.stderr.write("\n...done...\n")
+
+        k=0
+        for s in seqdic:
+            num=len(s.preds[site].positions)
+            scores= num * [0]
+            for j in xrange(num):
+                scores[j]=l[k]
+                k+=1
+            s.preds[site].set_scores(scores)
+
+    def get_positions_from_seqdict(self, seqdic, site):
+        for d in seqdic:
+            positions=list()
+            sequence=d.seq
+            for cons in self.consensus:
+                l=sequence.find(cons)
+                while l>-1:
+                    #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+                    positions.append(l+self.offset)
+                    l=sequence.find(cons, l+1)
+            positions.sort()
+            d.preds[site].set_positions(positions)
+
+
+class signal_detectors(object):
+    def __init__(self, model, donor_splice_use_gc):
+        if donor_splice_use_gc:
+            donor_consensus=['GC','GT']
+        else:
+            donor_consensus=['GT']
+        
+        self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
+                numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b, 
+                (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
+        self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs, 
+                numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b, 
+                (model.donor_splice_window_left, 0, model.donor_splice_window_right),
+                donor_consensus)
+
+    def set_sequence(self, seq):
+        self.acceptor.set_sequence(seq)
+        self.donor.set_sequence(seq)
+
+    def predict_acceptor_sites(self, seq):
+        pos=self.acceptor.get_positions(seq)
+        sys.stderr.write("computing svm output for acceptor positions\n")
+        pred=self.acceptor.get_predictions(seq, pos)
+        return (pos,pred)
+
+    def predict_donor_sites(self,seq):
+        pos=self.donor.get_positions(seq)
+        sys.stderr.write("computing svm output for donor positions\n")
+        pred=self.donor.get_predictions(seq, pos)
+        return (pos,pred)
+
+    def predict_acceptor_sites_from_seqdict(self, seqs):
+        self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
+        sys.stderr.write("computing svm output for acceptor positions\n")
+        self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
+
+    def predict_donor_sites_from_seqdict(self, seqs): 
+        self.donor.get_positions_from_seqdict(seqs, 'donor')
+        sys.stderr.write("computing svm output for donor positions\n")
+        self.donor.get_predictions_from_seqdict(seqs, 'donor')
+
diff --git a/standalone/setup.py b/standalone/setup.py
new file mode 100755 (executable)
index 0000000..228219c
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/bin/env python2.4
+
+import sys
+#import os
+import shutil
+from distutils.core import setup, Extension
+
+try:
+    import numpy
+except:
+    sys.stderr.write("WARNING: did not find 'numpy'\n")
+
+try:
+    import shogun
+except:
+    sys.stderr.write("WARNING: did not find 'shogun'\n")
+    sys.stderr.write("         shogun is required for computing splice site scores\n")
+    sys.stderr.write("         See shogun website: http://www.shogun-toolbox.org)\n")
+
+if sys.argv[1] != 'sdist':
+    sys.stderr.write('Using swig to construct interface file alignment_tool.py\n')
+    setup (name = 'palma._alignment_tool',
+           version = '0.3.7',
+           description = '!!!This is a hack to ensure that swig runs first. Real setup is below!!!',
+           author = ['Uta Schulze', 'Bettina Hepp', 'Gunnar Raetsch', \
+                     'Soeren Sonnenburg','Cheng Soon Ong'],
+           author_email = ['Gunnar.Raetsch@tuebingen.mpg.de'],
+           license='GPL2',
+           url = 'http://www.fml.tuebingen.mpg.de/raetsch/projects/palma',
+           ext_modules = [Extension('palma._alignment_tool',
+                                    sources = ['src/bound_matrix.cpp', 'src/myalign_1best.cpp', \
+                                               'src/fill_matrix_1best.cpp', 'src/result_align_1best.cpp',\
+                                               'src/alignment_tool_1best.i'])],
+           options={'build_ext':{'swig_opts':'-c++'}},
+           )
+    shutil.copy2('src/alignment_tool.py','palma/alignment_tool.py')
+    #os.rename('src/alignment_tool.py','palma/alignment_tool.py')
+
+sys.stderr.write('PALMA - mRNA to Genome Alignments using Large Margin Algorithms\n')
+setup (name = 'palma',
+       version = '0.3.7',
+       description = 'PALMA - mRNA to Genome Alignments using Large Margin Algorithms',
+       author = ['Uta Schulze', 'Bettina Hepp', 'Gunnar Raetsch', \
+                 'Soeren Sonnenburg','Cheng Soon Ong'],
+       author_email = ['Gunnar.Raetsch@tuebingen.mpg.de'],
+       license='GPL2',
+       url = 'http://www.fml.tuebingen.mpg.de/raetsch/projects/palma',
+       #py_modules=['palma.palma_utils','palma.alignment_tool','palma.genomic',\
+       py_modules=['palma.palma_utils','palma.genomic',\
+                   'palma.model','palma.output_formating','palma.seqdict','palma.signal_detectors'],
+       ext_modules = [Extension('palma._alignment_tool',
+                                sources = ['src/bound_matrix.cpp', 'src/myalign_1best.cpp', \
+                                           'src/fill_matrix_1best.cpp', 'src/result_align_1best.cpp',\
+                                           'src/alignment_tool_1best.i'])],
+       packages=['palma'],
+       package_data={'palma': ['data/*.param.bz2']},
+       scripts=['scripts/palma-align.py'],
+       options={'build_ext':{'swig_opts':'-c++'}},
+       long_description='''
+       PALMA aligns two genetic sequences 'the best way' according
+       to its underlying algorithm and trained parameters. See the paper on the website for details.''')
+
+# cleanup swig outputs so that each build is clean
+#os.remove('palma/alignment_tool.py')
+#os.remove('src/alignment_tool_1best_wrap.cpp')
diff --git a/tools/evaluatePrediction.py b/tools/evaluatePrediction.py
new file mode 100644 (file)
index 0000000..ee8e9af
--- /dev/null
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import sys
+import cPickle
+
+ground_truth   = cPickle.load(open(sys.argv[1]))
+prediction     = cPickle.load(open(sys.argv[2]))
+
+ESTs     = ground_truth['TestEsts']
+Exons    = ground_truth['TestExon']
+Acc      = ground_truth['TestAcc']
+Don      = ground_truth['TestDon']
+DNA      = ground_truth['Test']
+
+totalWrongPos = [0]*len(prediction)
+
+for pos,alignmentObj in enumerate(prediction):
+   predPos  = zip(alignmentObj.tStarts, alignmentObj.tEnds)
+   wrongPos  = 0
+
+   for predExonStart,predExonStop in predPos:
+
+      for exonListIdx,currentExons in enumerate(Exons):
+         for idx in range(1,len(currentExons)-1):
+
+            intronStart = currentExons[idx][1]
+            intronStop  = currentExons[idx+1][0]
+
+            # est is covering full intron
+            if intronStart > predExonStart and intronStop < predExonStop:
+               #fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop));
+               wrongPos += intronStop-intronStart
+            # est is nested inside intron
+            elif intronStart < predExonStart and intronStop > predExonStop:
+               #fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop));
+               wrongPos += intronStop-intronStart
+            # end of exonth is nested inside predExoniction
+            elif intronStart > predExonStart and predExonStop > intronStart and intronStop > predExonStop:
+               #fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop));
+               wrongPos += intronStop-intronStart
+            # predExoniction is nested inside exonth
+            elif intronStart < predExonStart and predExonStart < intronStop and intronStop < predExonStop:
+               #fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop));
+               wrongPos += intronStop-intronStart
+            else:
+               pass
+   
+   totalWrongPos[pos] = wrongPos