+ added some docu
authorFabio <fabio@congo.fml.local>
Mon, 1 Sep 2008 14:02:45 +0000 (16:02 +0200)
committerFabio <fabio@congo.fml.local>
Mon, 1 Sep 2008 14:02:45 +0000 (16:02 +0200)
+ changed interface for dynamic program
+ started refactoring of qpalma main code

doc/qpalma.tex
dyn_prog/Makefile
dyn_prog/QPalmaDP.i
dyn_prog/qpalma_dp.cpp
dyn_prog/qpalma_dp.h
scripts/grid_predict.py
scripts/qpalma_main.py
scripts/qpalma_pipeline.py

index 55c96c5..f338fb8 100644 (file)
@@ -88,6 +88,18 @@ check\_dataset\_consistency.py script.
 Performs sanity checking of the configurations file(s). Initializes needed
 directories.
 
+\section{Training}
+
+QPalma needs some training examples
+For the training you need:
+
+\begin{itemize}
+\item Training examples i.e. correct alignments (you can artificially generate those see \ref)
+\item Splice site predictions
+\item Flat file of the genomic sequence
+\item VMatch 
+\end{itemize}
+
 
 \section{Data Standards}
 
index 120d6a6..3a8afef 100644 (file)
@@ -18,12 +18,8 @@ HDRS= Mathmatics.h\
 
 OBJS = $(SRCS:%.cpp=%.o)
 
-#CXXFLAGS=-O3 -fPIC
-#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
-
-#CXXFLAGS=-O3 -fPIC -ggdb -pg
-#CXXFLAGS=-O3 -fPIC -ggdb -lprofiler #-fno-inline
-CXXFLAGS=-O3 -fPIC -g -ggdb 
+#CXXFLAGS=-O3 -fPIC -g -ggdb 
+CXXFLAGS=-Wall -Wshadow -std=c++98 -ggdb -O3 -fPIC -I/usr/include/python2.5
 LDFLAGS=-lprofiler -L/fml/ag-raetsch/home/fabio/own_libs/libunwind/lib -lunwind-x86_64 -lunwind
 
 
index 66ed219..4cf5864 100644 (file)
@@ -7,8 +7,8 @@
 #include "qpalma_dp.h"
 %}
 
-%include "std_vector.i"
-%include "std_string.i"
+/*%include "std_vector.i"
+%include "std_string.i"*/
 
 %include "carrays.i"
 
index a20af18..518591a 100644 (file)
@@ -246,10 +246,10 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       delete[] matrices[i];
 }
 
-void Alignment::getAlignmentResults(int* s_align, int* e_align,
+void Alignment::getAlignmentResultsOld(int* s_align, int* e_align,
       int* mmatrix_p, double* alignscores, double* qScores) {
 
-   int idx;
+   size_t idx;
    for(idx=0; idx<splice_align_size; idx++)
       s_align[idx] = splice_align[idx];
 
@@ -285,7 +285,7 @@ void Alignment::getAlignmentResults(int* s_align, int* e_align,
    //printf("Leaving getAlignmentResults...\n");
 }
 
-void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
+void Alignment::getAlignmentArraysOld(int* dna_align, int* est_align) {
 
    int idx;
    for(idx=0; idx<result_len; idx++) {
@@ -293,3 +293,78 @@ void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
       est_align[idx] = EST_ARRAY[idx];
    }
 }
+
+
+PyObject* Alignment::getAlignmentResults() {
+
+   PyObject* result = PyTuple_New(5);
+   PyObject* py_splice_align  = PyList_New(splice_align_size);
+   PyObject* py_est_align     = PyList_New(est_align_size);
+   PyObject* py_mmatrix       = PyList_New(mmatrix_param_size);
+   PyObject* py_align_scores  = PyList_New(alignmentscores_size);
+   PyObject* py_q_scores      = PyList_New(0);
+
+   size_t idx;
+   for(idx=0; idx<splice_align_size; idx++)
+      PyList_SetItem(py_splice_align,idx,PyInt_FromLong(splice_align[idx]));
+   //s_align[idx] = splice_align[idx];
+
+   for(idx=0; idx<est_align_size; idx++)
+      PyList_SetItem(py_est_align,idx,PyInt_FromLong(est_align[idx]));
+
+   //e_align[idx] =  est_align[idx];
+
+   for(idx=0; idx<mmatrix_param_size; idx++)
+      PyList_SetItem(py_mmatrix,idx,PyInt_FromLong(mmatrix_param[idx]));
+
+   //mmatrix_p[idx] = mmatrix_param[idx];
+
+   for(idx=0; idx<alignmentscores_size; idx++)
+      PyList_SetItem(py_align_scores,idx,PyFloat_FromDouble(alignmentscores[idx]));
+
+   //alignscores[idx] = alignmentscores[idx];
+   
+   if (use_quality_scores) {
+      penalty_struct currentPlif;
+      int ctr=0;
+      for (int z=0; z<nr_paths; z++) {
+         for(int estChar=1;estChar<6;estChar++) {
+            for(int dnaChar=0;dnaChar<6;dnaChar++) {
+
+               int currentPos = (estChar-1)*6+dnaChar;
+               currentPlif = qualityFeaturesAllPaths[z][currentPos];
+
+               for(int pidx=0; pidx<currentPlif.len; pidx++) {
+                  //qScores[ctr] = currentPlif.penalties[pidx];
+                  //printf("%f ",qScores[ctr]);
+                  PyList_Append(py_q_scores,PyFloat_FromDouble(currentPlif.penalties[pidx]));
+                  ctr++;
+               }
+               //printf("\n");
+      }}}
+      //printf("\nctr is %d\n",ctr);
+   }
+
+   PyTuple_SetItem(result,0,py_splice_align);
+   PyTuple_SetItem(result,1,py_est_align);
+   PyTuple_SetItem(result,2,py_mmatrix);
+   PyTuple_SetItem(result,3,py_align_scores);
+   PyTuple_SetItem(result,4,py_q_scores);
+
+   return result;
+}
+
+PyObject* Alignment::getAlignmentArrays() {
+
+   PyObject* result = PyTuple_New(2);
+   PyObject* dna_align  = PyList_New(result_len);
+   PyObject* est_align  = PyList_New(result_len);
+
+   int idx;
+   for(idx=0; idx<result_len; idx++) {
+      PyList_SetItem(dna_align,idx,PyInt_FromLong(DNA_ARRAY[idx]));
+      PyList_SetItem(est_align,idx,PyInt_FromLong(EST_ARRAY[idx]));
+   }
+
+   return result;
+}
index 9940b86..0253cd0 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _QPALMA_DP_H_
 #define _QPALMA_DP_H_
 
+#include "Python.h"
+
 #include "penalty_info.h"
 #include "debug_tools.h"
 
@@ -72,12 +74,12 @@ class Alignment {
       double* alignmentscores;
       struct penalty_struct** qualityFeaturesAllPaths;
 
-      int dna_len;
-      int est_len;
-      int mlen;
-      int nr_paths;
+      size_t dna_len;
+      size_t est_len;
+      size_t mlen;
+      size_t nr_paths;
 
-      int result_len;
+      size_t result_len;
       int* DNA_ARRAY;
       int* EST_ARRAY;
 
@@ -128,11 +130,17 @@ class Alignment {
       bool print_matrix);
 
       void getDNAEST();
-      void getAlignmentResults(int* s_align, int* e_align,
-      int* mmatrix_p, double* alignscores, double* qScores);
 
       int getResultLength() { return result_len; }
-      void getAlignmentArrays(int* dna_align, int* est_align);
+
+      void getAlignmentResultsOld(int* s_align, int* e_align,
+      int* mmatrix_p, double* alignscores, double* qScores);
+    
+      void getAlignmentArraysOld(int* dna_align, int* est_align);
+
+
+      PyObject* getAlignmentResults();
+      PyObject* getAlignmentArrays();
 };
 
 #endif  // _QPALMA_DP_H_
index 25197cb..c5ab75e 100644 (file)
@@ -45,8 +45,21 @@ def makeJobs(run,dataset_fn,chunks,param_fn):
 
    jobs=[]
 
+   # fetch the data needed
+   g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+   g_fmt = 'chr%d.dna.flat'
+   s_fmt = 'contig_%d%s'
+
+   num_chromo = 6
+
+   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
    for c_name,current_chunk in chunks:
-      current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,c_name])
+      current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,seqInfo,c_name])
       current_job.h_vmem = '20.0G'
       #current_job.express = 'True'
 
@@ -110,13 +123,13 @@ def create_and_submit():
    (sid, jobids) = submit_jobs(functionJobs)
 
 
-def g_predict(run,dataset_fn,prediction_keys,param_fn,set_name):
+def g_predict(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
    """
   
    """
 
    qp = QPalma(False)
-   qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,set_name)
+   qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name)
    return 'finished prediction of set %s.' % set_name
 
 
index af47cd0..afac4b8 100644 (file)
@@ -40,10 +40,6 @@ from qpalma.Plif import Plf,compute_donacc
 
 import QPalmaConfiguration as Conf
 
-# these two imports are needed for the load genomic resp. interval query
-# functions
-#from Genefinding import *
-#from genome_utils import load_genomic
 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
@@ -70,12 +66,6 @@ def getData(training_set,exampleKey,run):
    dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
-   # splice score is located at g of ag
-   #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
-   #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
-   #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
-   #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
-
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -160,7 +150,8 @@ class QPalma:
          c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
          c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
 
-         currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+         currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
+         __dna_array,__est_array = currentAlignment.getAlignmentArrays()
 
          dna_array = [0.0]*result_len
          est_array = [0.0]*result_len
@@ -173,9 +164,15 @@ class QPalma:
          dna_array = None
          est_array = None
 
-      currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+      assert dna_array == __dna_array
+      assert est_array == __est_array
+
+      currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
       c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
 
+      __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResults()
+
       newSpliceAlign = zeros((current_num_path*dna_len,1))
       newEstAlign    = zeros((est_len*current_num_path,1))
       newWeightMatch = zeros((current_num_path*mm_len,1))
@@ -205,6 +202,12 @@ class QPalma:
       del c_qualityPlifsFeatures
       del currentAlignment
 
+      assert newSpliceAlign = __newSpliceAlign
+      assert newEstAlign    = __newEstAlign
+      assert newWeightMatch = __newWeightMatch
+      assert newDPScores    = __newDPScores 
+      assert newQualityPlifsFeatures = __newQualityPlifsFeatures 
+
       return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, est_array
 
@@ -565,14 +568,13 @@ class QPalma:
 #
 ###############################################################################
 
-   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
+   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
       """
       Performing a prediction takes...
       """
       self.run = run
       self.set_name = set_name
 
-      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
       full_working_path = run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
@@ -588,7 +590,6 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
-
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
@@ -605,23 +606,19 @@ class QPalma:
       # Load parameter vector to predict with
       param = cPickle.load(open(param_fn))
 
-      self.predict(run,prediction_set,param)
+      self.predict(run,prediction_set,param,seqInfo)
 
 
-   def predict(self,run,prediction_set,param):
+   def predict(self,run,prediction_set,param,seqInfo):
       """
-      This method...
+      This method contains the prediction loop over all reads of the given dataset.
       """
 
       self.run = run
 
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
+      self.use_quality_scores = False
+      if self.run['mode'] == 'using_quality_scores':
+         use_quality_scores = True
 
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
@@ -634,14 +631,13 @@ class QPalma:
       if not self.qpalma_debug_mode:
          self.plog('Starting prediction...\n')
 
-      donSP       = self.run['numDonSuppPoints']
-      accSP       = self.run['numAccSuppPoints']
-      lengthSP    = self.run['numLengthSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = self.run['numQualSuppPoints']
-      totalQualSP = self.run['totalQualSuppPoints']
-
-      totalQualityPenalties = zeros((totalQualSP,1))
+      #donSP       = self.run['numDonSuppPoints']
+      #accSP       = self.run['numAccSuppPoints']
+      #lengthSP    = self.run['numLengthSuppPoints']
+      #mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      #numq        = self.run['numQualSuppPoints']
+      #totalQualSP = self.run['totalQualSuppPoints']
+      #totalQualityPenalties = zeros((totalQualSP,1))
 
       problem_ctr = 0
 
@@ -651,23 +647,8 @@ class QPalma:
       # we take the first quality vector of the tuple of quality vectors
       quality_index = 0
 
-      # fetch the data needed
-      g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 6
-
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
-         print 'Current example %d' % example_key
-
          for example in prediction_set[example_key]:
 
             currentSeqInfo,read,currentQualities = example
@@ -678,13 +659,9 @@ class QPalma:
             if not self.qpalma_debug_mode:
                self.plog('Loading example id: %d...\n'% int(id))
 
-            if run['mode'] == 'normal':
-               quality = [40]*len(read)
-
-            if run['mode'] == 'using_quality_scores':
+            if use_quality_scores = True
                quality = currentQualities[quality_index]
-
-            if not run['enable_quality_scores']:
+            else:
                quality = [40]*len(read)
 
             try:
@@ -694,12 +671,11 @@ class QPalma:
                continue
 
             if not run['enable_splice_signals']:
-               for idx,elem in enumerate(currentDon):
-                  if elem != -inf:
+               for idx in range(len(currentDon)):
+                  if currentDon[idx] != -inf:
                      currentDon[idx] = 0.0
 
-               for idx,elem in enumerate(currentAcc):
-                  if elem != -inf:
+                  if currentAcc[idx] != -inf:
                      currentAcc[idx] = 0.0
 
             current_prediction = self.calc_alignment(currentDNASeq, read,\
index f7614f8..6dc82f8 100644 (file)
@@ -15,7 +15,6 @@
 #
 #
 
-
 from optparse import OptionParser
 
 from qpalma.gridtools import ApproximationTask,PreprocessingTask