+ changed python and c code in order to perform easier parameter handling and
authorFabio <fabio@congo.fml.local>
Mon, 1 Sep 2008 15:11:28 +0000 (17:11 +0200)
committerFabio <fabio@congo.fml.local>
Mon, 1 Sep 2008 15:11:28 +0000 (17:11 +0200)
reduce wrapper code on the python layer

dyn_prog/qpalma_dp.cpp
dyn_prog/qpalma_dp.h
qpalma/sequence_utils.py
scripts/qpalma_main.py
tests/test_qpalma_prediction.py

index 518591a..1c71936 100644 (file)
@@ -246,7 +246,7 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
       delete[] matrices[i];
 }
 
-void Alignment::getAlignmentResultsOld(int* s_align, int* e_align,
+void Alignment::getAlignmentResults(int* s_align, int* e_align,
       int* mmatrix_p, double* alignscores, double* qScores) {
 
    size_t idx;
@@ -285,7 +285,7 @@ void Alignment::getAlignmentResultsOld(int* s_align, int* e_align,
    //printf("Leaving getAlignmentResults...\n");
 }
 
-void Alignment::getAlignmentArraysOld(int* dna_align, int* est_align) {
+void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
 
    int idx;
    for(idx=0; idx<result_len; idx++) {
@@ -295,9 +295,8 @@ void Alignment::getAlignmentArraysOld(int* dna_align, int* est_align) {
 }
 
 
-PyObject* Alignment::getAlignmentResults() {
+PyObject* Alignment::getAlignmentResultsNew() {
 
-   PyObject* result = PyTuple_New(5);
    PyObject* py_splice_align  = PyList_New(splice_align_size);
    PyObject* py_est_align     = PyList_New(est_align_size);
    PyObject* py_mmatrix       = PyList_New(mmatrix_param_size);
@@ -345,26 +344,29 @@ PyObject* Alignment::getAlignmentResults() {
       //printf("\nctr is %d\n",ctr);
    }
 
-   PyTuple_SetItem(result,0,py_splice_align);
-   PyTuple_SetItem(result,1,py_est_align);
-   PyTuple_SetItem(result,2,py_mmatrix);
-   PyTuple_SetItem(result,3,py_align_scores);
-   PyTuple_SetItem(result,4,py_q_scores);
+   PyObject* result = PyTuple_Pack(5,
+   py_splice_align, py_est_align, py_mmatrix,
+   py_align_scores, py_q_scores);
+
+   //PyTuple_SetItem(result,0,py_splice_align);
+   //PyTuple_SetItem(result,1,py_est_align);
+   //PyTuple_SetItem(result,2,py_mmatrix);
+   //PyTuple_SetItem(result,3,py_align_scores);
+   //PyTuple_SetItem(result,4,py_q_scores);
 
    return result;
 }
 
-PyObject* Alignment::getAlignmentArrays() {
+PyObject* Alignment::getAlignmentArraysNew() {
 
-   PyObject* result = PyTuple_New(2);
-   PyObject* dna_align  = PyList_New(result_len);
-   PyObject* est_align  = PyList_New(result_len);
+   PyObject* py_dna_align = PyList_New(result_len);
+   PyObject* py_est_align = PyList_New(result_len);
 
-   int idx;
-   for(idx=0; idx<result_len; idx++) {
-      PyList_SetItem(dna_align,idx,PyInt_FromLong(DNA_ARRAY[idx]));
-      PyList_SetItem(est_align,idx,PyInt_FromLong(EST_ARRAY[idx]));
+   for(size_t idx=0; idx<result_len; idx++) {
+      PyList_SetItem(py_dna_align,idx,PyInt_FromLong(DNA_ARRAY[idx]));
+      PyList_SetItem(py_est_align,idx,PyInt_FromLong(EST_ARRAY[idx]));
    }
 
+   PyObject* result = PyTuple_Pack(2, py_dna_align, py_est_align);
    return result;
 }
index 0253cd0..07c5f37 100644 (file)
@@ -133,14 +133,14 @@ class Alignment {
 
       int getResultLength() { return result_len; }
 
-      void getAlignmentResultsOld(int* s_align, int* e_align,
+      void getAlignmentResults(int* s_align, int* e_align,
       int* mmatrix_p, double* alignscores, double* qScores);
     
-      void getAlignmentArraysOld(int* dna_align, int* est_align);
+      void getAlignmentArrays(int* dna_align, int* est_align);
 
 
-      PyObject* getAlignmentResults();
-      PyObject* getAlignmentArrays();
+      PyObject* getAlignmentResultsNew();
+      PyObject* getAlignmentArraysNew();
 };
 
 #endif  // _QPALMA_DP_H_
index a83dfdc..e5130af 100644 (file)
@@ -17,7 +17,7 @@ import numpy
 
 from numpy.matlib import inf
 
-#from Genefinding import *
+from Genefinding import *
 from genome_utils import load_genomic
 
 extended_alphabet    = ['-','a','c','g','t','n','[',']']
index 164c1e5..b3edea8 100644 (file)
@@ -38,6 +38,10 @@ from qpalma.computeSpliceAlignWithQuality import *
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf,compute_donacc
 
+# these two imports are needed for the load genomic resp. interval query
+# functions
+#from Genefinding import *
+#from genome_utils import load_genomic
 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
@@ -64,6 +68,12 @@ def getData(training_set,exampleKey,run):
    dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
+   # splice score is located at g of ag
+   #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+   #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+   #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
+
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -148,8 +158,8 @@ class QPalma:
          c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
          c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
 
-         currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
-         __dna_array,__est_array = currentAlignment.getAlignmentArrays()
+         currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+         _dna_array,_est_array = currentAlignment.getAlignmentArraysNew()
 
          dna_array = [0.0]*result_len
          est_array = [0.0]*result_len
@@ -162,14 +172,14 @@ class QPalma:
          dna_array = None
          est_array = None
 
-      assert dna_array == __dna_array
-      assert est_array == __est_array
+      assert dna_array == _dna_array
+      assert est_array == _est_array
 
-      currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
+      currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
       c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
 
-      __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
-      currentAlignment.getAlignmentResults()
+      _SpliceAlign, _EstAlign, _WeightMatch, _DPScores, _newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResultsNew()
 
       newSpliceAlign = zeros((current_num_path*dna_len,1))
       newEstAlign    = zeros((est_len*current_num_path,1))
@@ -200,11 +210,14 @@ class QPalma:
       del c_qualityPlifsFeatures
       del currentAlignment
 
-      assert newSpliceAlign == __newSpliceAlign
-      assert newEstAlign    == __newEstAlign
-      assert newWeightMatch == __newWeightMatch
-      assert newDPScores    == __newDPScores 
-      assert newQualityPlifsFeatures == __newQualityPlifsFeatures 
+
+      assert newSpliceAlign.flatten().tolist()[0] == _SpliceAlign
+      assert newEstAlign.flatten().tolist()[0]    == _EstAlign
+      assert newWeightMatch.flatten().tolist()[0]  == _WeightMatch
+      assert newDPScores.flatten().tolist()[0]     == _DPScores 
+      assert newQualityPlifsFeatures.flatten().tolist()[0]  == _newQualityPlifsFeatures 
+
+      pdb.set_trace()
 
       return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, est_array
@@ -566,13 +579,14 @@ class QPalma:
 #
 ###############################################################################
 
-   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
+   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name,):
       """
       Performing a prediction takes...
       """
       self.run = run
       self.set_name = set_name
 
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
       full_working_path = run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
@@ -588,6 +602,7 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
+
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
@@ -609,15 +624,18 @@ class QPalma:
 
    def predict(self,run,prediction_set,param,seqInfo):
       """
-      This method contains the prediction loop over all reads of the given dataset.
+      This method...
       """
 
       self.run = run
-      self.read_size = 38
 
-      self.use_quality_scores = False
-      if self.run['mode'] == 'using_quality_scores':
-         use_quality_scores = True
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
 
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
@@ -638,7 +656,7 @@ class QPalma:
       #totalQualSP = self.run['totalQualSuppPoints']
       #totalQualityPenalties = zeros((totalQualSP,1))
 
-      problem_ctr = 0
+      self.problem_ctr = 0
 
       # where we store the predictions
       allPredictions = []
@@ -646,8 +664,23 @@ class QPalma:
       # we take the first quality vector of the tuple of quality vectors
       quality_index = 0
 
+      # fetch the data needed
+      #g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      #g_fmt = 'chr%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #num_chromo = 6
+
+      #accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      #seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
+         print 'Current example %d' % example_key
+
          for example in prediction_set[example_key]:
 
             currentSeqInfo,read,currentQualities = example
@@ -658,23 +691,28 @@ class QPalma:
             if not self.qpalma_debug_mode:
                self.plog('Loading example id: %d...\n'% int(id))
 
-            if use_quality_scores:
+            if run['mode'] == 'normal':
+               quality = [40]*len(read)
+
+            if run['mode'] == 'using_quality_scores':
                quality = currentQualities[quality_index]
-            else:
+
+            if not run['enable_quality_scores']:
                quality = [40]*len(read)
 
             try:
                currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
             except:
-               problem_ctr += 1
+               self.problem_ctr += 1
                continue
 
             if not run['enable_splice_signals']:
-               for idx in range(len(currentDon)):
-                  if currentDon[idx] != -inf:
+               for idx,elem in enumerate(currentDon):
+                  if elem != -inf:
                      currentDon[idx] = 0.0
 
-                  if currentAcc[idx] != -inf:
+               for idx,elem in enumerate(currentAcc):
+                  if elem != -inf:
                      currentAcc[idx] = 0.0
 
             current_prediction = self.calc_alignment(currentDNASeq, read,\
@@ -688,17 +726,17 @@ class QPalma:
             allPredictions.append(current_prediction)
 
       if not self.qpalma_debug_mode:
-         self.finalizePrediction(allPredictions,problem_ctr)
+         self.finalizePrediction(allPredictions)
       else:
          return allPredictions
 
 
-   def finalizePrediction(self,allPredictions,problem_ctr):
+   def finalizePrediction(self,allPredictions):
       # end of the prediction loop we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
       print 'Prediction completed'
       self.plog('Prediction completed\n')
-      mes =  'Problem ctr %d' % problem_ctr
+      mes =  'Problem ctr %d' % self.problem_ctr
       print mes
       self.plog(mes+'\n')
       self.logfh.close()
@@ -718,7 +756,7 @@ class QPalma:
       if '-' in read:
          self.plog('found gap\n')
          read = read.replace('-','')
-         assert len(read) == self.read_size
+         assert len(read) == Conf.read_size
 
       dna_len = len(dna)
       read_len = len(read)
@@ -738,6 +776,8 @@ class QPalma:
       true_map[0] = 1
       pathNr      = 0
 
+      #pdb.set_trace()
+
       _newSpliceAlign   = array.array('B',newSpliceAlign.flatten().tolist()[0])
       _newEstAlign      = array.array('B',newEstAlign.flatten().tolist()[0])
        
index 936a163..8e5e37a 100644 (file)
@@ -98,7 +98,6 @@ class TestQPalmaPrediction(unittest.TestCase):
             print 'size'
             print len(example)
 
-
       # fetch the data needed
       g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
       acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
@@ -112,11 +111,10 @@ class TestQPalmaPrediction(unittest.TestCase):
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
       seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
-
-
       qp = QPalma(True)
       #qp.init_prediction(run,set_name)
       allPredictions = qp.predict(run,self.prediction_set,param,seqInfo )
+
       for current_prediction in allPredictions:
          align_str = print_prediction(current_prediction)
          print align_str
@@ -134,6 +132,7 @@ class TestQPalmaPrediction(unittest.TestCase):
          print alignment_reconstruct(current_prediction,numExons)
          #print id,start_pos,predExons
 
+      print 'Problem counter is %d' % qp.problem_ctr 
 
 if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)