+ added some docu
[qpalma.git] / scripts / qpalma_main.py
index af47cd0..afac4b8 100644 (file)
@@ -40,10 +40,6 @@ from qpalma.Plif import Plf,compute_donacc
 
 import QPalmaConfiguration as Conf
 
-# these two imports are needed for the load genomic resp. interval query
-# functions
-#from Genefinding import *
-#from genome_utils import load_genomic
 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
@@ -70,12 +66,6 @@ def getData(training_set,exampleKey,run):
    dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
-   # splice score is located at g of ag
-   #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
-   #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
-   #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
-   #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
-
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -160,7 +150,8 @@ class QPalma:
          c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
          c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
 
-         currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+         currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
+         __dna_array,__est_array = currentAlignment.getAlignmentArrays()
 
          dna_array = [0.0]*result_len
          est_array = [0.0]*result_len
@@ -173,9 +164,15 @@ class QPalma:
          dna_array = None
          est_array = None
 
-      currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+      assert dna_array == __dna_array
+      assert est_array == __est_array
+
+      currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
       c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
 
+      __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResults()
+
       newSpliceAlign = zeros((current_num_path*dna_len,1))
       newEstAlign    = zeros((est_len*current_num_path,1))
       newWeightMatch = zeros((current_num_path*mm_len,1))
@@ -205,6 +202,12 @@ class QPalma:
       del c_qualityPlifsFeatures
       del currentAlignment
 
+      assert newSpliceAlign = __newSpliceAlign
+      assert newEstAlign    = __newEstAlign
+      assert newWeightMatch = __newWeightMatch
+      assert newDPScores    = __newDPScores 
+      assert newQualityPlifsFeatures = __newQualityPlifsFeatures 
+
       return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, est_array
 
@@ -565,14 +568,13 @@ class QPalma:
 #
 ###############################################################################
 
-   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
+   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
       """
       Performing a prediction takes...
       """
       self.run = run
       self.set_name = set_name
 
-      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
       full_working_path = run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
@@ -588,7 +590,6 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
-
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
@@ -605,23 +606,19 @@ class QPalma:
       # Load parameter vector to predict with
       param = cPickle.load(open(param_fn))
 
-      self.predict(run,prediction_set,param)
+      self.predict(run,prediction_set,param,seqInfo)
 
 
-   def predict(self,run,prediction_set,param):
+   def predict(self,run,prediction_set,param,seqInfo):
       """
-      This method...
+      This method contains the prediction loop over all reads of the given dataset.
       """
 
       self.run = run
 
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
+      self.use_quality_scores = False
+      if self.run['mode'] == 'using_quality_scores':
+         use_quality_scores = True
 
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
@@ -634,14 +631,13 @@ class QPalma:
       if not self.qpalma_debug_mode:
          self.plog('Starting prediction...\n')
 
-      donSP       = self.run['numDonSuppPoints']
-      accSP       = self.run['numAccSuppPoints']
-      lengthSP    = self.run['numLengthSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = self.run['numQualSuppPoints']
-      totalQualSP = self.run['totalQualSuppPoints']
-
-      totalQualityPenalties = zeros((totalQualSP,1))
+      #donSP       = self.run['numDonSuppPoints']
+      #accSP       = self.run['numAccSuppPoints']
+      #lengthSP    = self.run['numLengthSuppPoints']
+      #mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      #numq        = self.run['numQualSuppPoints']
+      #totalQualSP = self.run['totalQualSuppPoints']
+      #totalQualityPenalties = zeros((totalQualSP,1))
 
       problem_ctr = 0
 
@@ -651,23 +647,8 @@ class QPalma:
       # we take the first quality vector of the tuple of quality vectors
       quality_index = 0
 
-      # fetch the data needed
-      g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 6
-
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
-         print 'Current example %d' % example_key
-
          for example in prediction_set[example_key]:
 
             currentSeqInfo,read,currentQualities = example
@@ -678,13 +659,9 @@ class QPalma:
             if not self.qpalma_debug_mode:
                self.plog('Loading example id: %d...\n'% int(id))
 
-            if run['mode'] == 'normal':
-               quality = [40]*len(read)
-
-            if run['mode'] == 'using_quality_scores':
+            if use_quality_scores = True
                quality = currentQualities[quality_index]
-
-            if not run['enable_quality_scores']:
+            else:
                quality = [40]*len(read)
 
             try:
@@ -694,12 +671,11 @@ class QPalma:
                continue
 
             if not run['enable_splice_signals']:
-               for idx,elem in enumerate(currentDon):
-                  if elem != -inf:
+               for idx in range(len(currentDon)):
+                  if currentDon[idx] != -inf:
                      currentDon[idx] = 0.0
 
-               for idx,elem in enumerate(currentAcc):
-                  if elem != -inf:
+                  if currentAcc[idx] != -inf:
                      currentAcc[idx] = 0.0
 
             current_prediction = self.calc_alignment(currentDNASeq, read,\