+ changed python and c code in order to perform easier parameter handling and
[qpalma.git] / scripts / qpalma_main.py
index 164c1e5..b3edea8 100644 (file)
@@ -38,6 +38,10 @@ from qpalma.computeSpliceAlignWithQuality import *
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf,compute_donacc
 
+# these two imports are needed for the load genomic resp. interval query
+# functions
+#from Genefinding import *
+#from genome_utils import load_genomic
 from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
@@ -64,6 +68,12 @@ def getData(training_set,exampleKey,run):
    dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
+   # splice score is located at g of ag
+   #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+   #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+   #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
+
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -148,8 +158,8 @@ class QPalma:
          c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
          c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
 
-         currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
-         __dna_array,__est_array = currentAlignment.getAlignmentArrays()
+         currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+         _dna_array,_est_array = currentAlignment.getAlignmentArraysNew()
 
          dna_array = [0.0]*result_len
          est_array = [0.0]*result_len
@@ -162,14 +172,14 @@ class QPalma:
          dna_array = None
          est_array = None
 
-      assert dna_array == __dna_array
-      assert est_array == __est_array
+      assert dna_array == _dna_array
+      assert est_array == _est_array
 
-      currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
+      currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
       c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
 
-      __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
-      currentAlignment.getAlignmentResults()
+      _SpliceAlign, _EstAlign, _WeightMatch, _DPScores, _newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResultsNew()
 
       newSpliceAlign = zeros((current_num_path*dna_len,1))
       newEstAlign    = zeros((est_len*current_num_path,1))
@@ -200,11 +210,14 @@ class QPalma:
       del c_qualityPlifsFeatures
       del currentAlignment
 
-      assert newSpliceAlign == __newSpliceAlign
-      assert newEstAlign    == __newEstAlign
-      assert newWeightMatch == __newWeightMatch
-      assert newDPScores    == __newDPScores 
-      assert newQualityPlifsFeatures == __newQualityPlifsFeatures 
+
+      assert newSpliceAlign.flatten().tolist()[0] == _SpliceAlign
+      assert newEstAlign.flatten().tolist()[0]    == _EstAlign
+      assert newWeightMatch.flatten().tolist()[0]  == _WeightMatch
+      assert newDPScores.flatten().tolist()[0]     == _DPScores 
+      assert newQualityPlifsFeatures.flatten().tolist()[0]  == _newQualityPlifsFeatures 
+
+      pdb.set_trace()
 
       return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, est_array
@@ -566,13 +579,14 @@ class QPalma:
 #
 ###############################################################################
 
-   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
+   def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name,):
       """
       Performing a prediction takes...
       """
       self.run = run
       self.set_name = set_name
 
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
       full_working_path = run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
@@ -588,6 +602,7 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
+
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
@@ -609,15 +624,18 @@ class QPalma:
 
    def predict(self,run,prediction_set,param,seqInfo):
       """
-      This method contains the prediction loop over all reads of the given dataset.
+      This method...
       """
 
       self.run = run
-      self.read_size = 38
 
-      self.use_quality_scores = False
-      if self.run['mode'] == 'using_quality_scores':
-         use_quality_scores = True
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
 
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
@@ -638,7 +656,7 @@ class QPalma:
       #totalQualSP = self.run['totalQualSuppPoints']
       #totalQualityPenalties = zeros((totalQualSP,1))
 
-      problem_ctr = 0
+      self.problem_ctr = 0
 
       # where we store the predictions
       allPredictions = []
@@ -646,8 +664,23 @@ class QPalma:
       # we take the first quality vector of the tuple of quality vectors
       quality_index = 0
 
+      # fetch the data needed
+      #g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      #acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      #don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      #g_fmt = 'chr%d.dna.flat'
+      #s_fmt = 'contig_%d%s'
+
+      #num_chromo = 6
+
+      #accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      #seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
+         print 'Current example %d' % example_key
+
          for example in prediction_set[example_key]:
 
             currentSeqInfo,read,currentQualities = example
@@ -658,23 +691,28 @@ class QPalma:
             if not self.qpalma_debug_mode:
                self.plog('Loading example id: %d...\n'% int(id))
 
-            if use_quality_scores:
+            if run['mode'] == 'normal':
+               quality = [40]*len(read)
+
+            if run['mode'] == 'using_quality_scores':
                quality = currentQualities[quality_index]
-            else:
+
+            if not run['enable_quality_scores']:
                quality = [40]*len(read)
 
             try:
                currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
             except:
-               problem_ctr += 1
+               self.problem_ctr += 1
                continue
 
             if not run['enable_splice_signals']:
-               for idx in range(len(currentDon)):
-                  if currentDon[idx] != -inf:
+               for idx,elem in enumerate(currentDon):
+                  if elem != -inf:
                      currentDon[idx] = 0.0
 
-                  if currentAcc[idx] != -inf:
+               for idx,elem in enumerate(currentAcc):
+                  if elem != -inf:
                      currentAcc[idx] = 0.0
 
             current_prediction = self.calc_alignment(currentDNASeq, read,\
@@ -688,17 +726,17 @@ class QPalma:
             allPredictions.append(current_prediction)
 
       if not self.qpalma_debug_mode:
-         self.finalizePrediction(allPredictions,problem_ctr)
+         self.finalizePrediction(allPredictions)
       else:
          return allPredictions
 
 
-   def finalizePrediction(self,allPredictions,problem_ctr):
+   def finalizePrediction(self,allPredictions):
       # end of the prediction loop we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
       print 'Prediction completed'
       self.plog('Prediction completed\n')
-      mes =  'Problem ctr %d' % problem_ctr
+      mes =  'Problem ctr %d' % self.problem_ctr
       print mes
       self.plog(mes+'\n')
       self.logfh.close()
@@ -718,7 +756,7 @@ class QPalma:
       if '-' in read:
          self.plog('found gap\n')
          read = read.replace('-','')
-         assert len(read) == self.read_size
+         assert len(read) == Conf.read_size
 
       dna_len = len(dna)
       read_len = len(read)
@@ -738,6 +776,8 @@ class QPalma:
       true_map[0] = 1
       pathNr      = 0
 
+      #pdb.set_trace()
+
       _newSpliceAlign   = array.array('B',newSpliceAlign.flatten().tolist()[0])
       _newEstAlign      = array.array('B',newEstAlign.flatten().tolist()[0])