+ added paths for the dataset generation
author Fabio <fabio@congo.fml.local>
Sat, 13 Sep 2008 02:16:56 +0000 (04:16 +0200)
committer Fabio <fabio@congo.fml.local>
Sat, 13 Sep 2008 02:16:56 +0000 (04:16 +0200)
+ checked dataset generation

qpalma/DatasetUtils.py
qpalma/gridtools.py
qpalma/sequence_utils.py
scripts/SettingsParser.py
scripts/qpalma_pipeline.py

diff --git a/qpalma/DatasetUtils.py b/qpalma/DatasetUtils.py
index b98de77..f9c1c34 100644
--- a/qpalma/DatasetUtils.py
+++ b/qpalma/DatasetUtils.py
@@ -9,10 +9,13 @@
 import array
 import cPickle
 import os
+import os.path
 import pdb
 
 from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
 
+jp = os.path.join
+
 illumina_ga_range = (-5,40)
 #roche_454_range   =
 
@@ -52,12 +55,12 @@ def generatePredictionDataset(settings):
    """
 
    #map_file = settings['map_file']
-   map_file = jp(result_dir  = self.global_settings['approximation_dir'],'map.vm')
+   map_file = jp(settings['approximation_dir'],'map.vm')
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
    dataset = {}
 
-   prb_offset = 64
+   prb_offset = 50
 
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
    if settings['platform'] == 'IGA':
@@ -97,13 +100,13 @@ def generatePredictionDataset(settings):
       assert not '[' in read_seq and not ']' in read_seq
 
-      # we use an area of +/-  `self.half_window_size` nucleotides around the seed position
+      # we use an area of +/- `half_window_size` nucleotides around the seed position
-      if pos > self.half_window_size+1:
-         us_offset = self.half_window_size
+      if pos > half_window_size+1:
+         us_offset = half_window_size
       else:
          us_offset = pos - 1 
 
-      if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
-         ds_offset = self.half_window_size
+      if pos+half_window_size < seqInfo.chromo_sizes[chromo]:
+         ds_offset = half_window_size
       else:
          ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
          
@@ -115,7 +118,7 @@ def generatePredictionDataset(settings):
       # In order to save some space we use a signed char to store the
       # qualities. Each quality element can range as follows: -128 <= elem <= 127
       
-      q_values = map(lambda x: ord(x)-self.prb_offset,slist[5])
+      q_values = map(lambda x: ord(x)-prb_offset,slist[5])
 
       if settings['perform_checks']:
          for entry in q_values:
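
The quality decoding above turns each ASCII character of the quality string
into an integer score by subtracting the platform offset; with the signed-char
storage mentioned in the comment, every element must stay within -128..127,
and for the 'IGA' platform the perform_checks branch additionally validates
each value against illumina_ga_range = (-5,40). A small sketch (the quality
string is a made-up placeholder):

    prb_offset = 64                    # phred+64 style offset, as above
    qual_str   = 'hhhU'                # placeholder quality string
    q_values   = map(lambda x: ord(x)-prb_offset, qual_str)
    # q_values == [40, 40, 40, 21]; each entry must satisfy -128 <= e <= 127
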
@@ -140,12 +143,18 @@ def generatePredictionDataset(settings):
 def saveData(prefix,dataset,settings):
    """
    """
-   ddir = settings['dataset_dir']
-   dataset_fn        = jp(ddir,'%s_data.pickle'%prefix)
-   dataset_keys_fn   = jp(ddir,'%s_data.keys.pickle'%prefix)
 
-   assert not os.path.exists(dataset_fn), 'The data_file already exists!'
-   assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+   if prefix == 'prediction':
+      dataset_fn        = settings['prediction_dataset_fn']
+      dataset_keys_fn   = settings['prediction_dataset_keys_fn']
+   elif prefix == 'training':
+      dataset_fn        = settings['training_dataset_fn']
+      dataset_keys_fn   = settings['training_dataset_keys_fn']
+   else:
+      assert False, 'Unknown dataset prefix: %s' % prefix
+
+   assert not os.path.exists(dataset_fn), 'The data file already exists!'
+   assert not os.path.exists(dataset_keys_fn), 'The data keys file already exists!'
 
    # saving new dataset and single keys as well
    cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
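
saveData() now resolves its two output paths from the settings dict instead of
composing them from dataset_dir; the four *_dataset*_fn keys are filled in by
makeSettings() in scripts/SettingsParser.py further down. A hedged usage
sketch (both paths are placeholders):

    settings = {
       'prediction_dataset_fn'      : '/tmp/run/dataset/prediction_data.pickle',
       'prediction_dataset_keys_fn' : '/tmp/run/dataset/prediction_data.keys.pickle',
    }
    saveData('prediction', {}, settings)   # asserts that neither file exists yet
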
diff --git a/qpalma/gridtools.py b/qpalma/gridtools.py
index 34301c5..40282a8 100644
--- a/qpalma/gridtools.py
+++ b/qpalma/gridtools.py
@@ -48,14 +48,14 @@ class ClusterTask(Thread):
 
    """
    
-   def __init__(self,global_settings):
+   def __init__(self,settings):
       self.sleep_time = 0
 
       # this list stores the cluster/local jobs objects
       self.functionJobs    = []
 
       # this object stores the configuration
-      self.global_settings = global_settings
+      self.settings = settings
 
 
    def CreateJobs(self):
@@ -119,18 +119,18 @@ class ApproximationTask(ClusterTask):
       Create...
       """
 
-      num_splits = self.global_settings['num_splits']
+      num_splits = self.settings['num_splits']
 
       #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
       #param_fname    = jp(run_dir,'param_526.pickle')
-      param_fname = self.global_settings['prediction_parameter_fn']
+      param_fname = self.settings['prediction_parameter_fn']
       #run_fname      = jp(run_dir,'run_obj.pickle')
-      run_fname = self.global_settings['run_fn']
+      run_fname = self.settings['run_fn']
 
       #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
-      result_dir = self.global_settings['approximation_dir']
+      result_dir = self.settings['approximation_dir']
 
-      original_map_fname = self.global_settings['read_ascii_data_fn']
+      original_map_fname = self.settings['unspliced_reads_fn']
       split_file(original_map_fname,result_dir,num_splits)
    
       self.result_files = []
@@ -140,7 +140,7 @@ class ApproximationTask(ClusterTask):
          result_fname   = jp(result_dir,'map.vm.part_%d.heuristic'%idx)
          self.result_files.append(result_fname)
 
-         current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.global_settings])
+         current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.settings])
          current_job.h_vmem = '25.0G'
          #current_job.express = 'True'
 
@@ -150,10 +150,10 @@ class ApproximationTask(ClusterTask):
 
 
    def collectResults(self):
-      result_dir  = self.global_settings['approximation_dir']
+      result_dir  = self.settings['approximation_dir']
       combined_fn = jp(result_dir,'map.vm.spliced')
       combine_files(self.result_files,combined_fn)
-      combine_files([combined_fn,settings[spliced_reads_fn]],'map.vm')
+      combine_files([combined_fn,self.settings['spliced_reads_fn']],jp(result_dir,'map.vm'))
 
 
 def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname,settings):
@@ -173,9 +173,12 @@ class PreprocessingTask(ClusterTask):
 
 
 class AlignmentTask(ClusterTask):
+   """
+   This task represents the main alignment step of the QPalma pipeline.
+   """
 
-   def __init__(self):
-      ClusterTask.__init__(self)
+   def __init__(self,settings):
+      ClusterTask.__init__(self,settings)
 
 
-   def CreateJobs():
+   def CreateJobs(self):
@@ -183,20 +186,18 @@ class AlignmentTask(ClusterTask):
 
       """
 
-      num_splits = self.global_settings['num_splits']
+      num_splits = self.settings['num_splits']
 
       jp = os.path.join
 
-      run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
-      run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
+      run   = cPickle.load(open(self.settings['run_fn']))
       run['name'] = 'saved_run'
 
-      param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
+      param = self.settings['prediction_parameter_fn']
 
-      run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
-      dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
-      prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
+      run['result_dir']    = self.settings['prediction_dir']
+      dataset_fn           = self.settings['prediction_dataset_fn']
+      prediction_keys_fn   = self.settings['prediction_dataset_keys_fn']
 
       prediction_keys = cPickle.load(open(prediction_keys_fn))
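
Every ClusterTask subclass is driven through the same four-step protocol that
scripts/qpalma_pipeline.py uses below. A hypothetical driver (the settings
file name is made up):

    settings = parseSettings('qpalma.conf')    # assumed settings file

    task = ApproximationTask(settings)
    task.CreateJobs()               # split the map file, one KybJob per split
    task.Submit()
    task.CheckIfTaskFinished()      # wait for the grid jobs to complete
    task.collectResults()           # merge the map.vm.part_*.heuristic files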
 
diff --git a/qpalma/sequence_utils.py b/qpalma/sequence_utils.py
index 54bc0d0..8effa98 100644
--- a/qpalma/sequence_utils.py
+++ b/qpalma/sequence_utils.py
@@ -200,15 +200,11 @@ class DataAccessWrapper:
 
    def __init__(self,settings):
       self.genomic_dir        = settings['genome_dir'] 
-      self.acc_score_dir      = settings['acceptor_scores_dir'] 
-      self.don_score_dir      = settings['donor_scores__dir'] 
+      self.acc_score_dir      = settings['acceptor_scores_loc'] 
+      self.don_score_dir      = settings['donor_scores_loc'] 
       self.genomic_fmt        = settings['genome_file_fmt'] 
       self.sscore_fmt         = settings['splice_score_file_fmt'] 
 
-      assert os.path.isdir(genomic_data_dir)
-      assert os.path.isdir(acc_score_dir)
-      assert os.path.isdir(don_score_dir)
-
 
    def get_genomic_fragment_fn(self,id,strand):
       if strand == '+':
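
DataAccessWrapper now reads the renamed *_loc keys (previously
acceptor_scores_dir and the misspelled donor_scores__dir). A hedged
construction sketch; all paths and format strings below are placeholders:

    settings = {
       'genome_dir'            : '/data/genome',
       'acceptor_scores_loc'   : '/data/acc_scores',
       'donor_scores_loc'      : '/data/don_scores',
       'genome_file_fmt'       : 'chr%d.dna.flat',      # placeholder format
       'splice_score_file_fmt' : 'contig_%d%s',         # placeholder format
    }
    wrapper = DataAccessWrapper(settings)
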
@@ -306,7 +302,7 @@ class SeqSpliceInfo():
 
       #print genomicSeq_start,genomicSeq_stop
 
-      assert genomicSeq_start < genomicSeq_stop
+      assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
 
       if strand == '+':
          s_start  = genomicSeq_start - 1
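
The second operand of an assert statement is only evaluated when the condition
fails, so the pdb.set_trace() added above drops into the debugger exactly at a
violating call, before the AssertionError propagates. A minimal demonstration:

    import pdb

    x, y = 5, 3
    assert x < y, pdb.set_trace()   # condition is False, so pdb starts here
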
diff --git a/scripts/SettingsParser.py b/scripts/SettingsParser.py
index 7abcd59..ac66128 100644
--- a/scripts/SettingsParser.py
+++ b/scripts/SettingsParser.py
@@ -21,53 +21,63 @@ def parseSettingsFile(filename):
    """
-   This function parse all key value pairs from the given filename
+   This function parses all key/value pairs from the given file
    """
-   global_settings = {}
+   settings = {}
    for line in open(filename):
       if (not line.strip()) or line.startswith('#'):
          pass
       else:
          key, val = line.strip().replace(' ', '').split('=')
-         global_settings[key] = val
+         settings[key] = val
 
-   return global_settings
+   return settings
 
 
-def makeSettings(global_settings):
+def makeSettings(settings):
    """
    
    """
-   assert os.path.exists(global_settings['result_dir']),'Error: You have to specify a existing result directory!'
-   result_dir = global_settings['result_dir']
-   global_settings['approximation_dir'] = jp(result_dir, 'approximation')
-   global_settings['preproc_dir'] = jp(result_dir, 'preprocessing')
-   global_settings['postproc_dir'] = jp(result_dir, 'postprocessing')
-   global_settings['prediction_dir'] = jp(result_dir, 'prediction')
-   global_settings['training_dir'] = jp(result_dir, 'training')
-
-   for dir_name in ['approximation_dir', 'preproc_dir', 'postproc_dir',\
+   assert os.path.exists(settings['result_dir']),'Error: You have to specify an existing result directory!'
+   result_dir = settings['result_dir']
+   settings['approximation_dir'] = jp(result_dir, 'approximation')
+   settings['dataset_dir'] = jp(result_dir, 'dataset')
+   settings['preproc_dir'] = jp(result_dir, 'preprocessing')
+   settings['postproc_dir'] = jp(result_dir, 'postprocessing')
+   settings['prediction_dir'] = jp(result_dir, 'prediction')
+   settings['training_dir'] = jp(result_dir, 'training')
+
+   for dir_name in ['approximation_dir','dataset_dir', 'preproc_dir', 'postproc_dir',\
      'prediction_dir', 'training_dir']:
       try:
-         os.mkdir(global_settings[dir_name])
+         os.mkdir(settings[dir_name])
          continue
       except:
          print ('Error: There was a problem generating the subdirectory: %s' % dir_name)
 
+   ddir = settings['dataset_dir']
+   settings['prediction_dataset_fn']      = jp(ddir,'prediction_data.pickle')
+   settings['prediction_dataset_keys_fn'] = jp(ddir,'prediction_data.keys.pickle')
+   settings['training_dataset_fn']        = jp(ddir,'training_data.pickle')
+   settings['training_dataset_keys_fn']   = jp(ddir,'training_data.keys.pickle')
+
+
    try:
-      os.mkdir(global_settings['global_log_fn'])
+      os.mkdir(settings['global_log_fn'])
    except:
-      print 'Error: There was a problem generating the logfile %s' % global_settings['global_log_fn']
+      print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
 
-   
    try:
-      global_settings['num_splits'] = int(global_settings['num_splits'])
+      settings['num_splits'] = int(settings['num_splits'])
    except:
       print 'Error: num_splits has to be a positive integer'
 
-   return global_settings
+   settings['allowed_fragments'] = eval(settings['allowed_fragments'])
+   settings['half_window_size']  = int(settings['half_window_size'])
+
+   return settings
 
 
-def checkSettings(global_settings):
-   for (key, val,) in global_settings.items():
+def checkSettings(settings):
+   for (key, val,) in settings.items():
       if key.endswith('_fn'):
          assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
       if key.endswith('_dir'):
@@ -77,8 +87,8 @@ def checkSettings(global_settings):
 
 
 def parseSettings(filename):
-   global_settings = parseSettingsFile(filename)
-   global_settings = makeSettings(global_settings)
-   assert checkSettings(global_settings),'Check your settings some entries were invalid!'
+   settings = parseSettingsFile(filename)
+   settings = makeSettings(settings)
+   assert checkSettings(settings),'Check your settings: some entries were invalid!'
 
-   return global_settings
+   return settings
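
parseSettingsFile() reads one key = value pair per line, skipping blank lines
and '#' comments and stripping all spaces from the pair; makeSettings() then
casts the few non-string entries. A self-contained sketch of that parsing,
restricted to keys this commit references (the values are made up):

    example = '''
    # hypothetical excerpt of a QPalma settings file
    result_dir        = /tmp/qpalma_run
    num_splits        = 4
    half_window_size  = 1500
    allowed_fragments = [1]
    '''

    settings = {}
    for line in example.splitlines():
       line = line.strip()
       if (not line) or line.startswith('#'):
          continue
       key, val = line.replace(' ', '').split('=')
       settings[key] = val

    # the casts applied by makeSettings():
    settings['num_splits']        = int(settings['num_splits'])
    settings['half_window_size']  = int(settings['half_window_size'])
    settings['allowed_fragments'] = eval(settings['allowed_fragments'])
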
diff --git a/scripts/qpalma_pipeline.py b/scripts/qpalma_pipeline.py
index 5e4bea2..7b4c7a6 100644
--- a/scripts/qpalma_pipeline.py
+++ b/scripts/qpalma_pipeline.py
@@ -21,7 +21,7 @@ import sys
 from qpalma.gridtools import ApproximationTask,PreprocessingTask
 from qpalma.gridtools import AlignmentTask,PostprocessingTask
 
-from qpalma.DatasetUtils import generateDataset
+from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
 
 from SettingsParser import parseSettings
 
@@ -47,7 +47,7 @@ class System:
       all parameters.
       """
 
-      self.global_settings = parseSettings(filename)
+      self.settings = parseSettings(filename)
 
 
    def training(self):
@@ -57,7 +57,7 @@ class System:
       algorithm.
       """
 
-      pre_task = TrainingPreprocessingTask(self.global_settings)
+      pre_task = TrainingPreprocessingTask(self.settings)
       pre_task.createJobs()
       pre_task.submit() 
       pre_task.checkIfTaskFinished()
@@ -73,34 +73,29 @@ class System:
       # Before creating a candidate spliced read dataset we have to first filter
       # the matches from the first seed finding run.
 
-      approx_task = ApproximationTask(self.global_settings)
-      approx_task.CreateJobs()
-      approx_task.Submit()
-      approx_task.CheckIfTaskFinished()
+      #approx_task = ApproximationTask(self.settings)
+      #approx_task.CreateJobs()
+      #approx_task.Submit()
+      #approx_task.CheckIfTaskFinished()
       
       # After filtering combine the filtered matches from the first run and the
       # found matches from the second run to a full dataset
 
-      generateDataset(self.global_settings)
-      #pre_task = PreprocessingTask(self.global_settings)
+      generatePredictionDataset(self.settings)
+      #pre_task = PreprocessingTask(self.settings)
       #pre_task.CreateJobs()
       #pre_task.Submit() 
       #pre_task.CheckIfTaskFinished()
 
-      sys.exit(0)
-
-      # Now that we have a dataset we can perform the accurate alignments for this
-      # data
-
-      align_task = AlignmentTask(self.global_settings)
+      # Now that we have a dataset we can perform accurate alignments
+      align_task = AlignmentTask(self.settings)
       align_task.CreateJobs()
       align_task.Submit()
       align_task.CheckIfTaskFinished()
 
       # The results of the above alignment step can be converted to a data format
       # needed for further postprocessing.
-
-      post_task = PostprocessingTask(self.global_settings)
+      post_task = PostprocessingTask(self.settings)
       post_task.CreateJobs()
       post_task.Submit()
       post_task.CheckIfTaskFinished()
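
With the settings machinery above, a complete prediction run reduces to two
calls; a hypothetical invocation (the settings file name is made up, and
System.__init__ is assumed to take it, as the parseSettings() call in the
first qpalma_pipeline.py hunk suggests):

    system_obj = System('qpalma.conf')
    system_obj.prediction()    # assumed name of the method in the last hunk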