+ update makefiles to automatically fetch valid Python includes and libs
[qpalma.git] / scripts / qpalma_pipeline.py
index 42e8c38..d0efc0c 100644
@@ -18,100 +18,29 @@ import os.path
 import pdb
 import sys
 
-from optparse import OptionParser
-
 from qpalma.gridtools import ApproximationTask,PreprocessingTask
 from qpalma.gridtools import AlignmentTask,PostprocessingTask
+from qpalma.gridtools import TrainingTask
 
+from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
 
-Errormsg = """Usage is: python qpalma_pipeline.py <config filename>"""
-
-
-"""
-def create_option_parser():
-   parser = OptionParser()
-
-   #  
-   parser.add_option("-ci", "--check_and_init", help="check configuration and initialize directories")
-
-   #
-   parser.add_option("-r", "--run", help="write report to FILE", metavar="FILE")
-
-   #
-   parser.add_option("-xx", "--clear", action="store_false", dest="verbose", help="cleanup directories delete all created data")
-
-   return parser
-"""
-
-jp = os.path.join
-
-def parseSettings(filename):
-   """
-   """
-
-   #global_settings = {\
-   #'result_dir':'/fml/ag-raetsch/...',\
-   #'read_ascii_data_fn':'/fml/ag-raetsch/...',\
-   #'num_splits':50
-   #'global_log_fn':'~/qpalma.log'
-   #}
-
-   global_settings = {}
-
-   for line in open(filename):
-      if not line.strip() or line.startswith('#'):
-         continue
-
-      key,val = line.strip().replace(' ','').split('=')
-      global_settings[key] = val
-
-   return global_settings
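The removed parser above accepted a flat key=value format: blank lines and '#' comments are skipped, all spaces are stripped, and each remaining line is split on '='. A minimal settings file it would have accepted might look like the following; the keys mirror the commented-out defaults above, and every path is a placeholder:

# QPalma pipeline settings (illustrative values only)
result_dir         = /fml/ag-raetsch/qpalma_run
read_ascii_data_fn = /fml/ag-raetsch/reads.txt
num_splits         = 50
global_log_fn      = /fml/ag-raetsch/qpalma.log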
-
-
-def makeSettings(global_settings):
-   """
-   
-   """
-
-   # first check wether the top level result directory exists
-   assert os.path.exists(global_settings['result_dir']), 'Error: You have to specify a existing result directory!'
-
-   result_dir = global_settings['result_dir']
-
-   # now create some subdirectories needed for the different steps performed by QPalma 
-   global_settings['approximation_dir'] = jp(result_dir,'approximation')
-   global_settings['preproc_dir']       = jp(result_dir,'preprocessing')
-   global_settings['postproc_dir']      = jp(result_dir,'postprocessing')
-   global_settings['prediction_dir']    = jp(result_dir,'prediction')
-   global_settings['training_dir']      = jp(result_dir,'training')
+from qpalma.SettingsParser import parseSettings
 
-   for dir_name in ['approximation_dir', 'preproc_dir', 'postproc_dir', 'prediction_dir', 'training_dir']:
-      try:
-         os.mkdir(global_settings[dir_name])
-      except:
-         print 'Error: There was a problem generating the subdirectory: %s' % dir_name
+from qpalma.utils import logwrite
 
-   try:
-      os.mkdir(global_settings['global_log_fn'])
-   except:
-      print 'Error: There was a problem generating the logfile %s' % global_settings['global_log_fn']
-
-   return global_settings
-
-
-def checkSettings(global_settings):
-   for key,val in global_settings.items():
-      if key.endswith('_fn'):
-         assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
-
-
-      if key.endswith('_dir'):
-         assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
-   
-
-   return True
+Errormsg = """Usage is:
+
+   python qpalma_pipeline.py train <config filename> <training data filename>
+or
+   python qpalma_pipeline.py predict <config filename> <parameter filename> <putative unspliced reads filename> <putative spliced reads filename>
+"""
 
 
+def printMessage(mes):
+   # print a banner-framed progress message to stdout
+   print '#'*80
+   print '\t\t\t%s...\n' % mes
+   print '#'*80
 
 
 class System:
@@ -132,76 +61,113 @@ class System:
       all parameters.
       """
 
-      #parser = create_option_parser()
-      #(options, args) = parser.parse_args()
-
-      global_settings = parseSettings(filename)
-      global_settings = makeSettings(global_settings)
-      assert checkSettings(global_settings), 'Check your settings some entries were invalid!'
-
-      self.global_settings = global_settings
+      self.settings = parseSettings(filename)
+      logwrite('Parsed settings; system set up.',self.settings)
 
-      pdb.set_trace()
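logwrite is imported from qpalma.utils, whose implementation is not part of this diff. A minimal sketch of such a helper, assuming the settings dictionary carries a 'global_log_fn' entry as in the old commented-out defaults (both the key and the timestamp format are assumptions):

import time

def logwrite(mes, settings):
   # append a timestamped message to the pipeline's global log file;
   # the 'global_log_fn' key is an assumption, not taken from this diff
   fh = open(settings['global_log_fn'], 'a')
   fh.write('%s %s\n' % (time.strftime('%Y-%m-%d %H:%M:%S'), mes))
   fh.close()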
 
-   def training(self):
+   def training(self, training_data_fn):
       """
      This function is responsible for the whole training process. It first
      converts the data into the format required by QPalma's training
      algorithm.
       """
+      logwrite('Beginning of training.\n',self.settings)
 
-      pre_task = TrainingPreprocessingTask(self.global_settings)
-      pre_task.createJobs()
-      pre_task.submit() 
-      while pre_task.checkIfTaskFinished() == False:
-         sleep(20)
-      
+      printMessage('Starting dataset generation')
+
+      self.settings['training_reads_fn'] = training_data_fn
+
+      # Collect the data and create a pickled training set
+      generateTrainingDataset(self.settings)
 
-   def prediction(self):
+      printMessage('Starting training')
+
+      # Now that we have a dataset we can perform training
+      train_task = TrainingTask(self.settings)
+      train_task.CreateJobs()
+      train_task.Submit()
+      train_task.CheckIfTaskFinished()
+
+      logwrite('End of training.\n',self.settings)
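Every grid task used in this script follows the same three-step protocol: CreateJobs splits the work, Submit hands the jobs to the cluster, and CheckIfTaskFinished blocks until they are done. A self-contained sketch of that lifecycle with a local stand-in for real grid jobs (the classes and the job model are hypothetical, not the actual qpalma.gridtools code; the polling idea mirrors the removed sleep(20) loop):

import time

class _LocalJob:
   # stand-in for a real grid job: runs a callable and remembers the result
   def __init__(self, func, arg):
      self.func, self.arg, self.done = func, arg, False

   def submit(self):
      self.result = self.func(self.arg)
      self.done = True

   def finished(self):
      return self.done

class ExampleTask:
   # hypothetical illustration of the CreateJobs/Submit/CheckIfTaskFinished
   # protocol shared by ApproximationTask, AlignmentTask, TrainingTask, ...
   def __init__(self, settings):
      self.settings = settings
      self.jobs = []

   def CreateJobs(self):
      # split the work into independent jobs, here one per input chunk
      self.jobs = [_LocalJob(sum, chunk) for chunk in self.settings['chunks']]

   def Submit(self):
      for job in self.jobs:
         job.submit()

   def CheckIfTaskFinished(self):
      # poll until every job reports completion
      while not all(j.finished() for j in self.jobs):
         time.sleep(1)

task = ExampleTask({'chunks': [[1, 2], [3, 4]]})
task.CreateJobs()
task.Submit()
task.CheckIfTaskFinished()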
+
+
+   def prediction(self, param_fn, unspliced_reads_fn, spliced_reads_fn):
       """
       This function encapsulates all steps needed to perform a prediction. Given
      the parameters from training and the paths to a prediction set, it will
      generate several output files containing the spliced alignments.
       """
 
+      logwrite('Beginning of prediction.\n',self.settings)
+
+      printMessage('Starting approximation')
+
+      self.settings['prediction_param_fn'] = param_fn
+      self.settings['unspliced_reads_fn']  = unspliced_reads_fn
+      self.settings['spliced_reads_fn']    = spliced_reads_fn
+
      # Before creating a candidate spliced read dataset we first have to filter
      # the matches from the first seed finding run.
 
-      approx_task = ApproximationTask(self.global_settings)
-      approx_task.createJobs()
-      approx_task.submit()
-      approx_task.checkIfTaskFinished()
+      approx_task = ApproximationTask(self.settings)
+      approx_task.CreateJobs()
+      approx_task.Submit()
+      approx_task.CheckIfTaskFinished()
       
       # After filtering combine the filtered matches from the first run and the
       # found matches from the second run to a full dataset
 
-      pre_task = PreprocessingTask(self.global_settings)
-      pre_task.createJobs()
-      pre_task.submit() 
-      pre_task.checkIfTaskFinished()
+      printMessage('Starting dataset generation')
+
+      generatePredictionDataset(self.settings)
+
+      printMessage('Starting alignments')
 
-      # Now that we have a dataset we can perform the accurate alignments for this
-      # data
+      # Now that we have a dataset we can perform accurate alignments
+      align_task = AlignmentTask(self.settings)
+      align_task.CreateJobs()
+      align_task.Submit()
+      align_task.CheckIfTaskFinished()
 
-      align_task = AlignmentTask(self.global_settings)
-      align_task.createJobs()
-      align_task.submit()
-      align_task.checkIfTaskFinished()
+      printMessage('Postprocessing')
 
       # The results of the above alignment step can be converted to a data format
       # needed for further postprocessing.
+      post_task = PostprocessingTask(self.settings)
+      post_task.CreateJobs()
+      post_task.Submit()
+      post_task.CheckIfTaskFinished()
 
-      post_task = PostprocessingTask(self.global_settings)
-      post_task.createJobs()
-      post_task.submit()
-      post_task.checkIfTaskFinished()
-
-      print "Success!"
+      logwrite('End of prediction.\n',self.settings)
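The same prediction run can be driven from Python instead of the command line handling in the __main__ block below; all filenames here are placeholders:

system_obj = System('config.txt')
system_obj.prediction('qpalma_params.txt', 'unspliced_reads.txt', 'spliced_reads.txt')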
    
 
 if __name__ == '__main__':
-   filename = sys.argv[1]
-   assert os.path.exists(filename), Errormsg
+   if len(sys.argv) != 4 and len(sys.argv) != 6:
+      print 'Invalid number of arguments (%d)!' % len(sys.argv)
+      print Errormsg
+      sys.exit(1)
+
+   mode     = sys.argv[1]
+   if mode not in ['predict','train']:
+      print Errormsg
+      sys.exit(1)
+
+   filename = sys.argv[2]
+   if not os.path.exists(filename):
+      print 'Error: Config file %s does not exist!' % filename
+      print Errormsg
+      sys.exit(1)
+
+   # create the System object (parses and checks the settings)
    system_obj = System(filename)
-   #system_obj.prediction()
-   #system_obj.training()
+
+   if mode == 'predict':
+      param_fn           = sys.argv[3]
+      unspliced_reads_fn = sys.argv[4]
+      spliced_reads_fn   = sys.argv[5]
+      system_obj.prediction(param_fn,unspliced_reads_fn, spliced_reads_fn)
+   elif mode == 'train':
+      training_data_fn   = sys.argv[3]
+      system_obj.training(training_data_fn)
+   else:
+      assert False
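Concrete invocations matching the usage string above (every filename is a placeholder):

python qpalma_pipeline.py train config.txt training_reads.txt
python qpalma_pipeline.py predict config.txt qpalma_params.txt unspliced_reads.txt spliced_reads.txt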