+ added configuration variables to main config file
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 13 May 2008 14:53:57 +0000 (14:53 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 13 May 2008 14:53:57 +0000 (14:53 +0000)
+ extended parameter checking script
+ wrote pipeline framework for vmatch and heuristic steps

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8999 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
scripts/QPalmaPipeline.py [new file with mode: 0644]
scripts/check_and_init.py

index d2cf527..69be9c6 100644 (file)
@@ -5,6 +5,13 @@ import numpy.matlib
 import os.path
 import cPickle
 
+#
+# choose a path where all results of the QPalma pipeline will be stored
+#
+
+result_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run'
+
+
 ###############################################################################
 # Load a random but fixed initial parameter vector this makes debugging easier
 ###############################################################################
@@ -146,3 +153,40 @@ assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
 
 extended_alphabet = ['-','a','c','g','t','n','[',']']
 alphabet          = ['-','a','c','g','t','n']
+
+###############################################################################
+#
+# Settings for the VMatch pipeline steps
+#
+###############################################################################
+
+reads_location = ''
+
+#
+# First VMatch step 
+#
+
+mismatches_1      = 2
+end_gap_1         = 0 
+repeat_mapping_1  = 1
+seedlength_1      = 9
+suffixtree_1      = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
+
+#
+# Second VMatch step 
+#
+
+mismatches_2      = 1
+sub_mismatches_2  = 1
+min_short_end_2   = 14
+repeat_mapping_2  = 1
+seedlength_2      = 9
+suffixtree_2      = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
+
+#
+#
+# do not modify anything below this line
+#
+#
+
+conf_object_path = os.path.join(result_dir,'config_object.pickle')
diff --git a/scripts/QPalmaPipeline.py b/scripts/QPalmaPipeline.py
new file mode 100644 (file)
index 0000000..46bb66b
--- /dev/null
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+# first step 
+
+import qpalma.Configuration as Conf
+
+from pipeline.Pipeline import *
+
+import os.path
+import cPickle
+
+from PipelineHeuristic import *
+
+import glob
+import time
+
+
+def get_most_recent_param(current_dir):                                                                                                                                                                                        
+   date_file_list = []
+   for file in glob.glob(current_dir + '/param*'):
+      stats = os.stat(file)
+      lastmod_date = time.localtime(stats[8])
+      date_file_tuple = lastmod_date, file
+      date_file_list.append(date_file_tuple)
+
+   date_file_list.sort()
+   date_file_list.reverse()
+
+   return date_file_list[0][1]
+
+
+class InitStep(PipelineStep):
+
+   def __init__(self):
+
+      result_dir = Conf.result_dir      
+
+
+class FirstStep(PipelineStep):
+   """
+   
+   """
+   
+   def __init__(self):
+      jp = os.path.join
+
+      # a list storing all commands for this particular step
+      cL = []
+
+
+      # fetch the config object which stores all relevant parameters
+      Config = cPickle.load(open(Conf.conf_object_path))
+
+      reads_0_1st = jp(Config['mapping_main_dir'],'reads_0.fl')
+      cmd = 'fix_indels.pl %s > %s' % (Config['reads_location'],reads_0_1st)
+      cL.append(cmd)
+
+      # ATTENTION: Changing working directory
+      os.chdir(Config['mapping_main_dir'])
+
+      cmd = 'map_full_transcript.pl --mismatches=%d --end_gap=%d -no_edit_distance --read_length=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
+      Config['mismatches_1'],\
+      Config['end_gap_1'],\
+      Config['read_size'],\
+      Config['repeat_mapping_1'],\
+      Config['seedlength_1'],\
+      Config['suffixtree_1'])
+
+      cL.append(cmd)
+
+      reads_0_2nd = jp(Config['mapping_spliced_dir'],'reads_0.fl')
+      cmd = 'add_quality_2_fl.pl reads_0.fl reads_3.fl > %s' % reads_0_2nd
+
+      cL.append(cmd)
+
+      # ATTENTION: Changing working directory
+      os.chdir(Config['mapping_spliced_dir'])
+
+      cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
+
+      cL.append(cmd)
+
+      cmd = 'map_spliced_transcript.pl --mismatches=%d --sub_mismatches=%d --read_length=%d --min_short_end=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
+      Config['mismatches_2'],\
+      Config['sub_mismatches_2'],\
+      Config['read_size'],\
+      Config['min_short_end_2'],\
+      Config['repeat_mapping_2'],\
+      Config['seedlength_2'],\
+      Config['suffixtree_2'])
+
+      cL.append(cmd)
+
+      for elem in cL:
+         print elem
+
+      PipelineStep.__init__(self,cL)
+
+
+class SecondStep(PipelineStep):
+   """
+   We can train 
+   """
+
+   pass
+
+
+class ThirdStep(PipelineStep):
+   """
+   We use the QPalma heuristic to predict those reads that should be aligned by
+   the QPalma alignment program even though vmatch found a full match with some
+   mismatches. 
+   """
+   jp = os.path.join
+
+   # fetch the config object which stores all relevant parameters
+   Config = cPickle.load(open(Conf.conf_object_path))
+
+   data_fn  = jp(Config['heuristic_dir'],'map.vm')
+   param_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   param_fn = get_most_recent_param(param_dir)
+   run_fn   = jp(param_dir,'run_obj.pickle')
+
+   result_spliced_fn    = jp(Config['heuristic_dir'],'spliced.heuristic')
+   result_unspliced_fn  = jp(Config['heuristic_dir'],'unspliced.heuristic')
+
+   ph1 = PipelineHeuristic(run_fn,data_fn,param_fn,result_spliced_fn,result_unspliced_fn)
+   ph1.filter()
+
+
+class FourthStep(PipelineStep):
+   """
+   
+   """
+
+if __name__ == '__main__':
+   step1 = FirstStep()
index 393eb22..aed62ad 100644 (file)
@@ -6,6 +6,42 @@ import cPickle
 
 import qpalma.Configuration as Conf
 
+
+def check_vmatch_params(Config):
+   #
+   # Check and set parameters for the first VMatch step 
+   #
+
+   Config['mismatches_1']     = Conf.mismatches_1
+   Config['end_gap_1']        = Conf.end_gap_1
+   Config['read_length_1']    = Conf.read_length_1
+   Config['repeat_mapping_1'] = Conf.repeat_mapping_1
+   Config['seedlength_1']     = Conf.seedlength_1
+   Config['suffixtree_1']     = Conf.suffixtree_1
+
+   assert 0 <= Config['mismatches_1'] <= Config['read_size']
+   assert 0 <= Config['end_gap_1'] <= 5
+   assert 0 <= Config['repeat_mapping_1']  <= 10
+   assert 2 <= Config['seedlength_1'] <= Config['read_size']
+   assert os.path.exists( Config['suffixtree_1'] )
+
+   #
+   # Check and set parameters for the second VMatch step 
+   #
+
+   Config['mismatches_2']     = Conf.mismatches_2
+   Config['sub_mismatches_2'] = Conf.sub_mismatches_2
+   Config['min_short_end_2']  = Conf.min_short_end_2
+   Config['repeat_mapping_2'] = Conf.repeat_mapping_2
+   Config['seedlength_2']     = Conf.seedlength_2
+
+   assert 0 <= Config['mismatches_2'] <= Config['read_size']
+   assert 0 <= Config['sub_mismatches_2'] <= Config['read_size']
+   assert 0 <= Config['min_short_end_2'] <= Config['read_size']
+   assert 0 <= Config['repeat_mapping_2'] <= Config['read_size']
+   assert os.path.exists( Config['suffixtree_2'] )
+
+
 def check_and_init():
    """
    The purpose of this script is to take all the global variables from the
@@ -27,30 +63,13 @@ def check_and_init():
    Config['result_dir'] = result_dir
 
    Config['reads_location'] = Conf.reads_location
-   assert os.path.exists(Config['reads_location'])
-
-   #
-   # Check and set parameter for the first VMatch step 
-   #
-
-   Config['mismatches_1']     = Conf.mismatches_1
-   Config['end_gap_1']        = Conf.end_gap_1
-   Config['read_length_1']    = Conf.read_length_1
-   Config['repeat_mapping_1'] = Conf.repeat_mapping_1
-   Config['seedlength_1']     = Conf.seedlength_1
-   Config['suffixtree_1']     = Conf.suffixtree_1
+   #assert os.path.exists(Config['reads_location'])
 
-   #
-   # Check and set parameter for the second VMatch step 
-   #
+   Config['read_size']     = Conf.read_size
+   assert 2 <= Config['read_size'] <= 100
 
-   Config['mismatches_2']     = Conf.mismatches_2
-   Config['sub_mismatches_2'] = Conf.sub_mismatches_2
-   Config['read_length_2']    = Conf.read_length_2
-   Config['min_short_end_2']  = Conf.min_short_end_2
-   Config['repeat_mapping_2'] = Conf.repeat_mapping_2
-   Config['seedlength_2']     = Conf.seedlength_2
-   Config['suffixtree_2']     = Conf.suffixtree_2
+   # Check VMatch related parameters
+   check_vmatch_params(Config)
 
    subdirs = []
 
@@ -71,14 +90,15 @@ def check_and_init():
 
    subdirs.extend([Config['remapping_dir']])
    
-   try:
-      for current_dir in subdirs:
-         os.mkdir(current_dir)
-   except:
-      print 'Error during initialization of project directories'
+   #try:
+   #   for current_dir in subdirs:
+   #      os.mkdir(current_dir)
+   #except:
+   #   print 'Error during initialization of project directories'
 
    # store the configuration in a pickled python dictionary
    cPickle.dump(Config,open(Conf.conf_object_path,'w+'))
 
+
 if __name__ == '__main__':
    check_and_init()