import os.path
import cPickle
+#
+# choose a path where all results of the QPalma pipeline will be stored
+#
+
+result_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run'
+
+
###############################################################################
# Load a random but fixed initial parameter vector this makes debugging easier
###############################################################################
extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']
+
+###############################################################################
+#
+# Settings for the VMatch pipeline steps
+#
+###############################################################################
+
+reads_location = ''
+
+#
+# First VMatch step
+#
+
+mismatches_1 = 2
+end_gap_1 = 0
+repeat_mapping_1 = 1
+seedlength_1 = 9
+suffixtree_1 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
+
+#
+# Second VMatch step
+#
+
+mismatches_2 = 1
+sub_mismatches_2 = 1
+min_short_end_2 = 14
+repeat_mapping_2 = 1
+seedlength_2 = 9
+suffixtree_2 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
+
+#
+#
+# do not modify anything below this line
+#
+#
+
+conf_object_path = os.path.join(result_dir,'config_object.pickle')
--- /dev/null
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+# first step
+
+import qpalma.Configuration as Conf
+
+from pipeline.Pipeline import *
+
+import os.path
+import cPickle
+
+from PipelineHeuristic import *
+
+import glob
+import time
+
+
+def get_most_recent_param(current_dir):
+ date_file_list = []
+ for file in glob.glob(current_dir + '/param*'):
+ stats = os.stat(file)
+ lastmod_date = time.localtime(stats[8])
+ date_file_tuple = lastmod_date, file
+ date_file_list.append(date_file_tuple)
+
+ date_file_list.sort()
+ date_file_list.reverse()
+
+ return date_file_list[0][1]
+
+
+class InitStep(PipelineStep):
+
+ def __init__(self):
+
+ result_dir = Conf.result_dir
+
+
+class FirstStep(PipelineStep):
+ """
+
+ """
+
+ def __init__(self):
+ jp = os.path.join
+
+ # a list storing all commands for this particular step
+ cL = []
+
+
+ # fetch the config object which stores all relevant parameters
+ Config = cPickle.load(open(Conf.conf_object_path))
+
+ reads_0_1st = jp(Config['mapping_main_dir'],'reads_0.fl')
+ cmd = 'fix_indels.pl %s > %s' % (Config['reads_location'],reads_0_1st)
+ cL.append(cmd)
+
+ # ATTENTION: Changing working directory
+ os.chdir(Config['mapping_main_dir'])
+
+ cmd = 'map_full_transcript.pl --mismatches=%d --end_gap=%d -no_edit_distance --read_length=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
+ Config['mismatches_1'],\
+ Config['end_gap_1'],\
+ Config['read_size'],\
+ Config['repeat_mapping_1'],\
+ Config['seedlength_1'],\
+ Config['suffixtree_1'])
+
+ cL.append(cmd)
+
+ reads_0_2nd = jp(Config['mapping_spliced_dir'],'reads_0.fl')
+ cmd = 'add_quality_2_fl.pl reads_0.fl reads_3.fl > %s' % reads_0_2nd
+
+ cL.append(cmd)
+
+ # ATTENTION: Changing working directory
+ os.chdir(Config['mapping_spliced_dir'])
+
+ cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
+
+ cL.append(cmd)
+
+ cmd = 'map_spliced_transcript.pl --mismatches=%d --sub_mismatches=%d --read_length=%d --min_short_end=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
+ Config['mismatches_2'],\
+ Config['sub_mismatches_2'],\
+ Config['read_size'],\
+ Config['min_short_end_2'],\
+ Config['repeat_mapping_2'],\
+ Config['seedlength_2'],\
+ Config['suffixtree_2'])
+
+ cL.append(cmd)
+
+ for elem in cL:
+ print elem
+
+ PipelineStep.__init__(self,cL)
+
+
+class SecondStep(PipelineStep):
+ """
+ We can train
+ """
+
+ pass
+
+
+class ThirdStep(PipelineStep):
+ """
+ We use the QPalma heuristic to predict those reads that should be aligned by
+ the QPalma alignment program even though vmatch found a full match with some
+ mismatches.
+ """
+ jp = os.path.join
+
+ # fetch the config object which stores all relevant parameters
+ Config = cPickle.load(open(Conf.conf_object_path))
+
+ data_fn = jp(Config['heuristic_dir'],'map.vm')
+ param_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+ param_fn = get_most_recent_param(param_dir)
+ run_fn = jp(param_dir,'run_obj.pickle')
+
+ result_spliced_fn = jp(Config['heuristic_dir'],'spliced.heuristic')
+ result_unspliced_fn = jp(Config['heuristic_dir'],'unspliced.heuristic')
+
+ ph1 = PipelineHeuristic(run_fn,data_fn,param_fn,result_spliced_fn,result_unspliced_fn)
+ ph1.filter()
+
+
+class FourthStep(PipelineStep):
+ """
+
+ """
+
+if __name__ == '__main__':
+ step1 = FirstStep()
import qpalma.Configuration as Conf
+
+def check_vmatch_params(Config):
+ #
+ # Check and set parameters for the first VMatch step
+ #
+
+ Config['mismatches_1'] = Conf.mismatches_1
+ Config['end_gap_1'] = Conf.end_gap_1
+ Config['read_length_1'] = Conf.read_length_1
+ Config['repeat_mapping_1'] = Conf.repeat_mapping_1
+ Config['seedlength_1'] = Conf.seedlength_1
+ Config['suffixtree_1'] = Conf.suffixtree_1
+
+ assert 0 <= Config['mismatches_1'] <= Config['read_size']
+ assert 0 <= Config['end_gap_1'] <= 5
+ assert 0 <= Config['repeat_mapping_1'] <= 10
+ assert 2 <= Config['seedlength_1'] <= Config['read_size']
+ assert os.path.exists( Config['suffixtree_1'] )
+
+ #
+ # Check and set parameters for the second VMatch step
+ #
+
+ Config['mismatches_2'] = Conf.mismatches_2
+ Config['sub_mismatches_2'] = Conf.sub_mismatches_2
+ Config['min_short_end_2'] = Conf.min_short_end_2
+ Config['repeat_mapping_2'] = Conf.repeat_mapping_2
+ Config['seedlength_2'] = Conf.seedlength_2
+
+ assert 0 <= Config['mismatches_2'] <= Config['read_size']
+ assert 0 <= Config['sub_mismatches_2'] <= Config['read_size']
+ assert 0 <= Config['min_short_end_2'] <= Config['read_size']
+ assert 0 <= Config['repeat_mapping_2'] <= Config['read_size']
+ assert os.path.exists( Config['suffixtree_2'] )
+
+
def check_and_init():
"""
The purpose of this script is to take all the global variables from the
Config['result_dir'] = result_dir
Config['reads_location'] = Conf.reads_location
- assert os.path.exists(Config['reads_location'])
-
- #
- # Check and set parameter for the first VMatch step
- #
-
- Config['mismatches_1'] = Conf.mismatches_1
- Config['end_gap_1'] = Conf.end_gap_1
- Config['read_length_1'] = Conf.read_length_1
- Config['repeat_mapping_1'] = Conf.repeat_mapping_1
- Config['seedlength_1'] = Conf.seedlength_1
- Config['suffixtree_1'] = Conf.suffixtree_1
+ #assert os.path.exists(Config['reads_location'])
- #
- # Check and set parameter for the second VMatch step
- #
+ Config['read_size'] = Conf.read_size
+ assert 2 <= Config['read_size'] <= 100
- Config['mismatches_2'] = Conf.mismatches_2
- Config['sub_mismatches_2'] = Conf.sub_mismatches_2
- Config['read_length_2'] = Conf.read_length_2
- Config['min_short_end_2'] = Conf.min_short_end_2
- Config['repeat_mapping_2'] = Conf.repeat_mapping_2
- Config['seedlength_2'] = Conf.seedlength_2
- Config['suffixtree_2'] = Conf.suffixtree_2
+ # Check VMatch related parameters
+ check_vmatch_params(Config)
subdirs = []
subdirs.extend([Config['remapping_dir']])
- try:
- for current_dir in subdirs:
- os.mkdir(current_dir)
- except:
- print 'Error during initialization of project directories'
+ #try:
+ # for current_dir in subdirs:
+ # os.mkdir(current_dir)
+ #except:
+ # print 'Error during initialization of project directories'
# store the configuration in a pickled python dictionary
cPickle.dump(Config,open(Conf.conf_object_path,'w+'))
+
if __name__ == '__main__':
check_and_init()