import array
import cPickle
import os
+import os.path
import pdb
from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+jp = os.path.join
+
illumina_ga_range = (-5,40)
#roche_454_range =
"""
#map_file = settings['map_file']
- map_file = jp(result_dir = self.global_settings['approximation_dir'],'map.vm')
+ map_file = jp(settings['approximation_dir'],'map.vm')
assert os.path.exists(map_file), 'Error: Can not find map file'
dataset = {}
prb_offset = 64
+ prb_offset = 50
# This tuple specifies an interval for valid Illumina Genome Analyzer quality values
if settings['platform'] == 'IGA':
assert not '[' in read_seq and not ']' in read_seq
- # we use an area of +/- `self.half_window_size` nucleotides around the seed position
+ # we use an area of +/- `half_window_size` nucleotides around the seed position,
+ # clamped at the chromosome boundaries
- if pos > self.half_window_size+1:
- us_offset = self.half_window_size
+ if pos > half_window_size+1:
+ us_offset = half_window_size
else:
us_offset = pos - 1
- if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
- ds_offset = self.half_window_size
+ if pos+half_window_size < seqInfo.chromo_sizes[chromo]:
+ ds_offset = half_window_size
else:
ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
# In order to save some space we use a signed char to store the
# qualities. Each quality element can range as follows: -128 <= elem <= 127
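+ # the quality characters in slist[5] are converted to numeric scores by
+ # subtracting prb_offset, the ASCII offset of the quality encoding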
- q_values = map(lambda x: ord(x)-self.prb_offset,slist[5])
+ q_values = map(lambda x: ord(x)-prb_offset,slist[5])
if settings['perform_checks']:
for entry in q_values:
def saveData(prefix,dataset,settings):
"""
"""
- ddir = settings['dataset_dir']
- dataset_fn = jp(ddir,'%s_data.pickle'%prefix)
- dataset_keys_fn = jp(ddir,'%s_data.keys.pickle'%prefix)
- assert not os.path.exists(dataset_fn), 'The data_file already exists!'
- assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+ if prefix == 'prediction':
+ dataset_fn = settings['prediction_dataset_fn']
+ dataset_keys_fn = settings['prediction_dataset_keys_fn']
+ elif prefix == 'training':
+ dataset_fn = settings['training_dataset_fn']
+ dataset_keys_fn = settings['training_dataset_keys_fn']
+ else:
+ assert False, 'Unknown dataset prefix: %s' % prefix
+
+ assert not os.path.exists(dataset_fn), 'The data file already exists!'
+ assert not os.path.exists(dataset_keys_fn), 'The data keys file already exists!'
- # saving new dataset and single keys as well
+ # save the new dataset and, in a separate file, its list of keys
cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
"""
- def __init__(self,global_settings):
+ def __init__(self,settings):
self.sleep_time = 0
# this list stores the cluster/local jobs objects
self.functionJobs = []
# this object stores the configuration
- self.global_settings = global_settings
+ self.settings = settings
def CreateJobs(self):
Create...
"""
- num_splits = self.global_settings['num_splits']
+ num_splits = self.settings['num_splits']
#run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
#param_fname = jp(run_dir,'param_526.pickle')
- param_fname = self.global_settings['prediction_parameter_fn']
+ param_fname = self.settings['prediction_parameter_fn']
#run_fname = jp(run_dir,'run_obj.pickle')
- run_fname = self.global_settings['run_fn']
+ run_fname = self.settings['run_fn']
#result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
- result_dir = self.global_settings['approximation_dir']
+ result_dir = self.settings['approximation_dir']
- original_map_fname = self.global_settings['read_ascii_data_fn']
+ original_map_fname = self.settings['unspliced_reads_fn']
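+ # split the reads file into num_splits parts; one cluster job is created per part below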
split_file(original_map_fname,result_dir,num_splits)
self.result_files = []
result_fname = jp(result_dir,'map.vm.part_%d.heuristic'%idx)
self.result_files.append(result_fname)
- current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.global_settings])
+ current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.settings])
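+ # request 25 GB of memory per job (SGE h_vmem resource limit)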
current_job.h_vmem = '25.0G'
#current_job.express = 'True'
def collectResults(self):
- result_dir = self.global_settings['approximation_dir']
+ result_dir = self.settings['approximation_dir']
combined_fn = jp(result_dir,'map.vm.spliced')
combine_files(self.result_files,combined_fn)
- combine_files([combined_fn,settings[spliced_reads_fn]],'map.vm')
+ combine_files([combined_fn,self.settings['spliced_reads_fn']],'map.vm')
def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname,settings):
class AlignmentTask(ClusterTask):
+ """
+ This task represents the main part of QPalma.
+ """
- def __init__(self):
- ClusterTask.__init__(self)
+ def __init__(self,settings):
+ ClusterTask.__init__(self,settings)
- def CreateJobs():
+ def CreateJobs(self):
"""
- num_splits = self.global_settings['num_splits']
+ num_splits = self.settings['num_splits']
jp = os.path.join
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
- run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
+ run = cPickle.load(open(self.settings['run_fn']))
run['name'] = 'saved_run'
- param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
+ param = self.settings['prediction_parameter_fn']
- run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
- dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
+ run['result_dir'] = self.settings['prediction_dir']
+ dataset_fn = self.settings['prediction_dataset_fn']
+ prediction_keys_fn = self.settings['prediction_dataset_keys_fn']
prediction_keys = cPickle.load(open(prediction_keys_fn))
def __init__(self,settings):
self.genomic_dir = settings['genome_dir']
- self.acc_score_dir = settings['acceptor_scores_dir']
- self.don_score_dir = settings['donor_scores__dir']
+ self.acc_score_dir = settings['acceptor_scores_loc']
+ self.don_score_dir = settings['donor_scores_loc']
self.genomic_fmt = settings['genome_file_fmt']
self.sscore_fmt = settings['splice_score_file_fmt']
- assert os.path.isdir(genomic_data_dir)
- assert os.path.isdir(acc_score_dir)
- assert os.path.isdir(don_score_dir)
-
def get_genomic_fragment_fn(self,id,strand):
if strand == '+':
#print genomicSeq_start,genomicSeq_stop
- assert genomicSeq_start < genomicSeq_stop
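+ # the message expression of an assert is evaluated only on failure, so
+ # pdb.set_trace() drops into the debugger when the coordinates are inconsistent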
+ assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
if strand == '+':
s_start = genomicSeq_start - 1
"""
- This function parse all key value pairs from the given filename
+ This function parses all key/value pairs from the given settings file.
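+
+ Expected format: one `key = value` assignment per line; blank lines and
+ lines starting with '#' are ignored. For example:
+
+ num_splits = 50
+ result_dir = /path/to/result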
"""
- global_settings = {}
+ settings = {}
for line in open(filename):
if (not line.strip()) or line.startswith('#'):
pass
else:
key, val = line.strip().replace(' ', '').split('=')
- global_settings[key] = val
+ settings[key] = val
- return global_settings
+ return settings
-def makeSettings(global_settings):
+def makeSettings(settings):
"""
"""
- assert os.path.exists(global_settings['result_dir']),'Error: You have to specify a existing result directory!'
- result_dir = global_settings['result_dir']
- global_settings['approximation_dir'] = jp(result_dir, 'approximation')
- global_settings['preproc_dir'] = jp(result_dir, 'preprocessing')
- global_settings['postproc_dir'] = jp(result_dir, 'postprocessing')
- global_settings['prediction_dir'] = jp(result_dir, 'prediction')
- global_settings['training_dir'] = jp(result_dir, 'training')
-
- for dir_name in ['approximation_dir', 'preproc_dir', 'postproc_dir',\
+ assert os.path.exists(settings['result_dir']),'Error: You have to specify an existing result directory!'
+ result_dir = settings['result_dir']
+ settings['approximation_dir'] = jp(result_dir, 'approximation')
+ settings['dataset_dir'] = jp(result_dir, 'dataset')
+ settings['preproc_dir'] = jp(result_dir, 'preprocessing')
+ settings['postproc_dir'] = jp(result_dir, 'postprocessing')
+ settings['prediction_dir'] = jp(result_dir, 'prediction')
+ settings['training_dir'] = jp(result_dir, 'training')
+
+ for dir_name in ['approximation_dir','dataset_dir', 'preproc_dir', 'postproc_dir',\
'prediction_dir', 'training_dir']:
try:
- os.mkdir(global_settings[dir_name])
+ os.mkdir(settings[dir_name])
continue
except:
print ('Error: There was a problem generating the subdirectory: %s' % dir_name)
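+ # canonical filenames for the pickled prediction/training datasets and
+ # their key lists (used by saveData)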
+ ddir = settings['dataset_dir']
+ settings['prediction_dataset_fn'] = jp(ddir,'prediction_data.pickle')
+ settings['prediction_dataset_keys_fn'] = jp(ddir,'prediction_data.keys.pickle')
+ settings['training_dataset_fn'] = jp(ddir,'training_data.pickle')
+ settings['training_dataset_keys_fn'] = jp(ddir,'training_data.keys.pickle')
+
try:
- os.mkdir(global_settings['global_log_fn'])
+ os.mkdir(settings['global_log_fn'])
except:
- print 'Error: There was a problem generating the logfile %s' % global_settings['global_log_fn']
+ print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
-
try:
- global_settings['num_splits'] = int(global_settings['num_splits'])
+ settings['num_splits'] = int(settings['num_splits'])
except:
print 'Error: num_splits has to be a positive integer'
- return global_settings
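+ # allowed_fragments is given as a Python list literal in the settings file
+ # (e.g. a list of chromosome/contig ids), hence the eval()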
+ settings['allowed_fragments'] = eval(settings['allowed_fragments'])
+ settings['half_window_size'] = int(settings['half_window_size'])
+
+ return settings
-def checkSettings(global_settings):
- for (key, val,) in global_settings.items():
+def checkSettings(settings):
+ for (key, val,) in settings.items():
if key.endswith('_fn'):
assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
if key.endswith('_dir'):
def parseSettings(filename):
- global_settings = parseSettingsFile(filename)
- global_settings = makeSettings(global_settings)
- assert checkSettings(global_settings),'Check your settings some entries were invalid!'
+ settings = parseSettingsFile(filename)
+ settings = makeSettings(settings)
+ assert checkSettings(settings),'Check your settings: some entries were invalid!'
- return global_settings
+ return settings
from qpalma.gridtools import ApproximationTask,PreprocessingTask
from qpalma.gridtools import AlignmentTask,PostprocessingTask
-from qpalma.DatasetUtils import generateDataset
+from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
from SettingsParser import parseSettings
all parameters.
"""
- self.global_settings = parseSettings(filename)
+ self.settings = parseSettings(filename)
def training(self):
algorithm.
"""
- pre_task = TrainingPreprocessingTask(self.global_settings)
+ pre_task = TrainingPreprocessingTask(self.settings)
pre_task.createJobs()
pre_task.submit()
pre_task.checkIfTaskFinished()
# Before creating a candidate spliced read dataset we have to first filter
# the matches from the first seed finding run.
- approx_task = ApproximationTask(self.global_settings)
- approx_task.CreateJobs()
- approx_task.Submit()
- approx_task.CheckIfTaskFinished()
+ #approx_task = ApproximationTask(self.settings)
+ #approx_task.CreateJobs()
+ #approx_task.Submit()
+ #approx_task.CheckIfTaskFinished()
# After filtering combine the filtered matches from the first run and the
# found matches from the second run to a full dataset
- generateDataset(self.global_settings)
- #pre_task = PreprocessingTask(self.global_settings)
+ generatePredictionDataset(self.settings)
+ #pre_task = PreprocessingTask(self.settings)
#pre_task.CreateJobs()
#pre_task.Submit()
#pre_task.CheckIfTaskFinished()
- sys.exit(0)
-
- # Now that we have a dataset we can perform the accurate alignments for this
- # data
-
- align_task = AlignmentTask(self.global_settings)
+ # Now that we have a dataset we can perform accurate alignments
+ align_task = AlignmentTask(self.settings)
align_task.CreateJobs()
align_task.Submit()
align_task.CheckIfTaskFinished()
# The results of the above alignment step can be converted to a data format
# needed for further postprocessing.
-
- post_task = PostprocessingTask(self.global_settings)
+ post_task = PostprocessingTask(self.settings)
post_task.CreateJobs()
post_task.Submit()
post_task.CheckIfTaskFinished()