+ minor modifications: pass a single settings dict to the constructors, add a predict/train mode switch, and write pipeline progress to a global logfile
\QP is an alignment tool targeted at aligning spliced reads produced by ``Next
Generation'' sequencing platforms such as the \emph{Illumina Genome Analyzer}
or \emph{454}. The basic idea is to use an extended Smith-Waterman algorithm for local
-alignments that uses the base quality information of the reads directly during
-the alignment step. Optimal alignment parameters i.e. scoring matrices are
-inferred using a machine learning technique similar to \emph{Support Vector
-Machines}. For further details on \QP itself consult the paper \cite{DeBona08}.
-For details about the learning method \cite{Tsochantaridis04}.
+alignments that uses the base quality information of the reads directly in the
+alignment step. Optimal alignment parameters, i.e.\ the scoring matrices, are
+inferred using a machine learning technique similar to \emph{Support Vector
+Machines}. For further details on \QP itself consult the paper \cite{DeBona08};
+for details about the learning method, see \cite{Tsochantaridis04}.
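
To make the quality-aware scoring concrete, consider the following sketch of
the recurrence (a simplified form, not the exact parametrization of
\cite{DeBona08}). Writing $r$ for the read, $g$ for the genomic sequence,
$q_i$ for the quality of read position $i$ and $d$ for a gap penalty, the
local alignment matrix becomes
\[
S_{i,j} = \max\bigl\{\,0,\; S_{i-1,j-1} + f_{(r_i,g_j)}(q_i),\; S_{i-1,j} - d,\; S_{i,j-1} - d\,\bigr\}
\]
where $f_{(a,b)}$ is one learned scoring function per substitution pair, so
that a mismatch at a low-quality base can be penalized less than one at a
high-quality base. The shapes of the $f_{(a,b)}$ are exactly the parameters
inferred during training.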
%$SOLid$.
%We refer to the whole pipeline as the \QP pipeline and \QP respectively.
and the scores once for all chromosomes.
"""
- def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
- accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
- self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
+ def __init__(self,settings):
+ chromo_list = settings['allowed_fragments']
+ accessWrapper = DataAccessWrapper(settings)
+ self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
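
Passing a single settings dictionary instead of six positional arguments keeps
every call site in sync when a new data path is added. The resulting usage
looks roughly as follows; this assumes the patched constructor above belongs
to the LookupTable class instantiated later in this patch, and all example
values are hypothetical ('allowed_fragments' is the only key confirmed here):

# hypothetical example values; only 'allowed_fragments' is confirmed by this patch
settings = {
    'allowed_fragments': [1, 2],         # chromosomes/fragments to prefetch
    'global_log_fn': '/tmp/qpalma.log',
}

lt = LookupTable(settings)  # builds DataAccessWrapper and SeqSpliceInfo itself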
except:
print ('Error: There was a problem generating the subdirectory: %s' % dir_name)
- assert checkSettings(settings),'Check your settings some entries were invalid!'
+ #assert checkSettings(settings),'Check your settings some entries were invalid!'
ddir = settings['dataset_dir']
settings['prediction_dataset_fn'] = jp(ddir,'prediction_data.pickle')
settings['training_dataset_keys_fn'] = jp(ddir,'training_data.keys.pickle')
try:
- os.mkdir(settings['global_log_fn'])
+ os.mknod(settings['global_log_fn'])
except:
print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
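
A side note on the mkdir-to-mknod change above: os.mknod with its default mode
creates an empty regular file, which is what a fresh logfile needs, but it is
Unix-only; open(settings['global_log_fn'],'w').close() would achieve the same
effect portably.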
def checkSettings(settings):
for (key, val,) in settings.items():
- if key.endswith('_fn'):
+ if key.endswith('_fn') or key.endswith('_dir'):
assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
- if key.endswith('_dir'):
- assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
-
return True
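
checkSettings stops at the first failing assert, so a broken configuration is
fixed one entry per run. A small companion sketch, not part of the patch, that
reports every bad path at once using the same '_fn'/'_dir' key convention:

import os

def check_settings_report(settings):
    """Hypothetical helper: return all (key, value) path entries that
    do not exist, following the same key convention as checkSettings."""
    missing = []
    for key, val in settings.items():
        if key.endswith('_fn') or key.endswith('_dir'):
            if not os.path.exists(val):
                missing.append((key, val))
    return missing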
def parseSettings(filename):
settings = parseSettingsFile(filename)
settings = makeSettings(settings)
+ #assert checkSettings(settings)
+def logwrite(mes,settings):
+ """
+ A wrapper to write a message to the global logfile.
+ """
+ fh = open(settings['global_log_fn'],'a')
+ fh.write(mes)
+ fh.close()
+
+
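
Note that logwrite appends the message verbatim, which is why the calls later
in this patch carry their own trailing newline. If stage timings ever need to
be read off the logfile, a timestamped variant is a natural extension (a
sketch, not part of the patch):

import time

def logwrite_ts(mes, settings):
    """Hypothetical variant of logwrite that prefixes each message
    with the wall-clock time."""
    fh = open(settings['global_log_fn'], 'a')
    fh.write('[%s] %s' % (time.asctime(), mes))
    fh.close()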
if __name__ == '__main__':
split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
start = cpu()
- self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+ self.lt1 = LookupTable(settings)
stop = cpu()
print 'prefetched sequence and splice data in %f sec' % (stop-start)
print 'Starting filtering...'
_start = cpu()
- #for readId,currentReadLocations in all_remapped_reads.items():
- #for location in currentReadLocations[:1]:
-
for location,original_line in self.all_remapped_reads:
if ctr % 1000 == 0:
# handle matches that map to the reverse strand
if strand == '-':
- #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
unb_seq = reverse_complement(unb_seq)
seq = reverse_complement(seq)
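
On the minus strand both the original and the unambiguous sequence are
reverse-complemented. For reference, a minimal sketch of such a helper;
QPalma's actual reverse_complement may differ, e.g. in how it treats IUPAC
ambiguity codes:

_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
               'a': 't', 't': 'a', 'c': 'g', 'g': 'c', 'n': 'n'}

def reverse_complement(seq):
    """Return the reverse complement of a DNA string, preserving case."""
    return ''.join([_COMPLEMENT[base] for base in reversed(seq)])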
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
for line in open(result_fn):
sl = line.split()
from qpalma.SettingsParser import parseSettings
+from qpalma.utils import logwrite
-Errormsg = """Usage is: python qpalma_pipeline.py <config filename>"""
+
+Errormsg = """Usage is: python qpalma_pipeline.py predict|train <config filename>"""
"""
self.settings = parseSettings(filename)
"""
self.settings = parseSettings(filename)
+ logwrite('Parsed settings; system set up.\n',self.settings)
converts the data to the right format needed by QPalma for the training
algorithm.
"""
+ logwrite('Beginning of training.\n',self.settings)
#
pre_task = TrainingPreprocessingTask(self.settings)
train_task.Submit()
train_task.CheckIfTaskFinished()
+ logwrite('End of training.\n',self.settings)
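
Each pipeline stage follows the same submit-then-poll protocol: Submit()
dispatches the work and CheckIfTaskFinished() blocks until it completes. A toy
stand-in for that interface, for illustration only; the real task classes
(TrainingPreprocessingTask and friends) are defined elsewhere in QPalma:

class ToyTask(object):
    """Illustrative stand-in for the pipeline task interface."""

    def __init__(self, settings):
        self.settings = settings
        self.finished = False

    def Submit(self):
        # a real task would dispatch jobs to the compute cluster here
        self.finished = True

    def CheckIfTaskFinished(self):
        # a real task would poll the cluster until all jobs are done
        assert self.finished, 'Submit() must be called first'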
def prediction(self):
"""
generate several output files containing the spliced alignments
"""
+ logwrite('Beginning of prediction.\n',self.settings)
+
# Before creating a candidate spliced read dataset we have to first filter
# the matches from the first seed finding run.
post_task.Submit()
post_task.CheckIfTaskFinished()
+ logwrite('End of prediction.\n',self.settings)
if __name__ == '__main__':
+ assert len(sys.argv) == 3, Errormsg
+ mode = sys.argv[1]
+ assert mode in ['predict','train'], Errormsg
+ filename = sys.argv[2]
assert os.path.exists(filename), Errormsg
+
+ # creating system object
system_obj = System(filename)
- system_obj.prediction()
+ if mode == 'predict':
+ system_obj.prediction()
+ elif mode == 'train':
+ system_obj.training()
+ else:
+ assert False
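
With the mode switch in place the pipeline is started with two arguments, e.g.
python qpalma_pipeline.py train my_settings.conf to run training and
python qpalma_pipeline.py predict my_settings.conf to produce the spliced
alignments, where my_settings.conf stands for whatever configuration file
parseSettings expects.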