\QP is an alignment tool targeted to align spliced reads produced by ``Next
Generation'' sequencing platforms such as $Illumina Genome Analyzer$ or $454$.
The basic idea is to use an extended Smith-Waterman algorithm for local
-alignments that uses the base quality information of the reads directly during
-the alignment step. Optimal alignment parameters i.e. scoring matrices are
-inferred using a machine learning technique similar to \emph{Support Vector
-Machines}. For further details on \QP itself consult the paper \cite{DeBona08}.
-For details about the learning method \cite{Tsochantaridis04}.
+alignments that uses the base quality information of the reads directly in the
+alignment step. Optimal alignment parameters, i.e. scoring matrices, are inferred
+using a machine learning technique similar to \emph{Support Vector Machines}.
+For further details on \QP itself consult the paper \cite{DeBona08}. For
+details about the learning method see \cite{Tsochantaridis04}.
%$SOLid$.
%We refer to the whole pipeline as the \QP pipeline and \QP respectively.
and the scores once for all chromosomes.
"""
- def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
- accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
- self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
+ def __init__(self,settings):
+ chromo_list = settings['allowed_fragments']
+ accessWrapper = DataAccessWrapper(settings)
+ self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
self.strands = ['+','-']
except:
print ('Error: There was a problem generating the subdirectory: %s' % dir_name)
- assert checkSettings(settings),'Check your settings some entries were invalid!'
+ #assert checkSettings(settings),'Check your settings some entries were invalid!'
ddir = settings['dataset_dir']
settings['prediction_dataset_fn'] = jp(ddir,'prediction_data.pickle')
settings['training_dataset_keys_fn'] = jp(ddir,'training_data.keys.pickle')
try:
- os.mkdir(settings['global_log_fn'])
+ os.mknod(settings['global_log_fn'])
except:
print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
def checkSettings(settings):
for (key, val,) in settings.items():
- if key.endswith('_fn'):
+ if key.endswith('_fn') or key.endswith('_dir'):
assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
- if key.endswith('_dir'):
- assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
-
return True
def parseSettings(filename):
settings = parseSettingsFile(filename)
settings = makeSettings(settings)
+ #assert checkSettings(settings)
return settings
return all_instances
+def logwrite(mes,settings):
+ """
+ A wrapper to write message to the global logfile.
+ """
+ fh = open(settings['global_log_fn'],'a')
+ fh.write(mes)
+
+
if __name__ == '__main__':
split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
start = cpu()
- self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+ self.lt1 = LookupTable(settings)
stop = cpu()
print 'prefetched sequence and splice data in %f sec' % (stop-start)
print 'Starting filtering...'
_start = cpu()
- #for readId,currentReadLocations in all_remapped_reads.items():
- #for location in currentReadLocations[:1]:
-
for location,original_line in self.all_remapped_reads:
if ctr % 1000 == 0:
# forgot to do this
if strand == '-':
- #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
unb_seq = reverse_complement(unb_seq)
seq = reverse_complement(seq)
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
for line in open(result_fn):
sl = line.split()
from qpalma.SettingsParser import parseSettings
+from qpalma.utils import logwrite
-Errormsg = """Usage is: python qpalma_pipeline.py <config filename>"""
+
+Errormsg = """Usage is: python qpalma_pipeline.py predict|train <config filename>"""
class System:
"""
self.settings = parseSettings(filename)
+ logwrite('Parsed settings system set up.',self.settings)
def training(self):
converts the data to the right format needed by QPalma for the training
algorithm.
"""
+ logwrite('Begin of training.\n',self.settings)
#
pre_task = TrainingPreprocessingTask(self.settings)
train_task.Submit()
train_task.CheckIfTaskFinished()
+ logwrite('End of training.\n',self.settings)
def prediction(self):
"""
generate several output files containing the spliced alignments
"""
+ logwrite('Begin of prediction.\n',self.settings)
+
# Before creating a candidate spliced read dataset we have to first filter
# the matches from the first seed finding run.
post_task.Submit()
post_task.CheckIfTaskFinished()
- print "Success!"
+ logwrite('End of prediction.\n',self.settings)
if __name__ == '__main__':
- filename = sys.argv[1]
+ mode = sys.argv[1]
+ assert mode in ['predict','train'], Errormsg
+ filename = sys.argv[2]
assert os.path.exists(filename), Errormsg
+
+ # creating system object
system_obj = System(filename)
- system_obj.prediction()
- #system_obj.training()
+ if mode == 'predict':
+ system_obj.prediction()
+ elif mode == 'train':
+ system_obj.training()
+ else:
+ assert False