From: Fabio
Date: Mon, 22 Sep 2008 11:01:56 +0000 (+0200)
Subject: + added some documentary text
X-Git-Url: http://git.tuebingen.mpg.de/?p=qpalma.git;a=commitdiff_plain;h=2d04235176ddbcea68d6508fb45c1e64ef335097

+ added some documentary text
+ some minor modifications
---

diff --git a/doc/qpalma.tex b/doc/qpalma.tex
index 35f87df..b45cc64 100644
--- a/doc/qpalma.tex
+++ b/doc/qpalma.tex
@@ -21,11 +21,11 @@
 \QP is an alignment tool targeted to align spliced reads produced by ``Next
 Generation'' sequencing platforms such as $Illumina Genome Analyzer$ or $454$.
 The basic idea is to use an extended Smith-Waterman algorithm for local
-alignments that uses the base quality information of the reads directly during
-the alignment step. Optimal alignment parameters i.e. scoring matrices are
-inferred using a machine learning technique similar to \emph{Support Vector
-Machines}. For further details on \QP itself consult the paper \cite{DeBona08}.
-For details about the learning method \cite{Tsochantaridis04}.
+alignments that uses the base quality information of the reads directly in the
+alignment step. Optimal alignment parameters i.e. scoring matrices are inferred
+using a machine learning technique similar to \emph{Support Vector Machines}.
+For further details on \QP itself consult the paper \cite{DeBona08}. For
+details about the learning method \cite{Tsochantaridis04}.
 
 %$SOLid$.
 %We refer to the whole pipeline as the \QP pipeline and \QP respectively.
diff --git a/qpalma/Lookup.py b/qpalma/Lookup.py
index 8b4ac2a..f22ecd7 100644
--- a/qpalma/Lookup.py
+++ b/qpalma/Lookup.py
@@ -13,9 +13,10 @@ class LookupTable:
    and the scores once for all chromosomes.
    """
 
-   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
-      accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
-      self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
+   def __init__(self,settings):
+      chromo_list = settings['allowed_fragments']
+      accessWrapper = DataAccessWrapper(settings)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
       self.strands = ['+','-']
diff --git a/qpalma/SettingsParser.py b/qpalma/SettingsParser.py
index 65c9824..dd56766 100644
--- a/qpalma/SettingsParser.py
+++ b/qpalma/SettingsParser.py
@@ -53,7 +53,7 @@ def makeSettings(settings):
       except:
         print ('Error: There was a problem generating the subdirectory: %s' % dir_name)
 
-   assert checkSettings(settings),'Check your settings some entries were invalid!'
+   #assert checkSettings(settings),'Check your settings some entries were invalid!'
 
    ddir = settings['dataset_dir']
    settings['prediction_dataset_fn'] = jp(ddir,'prediction_data.pickle')
@@ -62,7 +62,7 @@ def makeSettings(settings):
    settings['training_dataset_keys_fn'] = jp(ddir,'training_data.keys.pickle')
 
    try:
-      os.mkdir(settings['global_log_fn'])
+      os.mknod(settings['global_log_fn'])
    except:
      print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
 
@@ -79,15 +79,13 @@ def makeSettings(settings):
 
 def checkSettings(settings):
    for (key, val,) in settings.items():
-      if key.endswith('_fn'):
+      if key.endswith('_fn') or key.endswith('_dir'):
         assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
 
-      if key.endswith('_dir'):
-         assert os.path.exists(val), 'Error: Path/File %s with value %s does not seem to exist!' % (key,val)
-
    return True
 
 
 def parseSettings(filename):
    settings = parseSettingsFile(filename)
    settings = makeSettings(settings)
+   #assert checkSettings(settings)
 
    return settings
diff --git a/qpalma/utils.py b/qpalma/utils.py
index 38d4558..a91db33 100644
--- a/qpalma/utils.py
+++ b/qpalma/utils.py
@@ -270,5 +270,13 @@ def get_slices(dataset_size,num_nodes):
 
    return all_instances
 
+def logwrite(mes,settings):
+   """
+   A wrapper to write message to the global logfile.
+   """
+   fh = open(settings['global_log_fn'],'a')
+   fh.write(mes)
+
+
 if __name__ == '__main__':
    split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
index 472c0e7..47dae45 100644
--- a/scripts/PipelineHeuristic.py
+++ b/scripts/PipelineHeuristic.py
@@ -79,7 +79,7 @@ class PipelineHeuristic:
      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
      start = cpu()
-     self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
+     self.lt1 = LookupTable(settings)
      stop = cpu()
 
      print 'prefetched sequence and splice data in %f sec' % (stop-start)
@@ -198,9 +198,6 @@ class PipelineHeuristic:
      print 'Starting filtering...'
      _start = cpu()
 
-     #for readId,currentReadLocations in all_remapped_reads.items():
-        #for location in currentReadLocations[:1]:
-
      for location,original_line in self.all_remapped_reads:
 
         if ctr % 1000 == 0:
@@ -228,7 +225,6 @@ class PipelineHeuristic:
 
         # forgot to do this
         if strand == '-':
-           #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
           unb_seq = reverse_complement(unb_seq)
           seq = reverse_complement(seq)
diff --git a/scripts/createAlignmentFileFromPrediction.py b/scripts/createAlignmentFileFromPrediction.py
index 8fdc822..f591ae2 100644
--- a/scripts/createAlignmentFileFromPrediction.py
+++ b/scripts/createAlignmentFileFromPrediction.py
@@ -215,7 +215,6 @@ def run(chunk_dir,outfile):
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-
    for line in open(result_fn):
 
      sl = line.split()
diff --git a/scripts/qpalma_pipeline.py b/scripts/qpalma_pipeline.py
index 5bb4af1..6730974 100644
--- a/scripts/qpalma_pipeline.py
+++ b/scripts/qpalma_pipeline.py
@@ -25,8 +25,10 @@ from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDatase
 
 from qpalma.SettingsParser import parseSettings
+from qpalma.utils import logwrite
 
-Errormsg = """Usage is: python qpalma_pipeline.py """
+
+Errormsg = """Usage is: python qpalma_pipeline.py predict|train """
 
 
 class System:
@@ -48,6 +50,7 @@ class System:
      """
 
      self.settings = parseSettings(filename)
+     logwrite('Parsed settings system set up.',self.settings)
 
 
   def training(self):
@@ -56,6 +59,7 @@ class System:
      converts the data to the right format needed by QPalma for the training
      algorithm.
      """
+     logwrite('Begin of training.\n',self.settings)
 
      # pre_task = TrainingPreprocessingTask(self.settings)
 
@@ -72,6 +76,7 @@ class System:
      train_task.Submit()
      train_task.CheckIfTaskFinished()
 
+     logwrite('End of training.\n',self.settings)
 
   def prediction(self):
      """
@@ -80,6 +85,8 @@ class System:
      generate several output files containing the spliced alignments
      """
 
+     logwrite('Begin of prediction.\n',self.settings)
+
      # Before creating a candidate spliced read dataset we have to first filter
      # the matches from the first seed finding run.
 
@@ -110,13 +117,21 @@ class System:
      post_task.Submit()
      post_task.CheckIfTaskFinished()
 
-     print "Success!"
+     logwrite('End of prediction.\n',self.settings)
 
 
 if __name__ == '__main__':
-  filename = sys.argv[1]
+  mode = sys.argv[1]
+  assert mode in ['predict','train'], Errormsg
+  filename = sys.argv[2]
   assert os.path.exists(filename), Errormsg
+
+  # creating system object
   system_obj = System(filename)
-  system_obj.prediction()
-  #system_obj.training()
+  if mode == 'predict':
+     system_obj.prediction()
+  elif mode == 'train':
+     system_obj.training()
+  else:
+     assert False
 
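
Note: the following is a minimal sketch, not part of the commit, illustrating how the settings-driven interfaces changed above are meant to fit together. The import path for LookupTable is assumed from the file layout (qpalma/Lookup.py), and 'example.conf' is only a placeholder for a real QPalma settings file.

    # Sketch under the assumptions stated above.
    from qpalma.SettingsParser import parseSettings   # parses and post-processes the settings file
    from qpalma.Lookup import LookupTable             # constructor now takes the settings dict
    from qpalma.utils import logwrite                 # appends messages to settings['global_log_fn']

    settings = parseSettings('example.conf')
    lt = LookupTable(settings)
    logwrite('Lookup table initialized.\n', settings)

The same settings dictionary also drives the new command-line entry point, which now expects a mode ('predict' or 'train') before the settings file name, as documented by the updated Errormsg string.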