+ changed parameter handling a bit (input files are command line args and not inside...
authorFabio <fabio@congo.fml.local>
Wed, 22 Oct 2008 14:23:46 +0000 (16:23 +0200)
committerFabio <fabio@congo.fml.local>
Wed, 22 Oct 2008 14:23:46 +0000 (16:23 +0200)
+ training data parser more flexible

doc/qpalma-manual.tex
qpalma/DatasetUtils.py
qpalma/SettingsParser.py
qpalma/gridtools.py
scripts/qpalma_pipeline.py
test.conf

index 6ab20a4..2bbbe87 100644 (file)
@@ -282,7 +282,7 @@ corresponds to one short read. Each line has six tab-separated entries, namely:
 \begin{enumerate}
 \item unique read id
 \item chromosome/contig id
 \begin{enumerate}
 \item unique read id
 \item chromosome/contig id
-\item strand 
+\item strand [D/P or +/-]
 \item beginning of sequence fragment
 \item end of sequence fragment
 \item read sequence (in strand specific direction) with alignment information (see below)
 \item beginning of sequence fragment
 \item end of sequence fragment
 \item read sequence (in strand specific direction) with alignment information (see below)
index 4116e12..f4db9d8 100644 (file)
@@ -8,6 +8,7 @@
 
 import array
 import cPickle
 
 import array
 import cPickle
+import numpy
 import os
 import os.path
 import pdb
 import os
 import os.path
 import pdb
@@ -70,10 +71,14 @@ def generateTrainingDataset(settings):
 
    """
 
 
    """
 
-   dataset = []
+   dataset = {}
 
    half_window_size = settings['half_window_size']
 
 
    half_window_size = settings['half_window_size']
 
+   # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
+   if settings['platform'] == 'IGA':
+      quality_interval = illumina_ga_range
+
    instance_counter = 0
 
    accessWrapper = DataAccessWrapper(settings)
    instance_counter = 0
 
    accessWrapper = DataAccessWrapper(settings)
@@ -97,13 +102,21 @@ def generateTrainingDataset(settings):
          continue
 
       # convert D/P strand info to +/-
          continue
 
       # convert D/P strand info to +/-
-      strand = ['-','+'][slist[2] == 'D']
+      strand = slist[2]
+
+      if strand in ['-','+']:
+         pass
+      elif strand in ['D','P']:
+         strand = ['-','+'][strand == 'D']
+      else:
+         print 'Error strand information has to be either +/- or D/P'
+         sys.exit(1)
 
       seqBeginning   = int(slist[3])
       seqEnd         = int(slist[4])
 
       readAlignment  = slist[5]
 
       seqBeginning   = int(slist[3])
       seqEnd         = int(slist[4])
 
       readAlignment  = slist[5]
-      prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
+      prb = processQuality(slist[6],settings['prb_offset'],quality_interval,settings['perform_checks'])
       
       exons = numpy.mat([0,0,0,0]).reshape((2,2))
 
       
       exons = numpy.mat([0,0,0,0]).reshape((2,2))
 
@@ -112,8 +125,8 @@ def generateTrainingDataset(settings):
       exons[1,0]     = int(slist[9])
       exons[1,1]     = int(slist[10])
 
       exons[1,0]     = int(slist[9])
       exons[1,1]     = int(slist[10])
 
-      dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
-      relative_exons = exons - up_cut
+      dna = seqInfo.get_seq_and_scores(chromo,strand,seqBeginning,seqEnd,only_seq=True)
+      relative_exons = exons - seqBeginning
 
       assert checkExons(dna,relative_exons,readAlignment,id)
 
 
       assert checkExons(dna,relative_exons,readAlignment,id)
 
@@ -127,10 +140,6 @@ def generateTrainingDataset(settings):
 def addToDataset(map_file,dataset,settings):
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
 def addToDataset(map_file,dataset,settings):
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
-   perform_checks = settings['perform_checks']
-
-   prb_offset = settings['prb_offset']
-
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
    if settings['platform'] == 'IGA':
       quality_interval = illumina_ga_range
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
    if settings['platform'] == 'IGA':
       quality_interval = illumina_ga_range
@@ -188,7 +197,7 @@ def addToDataset(map_file,dataset,settings):
 
       dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
       
 
       dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
       
-      prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
+      prb = processQuality(slist[5],settings['prb_offset'],quality_interval,settings['perform_checks'])
    
       currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
 
    
       currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
 
index da4d91a..98f3dda 100644 (file)
@@ -94,7 +94,7 @@ def makeSettings(settings):
    # now convert and check all integer parameters
    for parameter in ['matchmatrixCols', 'matchmatrixRows', 'numAccSuppPoints', 'numConstraintsPerRound',\
    'numDonSuppPoints', 'numLengthSuppPoints', 'numQualPlifs', 'numQualSuppPoints', 'anzpath', 'iter_steps',\
    # now convert and check all integer parameters
    for parameter in ['matchmatrixCols', 'matchmatrixRows', 'numAccSuppPoints', 'numConstraintsPerRound',\
    'numDonSuppPoints', 'numLengthSuppPoints', 'numQualPlifs', 'numQualSuppPoints', 'anzpath', 'iter_steps',\
-   'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'totalQualSuppPoints','C','num_splits','prb_offset',\
+   'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'C','num_splits','prb_offset',\
    'half_window_size']:
          
       try:
    'half_window_size']:
          
       try:
index 055331e..57f4da3 100644 (file)
@@ -237,8 +237,8 @@ class TrainingTask(ClusterTask):
    This class represents the cluster task of training QPalma.
    """
 
    This class represents the cluster task of training QPalma.
    """
 
-   def __init__(self):
-      ClusterTask.__init__(self)
+   def __init__(self,settings):
+      ClusterTask.__init__(self,settings)
 
 
    def CreateJobs(self):
 
 
    def CreateJobs(self):
@@ -269,6 +269,7 @@ class TrainingTask(ClusterTask):
    def collectResults(self):
       pass
 
    def collectResults(self):
       pass
 
+
 def TrainingTaskStarter(settings,dataset_fn,training_keys,set_name):
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 def TrainingTaskStarter(settings,dataset_fn,training_keys,set_name):
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
index 8c9e501..362e5f6 100644 (file)
@@ -20,6 +20,7 @@ import sys
 
 from qpalma.gridtools import ApproximationTask,PreprocessingTask
 from qpalma.gridtools import AlignmentTask,PostprocessingTask
 
 from qpalma.gridtools import ApproximationTask,PreprocessingTask
 from qpalma.gridtools import AlignmentTask,PostprocessingTask
+from qpalma.gridtools import TrainingTask
 
 from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
 
 
 from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
 
@@ -27,7 +28,6 @@ from qpalma.SettingsParser import parseSettings
 
 from qpalma.utils import logwrite
 
 
 from qpalma.utils import logwrite
 
-
 Errormsg = """Usage is:  
                   
                   python qpalma_pipeline.py train <config filename> <training data filename>
 Errormsg = """Usage is:  
                   
                   python qpalma_pipeline.py train <config filename> <training data filename>
@@ -37,6 +37,12 @@ Errormsg = """Usage is:
            """
 
 
            """
 
 
+def printMessage(mes):
+   print '#'*80
+   print '\t\t\t%s...\n'%mes
+   print '#'*80
+
+
 class System:
    """
    This class wraps the outer loop of the qpalma project
 class System:
    """
    This class wraps the outer loop of the qpalma project
@@ -67,15 +73,15 @@ class System:
       """
       logwrite('Begin of training.\n',self.settings)
 
       """
       logwrite('Begin of training.\n',self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting approximation...\n'
-      print '#'*80
+      printMessage('Starting dataset generation')
 
       self.settings['training_data_fn'] = training_data_fn
 
       # Collect the data and create a pickled training set
       generateTrainingDataset(self.settings)
 
 
       self.settings['training_data_fn'] = training_data_fn
 
       # Collect the data and create a pickled training set
       generateTrainingDataset(self.settings)
 
+      printMessage('Starting training')
+
       # Now that we have a dataset we can perform training
       train_task = TrainingTask(self.settings)
       train_task.CreateJobs()
       # Now that we have a dataset we can perform training
       train_task = TrainingTask(self.settings)
       train_task.CreateJobs()
@@ -94,9 +100,7 @@ class System:
 
       logwrite('Begin of prediction.\n',self.settings)
 
 
       logwrite('Begin of prediction.\n',self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting approximation...\n'
-      print '#'*80
+      printMessage('Starting approximation')
 
       self.settings['prediction_param_fn'] = param_fn
       self.settings['unspliced_reads_fn']  = unspliced_reads_fn
 
       self.settings['prediction_param_fn'] = param_fn
       self.settings['unspliced_reads_fn']  = unspliced_reads_fn
@@ -113,15 +117,11 @@ class System:
       # After filtering combine the filtered matches from the first run and the
       # found matches from the second run to a full dataset
 
       # After filtering combine the filtered matches from the first run and the
       # found matches from the second run to a full dataset
 
-      print '#'*80
-      print '\t\t\tStarting dataset generation...\n'
-      print '#'*80
+      printMessage('Starting dataset generation')
 
       generatePredictionDataset(self.settings)
 
 
       generatePredictionDataset(self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting alignments...\n'
-      print '#'*80
+      printMessage('Starting alignments')
 
       # Now that we have a dataset we can perform accurate alignments
       align_task = AlignmentTask(self.settings)
 
       # Now that we have a dataset we can perform accurate alignments
       align_task = AlignmentTask(self.settings)
@@ -129,9 +129,7 @@ class System:
       align_task.Submit()
       align_task.CheckIfTaskFinished()
 
       align_task.Submit()
       align_task.CheckIfTaskFinished()
 
-      print '#'*80
-      print '\t\t\tPostprocessing...\n'
-      print '#'*80
+      printMessage('Postprocessing')
 
       # The results of the above alignment step can be converted to a data format
       # needed for further postprocessing.
 
       # The results of the above alignment step can be converted to a data format
       # needed for further postprocessing.
@@ -144,7 +142,9 @@ class System:
    
 
 if __name__ == '__main__':
    
 
 if __name__ == '__main__':
-   if len(sys.argv) != 4 or len(sys.argv) != 6:
+   if len(sys.argv) != 4 and len(sys.argv) != 6:
+      print 'Invalid number of arguments!'
+      print len(sys.argv)
       print Errormsg
       sys.exit(1)
 
       print Errormsg
       sys.exit(1)
 
@@ -165,7 +165,7 @@ if __name__ == '__main__':
       param_fn           = sys.argv[3]
       unspliced_reads_fn = sys.argv[4]
       spliced_reads_fn   = sys.argv[5]
       param_fn           = sys.argv[3]
       unspliced_reads_fn = sys.argv[4]
       spliced_reads_fn   = sys.argv[5]
-      system_obj.prediction(unspliced_reads_fn, spliced_reads_fn)
+      system_obj.prediction(param_fn,unspliced_reads_fn, spliced_reads_fn)
    elif mode == 'train':
       training_data_fn   = sys.argv[3]
       system_obj.training(training_data_fn)
    elif mode == 'train':
       training_data_fn   = sys.argv[3]
       system_obj.training(training_data_fn)
index 9017c22..9eb7f82 100644 (file)
--- a/test.conf
+++ b/test.conf
@@ -28,9 +28,9 @@ result_dir = /fml/ag-raetsch/home/fabio/tmp/sandbox/first_test
 # These variables store the filename of the raw read data
 #
 
 # These variables store the filename of the raw read data
 #
 
-spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
+#spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
 
 
-unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
+#unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
 
 #
 # You can specify how many nodes you want to use via this variable
 
 #
 # You can specify how many nodes you want to use via this variable
@@ -38,12 +38,6 @@ unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
 
 num_splits = 5
 
 
 num_splits = 5
 
-#
-# The parameter you want to do prediction with:
-#
-
-prediction_param_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/param_526.pickle
-
 #
 # In order to align short reads QPalma relies on additional information such as
 # splice site score prediction.
 #
 # In order to align short reads QPalma relies on additional information such as
 # splice site score prediction.