+ changed parameter handling a bit (input files command line args and not inside...
authorFabio <fabio@congo.fml.local>
Wed, 22 Oct 2008 14:23:46 +0000 (16:23 +0200)
committerFabio <fabio@congo.fml.local>
Wed, 22 Oct 2008 14:23:46 +0000 (16:23 +0200)
+ trainig data parser more flexible

doc/qpalma-manual.tex
qpalma/DatasetUtils.py
qpalma/SettingsParser.py
qpalma/gridtools.py
scripts/qpalma_pipeline.py
test.conf

index 6ab20a4..2bbbe87 100644 (file)
@@ -282,7 +282,7 @@ corresponds to one short read. Each line has six tab-separated entries, namely:
 \begin{enumerate}
 \item unique read id
 \item chromosome/contig id
-\item strand 
+\item strand [D/P or +/-]
 \item beginning of sequence fragment
 \item end of sequence fragment
 \item read sequence (in strand specific direction) with alignment information (see below)
index 4116e12..f4db9d8 100644 (file)
@@ -8,6 +8,7 @@
 
 import array
 import cPickle
+import numpy
 import os
 import os.path
 import pdb
@@ -70,10 +71,14 @@ def generateTrainingDataset(settings):
 
    """
 
-   dataset = []
+   dataset = {}
 
    half_window_size = settings['half_window_size']
 
+   # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
+   if settings['platform'] == 'IGA':
+      quality_interval = illumina_ga_range
+
    instance_counter = 0
 
    accessWrapper = DataAccessWrapper(settings)
@@ -97,13 +102,21 @@ def generateTrainingDataset(settings):
          continue
 
       # convert D/P strand info to +/-
-      strand = ['-','+'][slist[2] == 'D']
+      strand = slist[2]
+
+      if strand in ['-','+']:
+         pass
+      elif strand in ['D','P']:
+         strand = ['-','+'][strand == 'D']
+      else:
+         print 'Error strand information has to be either +/- or D/P'
+         sys.exit(1)
 
       seqBeginning   = int(slist[3])
       seqEnd         = int(slist[4])
 
       readAlignment  = slist[5]
-      prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
+      prb = processQuality(slist[6],settings['prb_offset'],quality_interval,settings['perform_checks'])
       
       exons = numpy.mat([0,0,0,0]).reshape((2,2))
 
@@ -112,8 +125,8 @@ def generateTrainingDataset(settings):
       exons[1,0]     = int(slist[9])
       exons[1,1]     = int(slist[10])
 
-      dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
-      relative_exons = exons - up_cut
+      dna = seqInfo.get_seq_and_scores(chromo,strand,seqBeginning,seqEnd,only_seq=True)
+      relative_exons = exons - seqBeginning
 
       assert checkExons(dna,relative_exons,readAlignment,id)
 
@@ -127,10 +140,6 @@ def generateTrainingDataset(settings):
 def addToDataset(map_file,dataset,settings):
    assert os.path.exists(map_file), 'Error: Can not find map file'
 
-   perform_checks = settings['perform_checks']
-
-   prb_offset = settings['prb_offset']
-
    # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
    if settings['platform'] == 'IGA':
       quality_interval = illumina_ga_range
@@ -188,7 +197,7 @@ def addToDataset(map_file,dataset,settings):
 
       dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
       
-      prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
+      prb = processQuality(slist[5],settings['prb_offset'],quality_interval,settings['perform_checks'])
    
       currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
 
index da4d91a..98f3dda 100644 (file)
@@ -94,7 +94,7 @@ def makeSettings(settings):
    # now convert and check all integer parameters
    for parameter in ['matchmatrixCols', 'matchmatrixRows', 'numAccSuppPoints', 'numConstraintsPerRound',\
    'numDonSuppPoints', 'numLengthSuppPoints', 'numQualPlifs', 'numQualSuppPoints', 'anzpath', 'iter_steps',\
-   'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'totalQualSuppPoints','C','num_splits','prb_offset',\
+   'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'C','num_splits','prb_offset',\
    'half_window_size']:
          
       try:
index 055331e..57f4da3 100644 (file)
@@ -237,8 +237,8 @@ class TrainingTask(ClusterTask):
    This class represents the cluster task of training QPalma.
    """
 
-   def __init__(self):
-      ClusterTask.__init__(self)
+   def __init__(self,settings):
+      ClusterTask.__init__(self,settings)
 
 
    def CreateJobs(self):
@@ -269,6 +269,7 @@ class TrainingTask(ClusterTask):
    def collectResults(self):
       pass
 
+
 def TrainingTaskStarter(settings,dataset_fn,training_keys,set_name):
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
index 8c9e501..362e5f6 100644 (file)
@@ -20,6 +20,7 @@ import sys
 
 from qpalma.gridtools import ApproximationTask,PreprocessingTask
 from qpalma.gridtools import AlignmentTask,PostprocessingTask
+from qpalma.gridtools import TrainingTask
 
 from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
 
@@ -27,7 +28,6 @@ from qpalma.SettingsParser import parseSettings
 
 from qpalma.utils import logwrite
 
-
 Errormsg = """Usage is:  
                   
                   python qpalma_pipeline.py train <config filename> <training data filename>
@@ -37,6 +37,12 @@ Errormsg = """Usage is:
            """
 
 
+def printMessage(mes):
+   print '#'*80
+   print '\t\t\t%s...\n'%mes
+   print '#'*80
+
+
 class System:
    """
    This class wraps the outer loop of the qpalma project
@@ -67,15 +73,15 @@ class System:
       """
       logwrite('Begin of training.\n',self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting approximation...\n'
-      print '#'*80
+      printMessage('Starting dataset generation')
 
       self.settings['training_data_fn'] = training_data_fn
 
       # Collect the data and create a pickled training set
       generateTrainingDataset(self.settings)
 
+      printMessage('Starting training')
+
       # Now that we have a dataset we can perform training
       train_task = TrainingTask(self.settings)
       train_task.CreateJobs()
@@ -94,9 +100,7 @@ class System:
 
       logwrite('Begin of prediction.\n',self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting approximation...\n'
-      print '#'*80
+      printMessage('Starting approximation')
 
       self.settings['prediction_param_fn'] = param_fn
       self.settings['unspliced_reads_fn']  = unspliced_reads_fn
@@ -113,15 +117,11 @@ class System:
       # After filtering combine the filtered matches from the first run and the
       # found matches from the second run to a full dataset
 
-      print '#'*80
-      print '\t\t\tStarting dataset generation...\n'
-      print '#'*80
+      printMessage('Starting dataset generation')
 
       generatePredictionDataset(self.settings)
 
-      print '#'*80
-      print '\t\t\tStarting alignments...\n'
-      print '#'*80
+      printMessage('Starting alignments')
 
       # Now that we have a dataset we can perform accurate alignments
       align_task = AlignmentTask(self.settings)
@@ -129,9 +129,7 @@ class System:
       align_task.Submit()
       align_task.CheckIfTaskFinished()
 
-      print '#'*80
-      print '\t\t\tPostprocessing...\n'
-      print '#'*80
+      printMessage('Postprocessing')
 
       # The results of the above alignment step can be converted to a data format
       # needed for further postprocessing.
@@ -144,7 +142,9 @@ class System:
    
 
 if __name__ == '__main__':
-   if len(sys.argv) != 4 or len(sys.argv) != 6:
+   if len(sys.argv) != 4 and len(sys.argv) != 6:
+      print 'Invalid number of arguments!'
+      print len(sys.argv)
       print Errormsg
       sys.exit(1)
 
@@ -165,7 +165,7 @@ if __name__ == '__main__':
       param_fn           = sys.argv[3]
       unspliced_reads_fn = sys.argv[4]
       spliced_reads_fn   = sys.argv[5]
-      system_obj.prediction(unspliced_reads_fn, spliced_reads_fn)
+      system_obj.prediction(param_fn,unspliced_reads_fn, spliced_reads_fn)
    elif mode == 'train':
       training_data_fn   = sys.argv[3]
       system_obj.training(training_data_fn)
index 9017c22..9eb7f82 100644 (file)
--- a/test.conf
+++ b/test.conf
@@ -28,9 +28,9 @@ result_dir = /fml/ag-raetsch/home/fabio/tmp/sandbox/first_test
 # These variables store the filename of the raw read data
 #
 
-spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
+#spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
 
-unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
+#unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
 
 #
 # You can specify how many nodes you want to use via this variable
@@ -39,12 +39,6 @@ unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
 num_splits = 5
 
 #
-# The parameter you want to do prediction with:
-#
-
-prediction_param_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/param_526.pickle
-
-#
 # In order to align short reads QPalma relies on additional information such as
 # splice site score prediction.
 #