+ fixed some bugs in the negative strand lookup table
[qpalma.git] / scripts / Experiment.py
index d85d589..c414d5a 100644 (file)
@@ -15,111 +15,164 @@ from Run import *
 import pdb
 import os
 import os.path
+import cPickle
 
-def createRuns():
-   # specify n for n-fold cross validation
-   numFolds=5
+def get_dataset(dataset_size,num_nodes):
+   """
+   Return a single instance covering the fixed prediction window
+   400000-440000; dataset_size and num_nodes are accepted but not used.
+   """
 
-   # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test'
+   window = [('prediction_begin',400000),('prediction_end',440000)]
+   instances = [window]
+
+   return instances
+
+
+def get_dataset_splitting_instances(dataset_size,num_nodes):
+   """
+   Split the index range 0..dataset_size into num_nodes consecutive
+   chunks and return one [('prediction_begin',b),('prediction_end',e)]
+   list per chunk. Every chunk spans dataset_size // num_nodes entries,
+   except the last one, which is extended to end at dataset_size.
+   """
+   # floor division: the chunk size stays an integer even where '/' is
+   # true division (Python 3, 'from __future__ import division')
+   part = dataset_size // num_nodes
+
+   all_instances = []
+   begin = 0
+   for idx in range(1,num_nodes+1):
+      # the last chunk absorbs the remainder of the division
+      if idx == num_nodes:
+         end = dataset_size
+      else:
+         end = begin+part
+      all_instances.append([('prediction_begin',begin),('prediction_end',end)])
+      begin = end
+   return all_instances
+
+
+def get_training_instances():
+   """
+   Return the list of parameter settings used for training: one instance
+   with quality scores, splice signals and intron length all enabled.
+   """
+   flag_names = ('enable_quality_scores',
+                 'enable_splice_signals',
+                 'enable_intron_length')
+   enabled = [(name,True) for name in flag_names]
+
+   return [enabled]
+
+
+def get_scoring_instances():
+   """
+   Return all 2^3 on/off combinations of the three feature flags, one
+   parameter list per combination. Quality scores vary slowest and intron
+   length fastest; True is tried before False for every flag.
+   """
+   choices = (True,False)
+   return [[('enable_quality_scores',use_qual),
+            ('enable_splice_signals',use_ss),
+            ('enable_intron_length',use_il)]
+           for use_qual in choices
+           for use_ss in choices
+           for use_il in choices]
 
-   assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
 
+def createRuns():
+   # load the configuration object
+   Config = cPickle.load(open(Conf.conf_object_path))
+
+   # the main directory where all results are stored
+   alignment_dir   = Config['alignment_dir']
+   
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
    bool2str = ['-','+']
+   ctr = 1
 
-   allRuns = []
-
-   #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_02_04_2008'
-
-   for QFlag in [True,False]:
-      for SSFlag in [True,False]:
-         #for ILFlag in [True]:
-         for ILFlag in [True,False]:
+   all_instances = get_training_instances()
             
-            # create a new Run object
-            currentRun = Run()
+   allRuns = []
+   for parameters in all_instances:
+      # create a new Run object
+      currentRun = Run()
+      currentName = 'run'
+
+      for param_name,param in parameters:
+         currentRun[param_name] = param
+         currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
             
-            # global settings for all runs
-            currentRun['anzpath']            = Conf.anzpath
-            currentRun['iter_steps']         = Conf.iter_steps
-            currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
-            currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
-            currentRun['mode']               = Conf.mode
-            currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
-
-            currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
-            currentRun['print_matrix']          = Conf.print_matrix
-            currentRun['read_size']             = Conf.read_size
+      # global settings for all runs
+      currentRun['anzpath']            = Conf.anzpath
+      currentRun['iter_steps']         = Conf.iter_steps
+      currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
+      currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
+      currentRun['mode']               = Conf.mode
+      currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
 
-         
-            currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
+      currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
+      currentRun['print_matrix']          = Conf.print_matrix
+      currentRun['read_size']             = Conf.read_size
 
-            # if we are not using an intron length model at all we do not need the support points
-            if ILFlag == False:
-               currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
+      currentRun['numDonSuppPoints']      = 10
+      currentRun['numAccSuppPoints']      = 10
 
-            currentRun['numDonSuppPoints']      = 10
-            currentRun['numAccSuppPoints']      = 10
+      currentRun['numQualPlifs']          = Conf.numQualPlifs
+      currentRun['numQualSuppPoints']     = 10
+      currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
-            currentRun['numQualPlifs']          = Conf.numQualPlifs
-            currentRun['numQualSuppPoints']     = 10
-            currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
+      currentRun['enable_quality_scores'] = True
+      currentRun['enable_splice_signals'] = True
+      currentRun['enable_intron_length']  = True
 
-            currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
-            + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
-            + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
-            + currentRun['totalQualSuppPoints'] 
+      # if we are not using an intron length model at all we do not need the support points
+      currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
 
-            # run-specific settings
-            currentRun['training_begin']         = Conf.training_begin
-            currentRun['training_end']           = Conf.training_end
-            currentRun['prediction_begin']       = Conf.prediction_begin
-            currentRun['prediction_end']         = Conf.prediction_end
+      if currentRun['enable_intron_length'] == False:
+         currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
 
-            currentRun['enable_quality_scores'] = QFlag
-            currentRun['enable_splice_signals'] = SSFlag
-            currentRun['enable_intron_length']  = ILFlag
+      currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
+      + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
+      + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
+      + currentRun['totalQualSuppPoints'] 
 
-            currentName = 'run_%s_quality_%s_splicesignals_%s_intron_len' %\
-            (bool2str[QFlag],bool2str[SSFlag],bool2str[ILFlag])
+      # run-specific settings
+      currentRun['C']                     = 100
 
-            currentRun['C']                     = 100
+      currentRun['name']                  = currentName
+      currentRun['alignment_dir']         = alignment_dir
 
-            currentRun['name']                  = currentName
-            currentRun['dataset_filename']      = dataset_filename
-            currentRun['experiment_path']       = experiment_dir
+      currentRun['min_intron_len']        = 20
+      currentRun['max_intron_len']        = 2000
 
-            currentRun['min_intron_len'] = 20
-            currentRun['max_intron_len'] = 2000
+      currentRun['min_svm_score']         = 0.0 
+      currentRun['max_svm_score']         = 1.0
 
-            #currentRun['min_intron_len'] = 10
-            #currentRun['max_intron_len'] = 100
+      currentRun['min_qual']              = -5
+      currentRun['max_qual']              = 40
 
-            currentRun['min_svm_score'] = 0.0 
-            currentRun['max_svm_score'] = 1.0
+      currentRun['dna_flat_files']        = Conf.dna_flat_fn
 
-            currentRun['min_qual'] = -5
-            currentRun['max_qual'] = 40
+      currentRun['id']      = ctr
+      ctr += 1
 
-            currentRun['dna_flat_files']      = Conf.dna_flat_fn
+      allRuns.append(currentRun)
 
-            allRuns.append(currentRun)
+###############################################################################
+#
+# check for valid paths / options etc
+#
+###############################################################################
 
-   #
-   # check for valid paths / options etc
-   #
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
-      assert 0 <= currentRun['training_begin'] < currentRun['training_end']
-      assert currentRun['training_end'] <= currentRun['prediction_begin'] < currentRun['prediction_end']
-
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
@@ -151,11 +204,10 @@ def createRuns():
       assert currentRun['enable_intron_length']  in [True,False]
 
       #assert currentRun['totalQualSuppPoints']
-      assert os.path.exists(currentRun['dataset_filename'])
-      assert os.path.exists(currentRun['experiment_path'])
+      assert os.path.exists(currentRun['alignment_dir'])
 
    return allRuns
 
 if __name__ == '__main__':
    allRuns = createRuns()
-   pdb.set_trace()
+   #pdb.set_trace()