+ fixed some bugs in the negative strand lookup table
[qpalma.git] / scripts / Experiment.py
index e4143c2..c414d5a 100644 (file)
 ###############################################################################
 #
 ###############################################################################
 #
-# This file contains setting for one experiment
+# This file contains settings for one experiment
 #
 # The general idea is as follows:
 # 
 # Suppose you have an machine learning algorithm you want to perform model
 #
 # The general idea is as follows:
 # 
 # Suppose you have an machine learning algorithm you want to perform model
-# selection. Then for each different value of for example C for a C-SVM this
+# selection with. Then, for each different value of, for example, C for a C-SVM, this
 # script generates a Run object a subclass of dict storing the parameters.
 #
 ###############################################################################
 
 import qpalma.Configuration as Conf
 from Run import *
 # script generates a Run object a subclass of dict storing the parameters.
 #
 ###############################################################################
 
 import qpalma.Configuration as Conf
 from Run import *
+import pdb
+import os
+import os.path
+import cPickle
 
 
-def createRuns():
-   # specify n for n-fold cross validation
-   numFolds=5
+def get_dataset(dataset_size,num_nodes):
+   all_instances = []
 
 
-   # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma'
+   params = [\
+   ('prediction_begin',400000),\
+   ('prediction_end',440000)]
+
+   all_instances.append(params)
+
+   return all_instances
+
+
+def get_dataset_splitting_instances(dataset_size,num_nodes):
+   all_instances = []
+
+   part = dataset_size / num_nodes
+   begin = 0
+   end = 0
+   for idx in range(1,num_nodes+1):
+      
+      if idx == num_nodes:
+         begin = end
+         end   = dataset_size
+      else:
+         begin = end
+         end = begin+part
+
+      params = [\
+      ('prediction_begin',begin),\
+      ('prediction_end',end)]
+
+      all_instances.append(params)
+
+   return all_instances
+
+
+def get_training_instances():
+   all_instances = []
 
 
-   assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
+   params = [\
+   ('enable_quality_scores',True),\
+   ('enable_splice_signals',True),\
+   ('enable_intron_length',True)]
+
+   all_instances.append(params)
+
+   return all_instances
+
+
+def get_scoring_instances():
+   all_instances = []
+
+   for QFlag in [True,False]:
+      for SSFlag in [True,False]:
+         for ILFlag in [True,False]:
+            params = [\
+            ('enable_quality_scores',QFlag),\
+            ('enable_splice_signals',SSFlag),\
+            ('enable_intron_length',ILFlag)]
+            all_instances.append(params)
+
+   return all_instances
+
+
+def createRuns():
+   # load the configuration object
+   Config = cPickle.load(open(Conf.conf_object_path))
 
 
+   # the main directory where all results are stored
+   alignment_dir   = Config['alignment_dir']
+   
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
-   bool2str = ['-',_'+']
-
-   allRuns = []
+   bool2str = ['-','+']
+   ctr = 1
 
 
-   for QFlag in [True,False]:
-      for SSFlag in [True,False]:
-         for ILFlag in [True]:
+   all_instances = get_training_instances()
             
             
-            # create a new Run object
-            currentRun = Run()
+   allRuns = []
+   for parameters in all_instances:
+      # create a new Run object
+      currentRun = Run()
+      currentName = 'run'
+
+      for param_name,param in parameters:
+         currentRun[param_name] = param
+         currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
             
             
-            # global settings for all runs
-            currentRun['anzpath']            = Conf.anzpath
-            currentRun['iter_steps']         = Conf.iter_steps
-            currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
-            currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
-            currentRun['mode']               = Conf.mode
-            currentRun['numFeatures']        = Conf.numFeatures
-            currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
+      # global settings for all runs
+      currentRun['anzpath']            = Conf.anzpath
+      currentRun['iter_steps']         = Conf.iter_steps
+      currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
+      currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
+      currentRun['mode']               = Conf.mode
+      currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
 
 
-            currentRun['print_matrix']             = Conf.print_matrix
-            currentRun['read_size']            = Conf.read_size
-            currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
+      currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
+      currentRun['print_matrix']          = Conf.print_matrix
+      currentRun['read_size']             = Conf.read_size
 
 
-            currentRun['numQualPlifs']          = Conf.numQualPlifs
-            currentRun['numQualSuppPoints']     = Conf.numQualSuppPoints
-            currentRun['totalQualSuppPoints']   = Conf.totalQualSuppPoints
+      currentRun['numDonSuppPoints']      = 10
+      currentRun['numAccSuppPoints']      = 10
 
 
-            # run-specific settings
+      currentRun['numQualPlifs']          = Conf.numQualPlifs
+      currentRun['numQualSuppPoints']     = 10
+      currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
 
-            currentRun['dataset_begin']         =
-            currentRun['dataset_end']           =
+      currentRun['enable_quality_scores'] = True
+      currentRun['enable_splice_signals'] = True
+      currentRun['enable_intron_length']  = True
 
 
-            currentRun['enable_quality_scores'] = QFlag
-            currentRun['enable_splice_signals'] = SSFlag
-            currentRun['enable_intron_length']  = ILFlag
+      # if we are not using an intron length model at all we do not need the support points
+      currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
 
 
-            currentName = '%s_quality_%s_splicesignals_%s_intron_len' %\
-            (bool2str[QFlag],bool2str[SSFlag],bool2str[ILFlag])
+      if currentRun['enable_intron_length'] == False:
+         currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
 
 
-            currentRun['name']                  = currentName
+      currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
+      + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
+      + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
+      + currentRun['totalQualSuppPoints'] 
 
 
-            allRuns.append(currentRun)
+      # run-specific settings
+      currentRun['C']                     = 100
+
+      currentRun['name']                  = currentName
+      currentRun['alignment_dir']         = alignment_dir
+
+      currentRun['min_intron_len']        = 20
+      currentRun['max_intron_len']        = 2000
+
+      currentRun['min_svm_score']         = 0.0 
+      currentRun['max_svm_score']         = 1.0
+
+      currentRun['min_qual']              = -5
+      currentRun['max_qual']              = 40
+
+      currentRun['dna_flat_files']        = Conf.dna_flat_fn
+
+      currentRun['id']      = ctr
+      ctr += 1
+
+      allRuns.append(currentRun)
+
+###############################################################################
+#
+# check for valid paths / options etc
+#
+###############################################################################
 
 
-   #
-   # check for valid paths / options etc
-   #
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
-      assert 0 < currentRun['dataset_begin'] < currentRun['dataset_end']
-      assert currentRun['dataset_begin'] < currentRun['dataset_end']
-
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
@@ -91,18 +184,30 @@ def createRuns():
 
       assert 0 < currentRun['numFeatures'] < 10000
 
 
       assert 0 < currentRun['numFeatures'] < 10000
 
+      # assert currentRun['numLengthSuppPoints']
+      # assert currentRun['numDonSuppPoints']
+      # assert currentRun['numAccSuppPoints']
       #assert currentRun['numQualPlifs']
       #assert currentRun['numQualSuppPoints']
       #assert currentRun['numQualPlifs']
       #assert currentRun['numQualSuppPoints']
+      #assert numQualPlifs       >= 0
+      #assert numDonSuppPoints    > 1
+      #assert numAccSuppPoints    > 1
+      #assert numLengthSuppPoints > 1 
+      #assert numQualSuppPoints   > 1
 
       assert currentRun['print_matrix'] in [True,False]
       assert 0 < currentRun['read_size'] < 100
       assert currentRun['remove_duplicate_scores'] in [True,False]
 
 
       assert currentRun['print_matrix'] in [True,False]
       assert 0 < currentRun['read_size'] < 100
       assert currentRun['remove_duplicate_scores'] in [True,False]
 
-      currentRun['disable_quality_scores'] in [True,False]
+      assert currentRun['enable_quality_scores'] in [True,False]
+      assert currentRun['enable_splice_signals'] in [True,False]
+      assert currentRun['enable_intron_length']  in [True,False]
 
       #assert currentRun['totalQualSuppPoints']
 
       #assert currentRun['totalQualSuppPoints']
+      assert os.path.exists(currentRun['alignment_dir'])
 
 
+   return allRuns
 
 if __name__ == '__main__':
 
 if __name__ == '__main__':
-   createRuns()
-
+   allRuns = createRuns()
+   #pdb.set_trace()