+ fixed some bugs in the negative strand lookup table
[qpalma.git] / scripts / Experiment.py
index dd22238..c414d5a 100644 (file)
@@ -15,6 +15,7 @@ from Run import *
 import pdb
 import os
 import os.path
 import pdb
 import os
 import os.path
+import cPickle
 
 def get_dataset(dataset_size,num_nodes):
    all_instances = []
 
 def get_dataset(dataset_size,num_nodes):
    all_instances = []
@@ -52,6 +53,19 @@ def get_dataset_splitting_instances(dataset_size,num_nodes):
    return all_instances
 
 
    return all_instances
 
 
+def get_training_instances():
+   # Build the list of parameter sets used for training.
+   # There is exactly one set: a list of (option_name, value) pairs
+   # that switches on all three feature groups.
+   enabled_features = (
+      'enable_quality_scores',
+      'enable_splice_signals',
+      'enable_intron_length')
+
+   full_feature_set = [(option,True) for option in enabled_features]
+   return [full_feature_set]
+
+
 def get_scoring_instances():
    all_instances = []
 
 def get_scoring_instances():
    all_instances = []
 
@@ -68,45 +82,31 @@ def get_scoring_instances():
 
 
 def createRuns():
 
 
 def createRuns():
-   # specify n for n-fold cross validation
-   numFolds=5
+   # load the configuration object
+   Config = cPickle.load(open(Conf.conf_object_path))
 
    # the main directory where all results are stored
 
    # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
-
-   assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
-
+   alignment_dir   = Config['alignment_dir']
+   
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
    bool2str = ['-','+']
    # list of regularization parameters and additional flags for different runs
    # for example:
    #  - with quality scores
    #  - without quality scores
    #
    bool2str = ['-','+']
-
-   allRuns = []
-
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_full'
-
-   dataset_size = 500000
-   num_nodes = 10
-
    ctr = 1
 
    ctr = 1
 
-   #all_instances = get_scoring_instances()
-   all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
-   #all_instances = get_dataset(dataset_size,num_nodes)
-   #pdb.set_trace()
+   all_instances = get_training_instances()
             
             
+   allRuns = []
    for parameters in all_instances:
       # create a new Run object
       currentRun = Run()
    for parameters in all_instances:
       # create a new Run object
       currentRun = Run()
-      currentName = 'run_'
+      currentName = 'run'
 
       for param_name,param in parameters:
          currentRun[param_name] = param
 
       for param_name,param in parameters:
          currentRun[param_name] = param
-         currentName += '%s_%s_' % (str(param_name),str(param))
-         #print param_name,param
-         #print currentName
+         currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
             
       # global settings for all runs
       currentRun['anzpath']            = Conf.anzpath
             
       # global settings for all runs
       currentRun['anzpath']            = Conf.anzpath
@@ -127,7 +127,7 @@ def createRuns():
       currentRun['numQualSuppPoints']     = 10
       currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
       currentRun['numQualSuppPoints']     = 10
       currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
 
-      currentRun['enable_quality_scores'] = False
+      currentRun['enable_quality_scores'] = True
       currentRun['enable_splice_signals'] = True
       currentRun['enable_intron_length']  = True
 
       currentRun['enable_splice_signals'] = True
       currentRun['enable_intron_length']  = True
 
@@ -143,48 +143,36 @@ def createRuns():
       + currentRun['totalQualSuppPoints'] 
 
       # run-specific settings
       + currentRun['totalQualSuppPoints'] 
 
       # run-specific settings
-      currentRun['training_begin']        = Conf.training_begin
-      currentRun['training_end']          = Conf.training_end
-      #currentRun['prediction_begin']      = Conf.prediction_begin
-      #currentRun['prediction_end']        = Conf.prediction_end
-
       currentRun['C']                     = 100
 
       currentRun['name']                  = currentName
       currentRun['C']                     = 100
 
       currentRun['name']                  = currentName
-      currentRun['dataset_filename']      = dataset_filename
-      currentRun['experiment_path']       = experiment_dir
+      currentRun['alignment_dir']         = alignment_dir
 
 
-      currentRun['min_intron_len'] = 20
-      currentRun['max_intron_len'] = 2000
+      currentRun['min_intron_len']        = 20
+      currentRun['max_intron_len']        = 2000
 
 
-      currentRun['min_svm_score'] = 0.0 
-      currentRun['max_svm_score'] = 1.0
+      currentRun['min_svm_score']         = 0.0 
+      currentRun['max_svm_score']         = 1.0
 
 
-      currentRun['min_qual'] = -5
-      currentRun['max_qual'] = 40
+      currentRun['min_qual']              = -5
+      currentRun['max_qual']              = 40
 
 
-      currentRun['dna_flat_files']      = Conf.dna_flat_fn
+      currentRun['dna_flat_files']        = Conf.dna_flat_fn
 
       currentRun['id']      = ctr
       ctr += 1
 
       allRuns.append(currentRun)
 
 
       currentRun['id']      = ctr
       ctr += 1
 
       allRuns.append(currentRun)
 
-
 ###############################################################################
 #
 # check for valid paths / options etc
 #
 ###############################################################################
 
 ###############################################################################
 #
 # check for valid paths / options etc
 #
 ###############################################################################
 
-
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
    for currentRun in allRuns:
 
       assert 0 < currentRun['anzpath'] < 100
-      assert 0 <= currentRun['training_begin'] < currentRun['training_end']
-      assert currentRun['training_begin'] < currentRun['training_end'] 
-      assert currentRun['prediction_begin'] < currentRun['prediction_end']
-
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
       assert currentRun['iter_steps']
 
       #assert currentRun['matchmatrixCols']
@@ -216,8 +204,7 @@ def createRuns():
       assert currentRun['enable_intron_length']  in [True,False]
 
       #assert currentRun['totalQualSuppPoints']
       assert currentRun['enable_intron_length']  in [True,False]
 
       #assert currentRun['totalQualSuppPoints']
-      assert os.path.exists(currentRun['dataset_filename'])
-      assert os.path.exists(currentRun['experiment_path'])
+      assert os.path.exists(currentRun['alignment_dir'])
 
    return allRuns
 
 
    return allRuns