###############################################################################
#
# This file contains the settings for one experiment.
#
# The general idea is as follows:
#
# Suppose you have a machine learning algorithm for which you want to perform
# model selection. Then for each value of, for example, the parameter C of a
# C-SVM, this script generates a Run object (a subclass of dict) that stores
# the parameters of that run.
#
###############################################################################
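# A minimal sketch (not part of the original script) of how the idea above
# could be extended to an actual sweep over several C values -- each candidate
# value would simply yield one more Run object in the list returned by
# createRuns(). The values below are purely hypothetical:
#
#     for C in [1, 10, 100, 1000]:
#         run = Run()                 # one parameter set per run
#         run['C'] = C
#         run['name'] = 'run_C_%d' % C
#         allRuns.append(run)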

import qpalma.Configuration as Conf
from Run import *
import pdb
import os
import os.path

def createRuns():
    # specify n for n-fold cross validation
    numFolds = 5

    # the main directory where all results are stored
    experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test'

    assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'

    # list of regularization parameters and additional flags for different runs
    # for example:
    # - with quality scores
    # - without quality scores
    #
    bool2str = ['-','+']

    allRuns = []

    #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_10k'
    #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped'
    dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_1k'
    #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/new_dataset_100'

    for QFlag in [True,False]:
        for SSFlag in [True,False]:
            #for ILFlag in [True]:
            for ILFlag in [False]:

                # create a new Run object
                currentRun = Run()

                # global settings for all runs
                currentRun['anzpath'] = Conf.anzpath
                currentRun['iter_steps'] = Conf.iter_steps
                currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
                currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
                currentRun['mode'] = Conf.mode
                currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound

                currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
                currentRun['print_matrix'] = Conf.print_matrix
                currentRun['read_size'] = Conf.read_size

                currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
                currentRun['numDonSuppPoints'] = 10
                currentRun['numAccSuppPoints'] = 10

                currentRun['numQualPlifs'] = Conf.numQualPlifs
                currentRun['numQualSuppPoints'] = 10
                currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']

                currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
                    + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
                    + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
                    + currentRun['totalQualSuppPoints']
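
                # Worked example of the feature count above (illustration only;
                # matchmatrixRows/Cols and numQualPlifs come from Conf and may
                # differ): with a hypothetical 6x6 match matrix and 30 quality
                # plifs this would be 2 + 10 + 10 + 36 + 30*10 = 358 features.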

                # run-specific settings
                currentRun['training_begin'] = Conf.training_begin
                currentRun['training_end'] = Conf.training_end
                currentRun['prediction_begin'] = Conf.prediction_begin
                currentRun['prediction_end'] = Conf.prediction_end

                currentRun['enable_quality_scores'] = QFlag
                currentRun['enable_splice_signals'] = SSFlag
                currentRun['enable_intron_length'] = ILFlag

                currentName = 'run_%s_quality_%s_splicesignals_%s_intron_len' %\
                    (bool2str[QFlag],bool2str[SSFlag],bool2str[ILFlag])

                currentRun['C'] = 100

                currentRun['name'] = currentName
                currentRun['dataset_filename'] = dataset_filename
                currentRun['experiment_path'] = experiment_dir

                currentRun['min_intron_len'] = 20
                currentRun['max_intron_len'] = 2000

                #currentRun['min_intron_len'] = 10
                #currentRun['max_intron_len'] = 100

                currentRun['min_svm_score'] = 0.0
                currentRun['max_svm_score'] = 1.0

                currentRun['min_qual'] = -5
                currentRun['max_qual'] = 40

                currentRun['dna_flat_files'] = Conf.dna_flat_fn


                allRuns.append(currentRun)

    #
    # check for valid paths / options etc
    #
    for currentRun in allRuns:

        assert 0 < currentRun['anzpath'] < 100
        assert 0 <= currentRun['training_begin'] < currentRun['training_end']
        assert currentRun['training_end'] <= currentRun['prediction_begin'] < currentRun['prediction_end']

        assert currentRun['iter_steps']

        #assert currentRun['matchmatrixCols']
        #assert currentRun['matchmatrixRows']

        assert currentRun['mode'] in ['normal','using_quality_scores']

        #assert currentRun['numConstraintsPerRound']

        assert 0 < currentRun['numFeatures'] < 10000

        #assert currentRun['numLengthSuppPoints']
        #assert currentRun['numDonSuppPoints']
        #assert currentRun['numAccSuppPoints']
        #assert currentRun['numQualPlifs']
        #assert currentRun['numQualSuppPoints']
        #assert numQualPlifs >= 0
        #assert numDonSuppPoints > 1
        #assert numAccSuppPoints > 1
        #assert numLengthSuppPoints > 1
        #assert numQualSuppPoints > 1

        assert currentRun['print_matrix'] in [True,False]
        assert 0 < currentRun['read_size'] < 100
        assert currentRun['remove_duplicate_scores'] in [True,False]

        assert currentRun['enable_quality_scores'] in [True,False]
        assert currentRun['enable_splice_signals'] in [True,False]
        assert currentRun['enable_intron_length'] in [True,False]

        #assert currentRun['totalQualSuppPoints']
        assert os.path.exists(currentRun['dataset_filename'])
        assert os.path.exists(currentRun['experiment_path'])

    return allRuns

if __name__ == '__main__':
    allRuns = createRuns()
    pdb.set_trace()
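    # A possible way to inspect the generated runs instead of dropping into
    # pdb (sketch only, not part of the original script):
    #
    #     for run in allRuns:
    #         print run['name'], 'C =', run['C'], 'features =', run['numFeatures']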