+ fix changes we made so far
[qpalma.git] / scripts / Experiment.py
1 ###############################################################################
2 #
3 # This file contains settings for one experiment
4 #
5 # The general idea is as follows:
6 #
7 # Suppose you have a machine learning algorithm you want to perform model
8 # selection with. Then, for each different value of, for example, C for a C-SVM, this
9 # script generates a Run object (a subclass of dict) storing the parameters.
10 #
11 ###############################################################################
12
13 import qpalma.Configuration as Conf
14 from Run import *
15 import pdb
16 import os
17 import os.path
18
def get_dataset(dataset_size,num_nodes):
    """Return one parameter set describing a single fixed prediction window.

    Both arguments are accepted so the signature matches the other instance
    generators in this file; neither of them is read.

    Returns a one-element list whose entry is a list of (name, value) pairs.
    """
    fixed_window = [
        ('prediction_begin', 400000),
        ('prediction_end', 440000),
    ]
    return [fixed_window]
29
30
def get_dataset_splitting_instances(dataset_size,num_nodes):
    """Split the range [0, dataset_size) into num_nodes contiguous windows.

    Every window except the last spans dataset_size // num_nodes entries; the
    last window always ends at dataset_size, so it also absorbs the remainder
    when the size is not evenly divisible.

    Returns a list with one entry per node. Each entry is a list of
    (name, value) pairs holding 'prediction_begin' and 'prediction_end'.

    Raises ZeroDivisionError if num_nodes is 0.
    """
    # Floor division keeps the boundaries integral even when true division is
    # in effect; for ints it is identical to the classic Python 2 '/'.
    part = dataset_size // num_nodes

    all_instances = []
    begin = 0

    for idx in range(1, num_nodes + 1):
        if idx == num_nodes:
            # last node: run up to the very end of the dataset
            end = dataset_size
        else:
            end = begin + part

        all_instances.append([
            ('prediction_begin', begin),
            ('prediction_end', end),
        ])

        # the next window starts where this one stopped
        begin = end

    return all_instances
53
54
def get_scoring_instances():
    """Enumerate every on/off combination of the three scoring switches.

    Returns a list of eight parameter sets. Each set is a list of
    (name, value) pairs for 'enable_quality_scores', 'enable_splice_signals'
    and 'enable_intron_length', in that order. The quality-score switch
    varies slowest and the intron-length switch fastest; True comes before
    False for each switch.
    """
    choices = (True, False)
    return [
        [
            ('enable_quality_scores', use_quality),
            ('enable_splice_signals', use_splice),
            ('enable_intron_length', use_intron_len),
        ]
        for use_quality in choices
        for use_splice in choices
        for use_intron_len in choices
    ]
68
69
def createRuns():
    """Build and validate the list of Run objects for this experiment.

    One Run is created per parameter set produced by the active instance
    generator (currently get_dataset_splitting_instances). Each Run is
    filled with the instance-specific parameters, the global settings taken
    from qpalma.Configuration and a number of fixed values, and receives a
    1-based 'id' and a 'name' derived from its instance parameters.

    The three 'enable_*' switches are only given their default values when
    the instance parameters did not already supply them, so the generated
    run name and the stored flags always agree.

    Returns the list of Run objects.

    Raises AssertionError if the experiment directory or dataset file does
    not exist, or if any run fails the sanity checks at the end.
    """
    # the main directory where all results are stored
    experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'

    assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'

    dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_full'

    dataset_size = 500000
    num_nodes = 10

    # pick exactly one of the instance generators below
    #all_instances = get_scoring_instances()
    all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
    #all_instances = get_dataset(dataset_size,num_nodes)

    # defaults for the scoring switches; instance parameters take precedence
    default_flags = (
        ('enable_quality_scores', False),
        ('enable_splice_signals', True),
        ('enable_intron_length', True),
    )

    allRuns = []
    ctr = 1

    for parameters in all_instances:
        # create a new Run object
        currentRun = Run()
        currentName = 'run_'

        # instance-specific parameters also determine the name of the run
        for param_name,param in parameters:
            currentRun[param_name] = param
            currentName += '%s_%s_' % (str(param_name),str(param))

        # global settings for all runs
        currentRun['anzpath'] = Conf.anzpath
        currentRun['iter_steps'] = Conf.iter_steps
        currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
        currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
        currentRun['mode'] = Conf.mode
        currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound

        currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
        currentRun['print_matrix'] = Conf.print_matrix
        currentRun['read_size'] = Conf.read_size

        currentRun['numDonSuppPoints'] = 10
        currentRun['numAccSuppPoints'] = 10

        currentRun['numQualPlifs'] = Conf.numQualPlifs
        currentRun['numQualSuppPoints'] = 10
        currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']

        # do not clobber switches that came in through the instance parameters
        for flag_name, flag_default in default_flags:
            if flag_name not in currentRun:
                currentRun[flag_name] = flag_default

        # if we are not using an intron length model at all we do not need
        # the full set of support points
        if currentRun['enable_intron_length']:
            currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
        else:
            currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints

        currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
            + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
            + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
            + currentRun['totalQualSuppPoints']

        # run-specific settings
        # NOTE: 'prediction_begin' / 'prediction_end' are expected to come
        # from the instance parameters; they are not set here.
        currentRun['training_begin'] = Conf.training_begin
        currentRun['training_end'] = Conf.training_end

        currentRun['C'] = 100

        currentRun['name'] = currentName
        currentRun['dataset_filename'] = dataset_filename
        currentRun['experiment_path'] = experiment_dir

        currentRun['min_intron_len'] = 20
        currentRun['max_intron_len'] = 2000

        currentRun['min_svm_score'] = 0.0
        currentRun['max_svm_score'] = 1.0

        currentRun['min_qual'] = -5
        currentRun['max_qual'] = 40

        currentRun['dna_flat_files'] = Conf.dna_flat_fn

        currentRun['id'] = ctr
        ctr += 1

        allRuns.append(currentRun)

    ###############################################################################
    #
    # check for valid paths / options etc
    #
    ###############################################################################

    for currentRun in allRuns:

        assert 0 < currentRun['anzpath'] < 100
        assert 0 <= currentRun['training_begin'] < currentRun['training_end']
        assert currentRun['prediction_begin'] < currentRun['prediction_end']

        assert currentRun['iter_steps']

        assert currentRun['mode'] in ['normal','using_quality_scores']

        assert 0 < currentRun['numFeatures'] < 10000

        assert currentRun['print_matrix'] in [True,False]
        assert 0 < currentRun['read_size'] < 100
        assert currentRun['remove_duplicate_scores'] in [True,False]

        assert currentRun['enable_quality_scores'] in [True,False]
        assert currentRun['enable_splice_signals'] in [True,False]
        assert currentRun['enable_intron_length'] in [True,False]

        assert os.path.exists(currentRun['dataset_filename'])
        assert os.path.exists(currentRun['experiment_path'])

    return allRuns
223
# When executed as a script, build all runs once; createRuns() performs its
# own sanity checks, so this doubles as a configuration smoke test.
if __name__ == '__main__':
    allRuns = createRuns()