+ added faster evaluation function
[qpalma.git] / scripts / Experiment.py
1 ###############################################################################
2 #
3 # This file contains settings for one experiment
4 #
5 # The general idea is as follows:
6 #
# Suppose you have a machine learning algorithm you want to perform model
# selection with. Then, for each different parameter value (for example C for
# a C-SVM), this script generates a Run object, a subclass of dict, storing
# the parameters.
10 #
11 ###############################################################################
12
13 import qpalma.Configuration as Conf
14 from Run import *
15 import pdb
16 import os
17 import os.path
18
def get_dataset_splitting_instances(dataset_size,num_nodes):
    """
    Split the range [0, dataset_size) into num_nodes contiguous chunks.

    Returns a list with one parameter list per node, each of the form
    [('prediction_begin', begin), ('prediction_end', end)].  The last
    node absorbs the remainder when dataset_size is not evenly
    divisible by num_nodes, so the chunks always cover the full range.
    """
    # floor division: '/' on ints only floors under classic Python 2
    # semantics and would yield a float under Python 3, corrupting the
    # begin/end indices
    part = dataset_size // num_nodes

    all_instances = []
    end = 0
    for idx in range(1, num_nodes + 1):
        begin = end
        if idx == num_nodes:
            # last node takes everything that is left
            end = dataset_size
        else:
            end = begin + part

        all_instances.append([
            ('prediction_begin', begin),
            ('prediction_end', end)])

    return all_instances
41
42
def get_scoring_instances():
    """
    Enumerate all 8 combinations of the three scoring feature flags.

    Returns a list of parameter lists, one per combination of
    enable_quality_scores / enable_splice_signals / enable_intron_length,
    each flag taking the values True and False.
    """
    flags = [True, False]
    return [
        [('enable_quality_scores', quality),
         ('enable_splice_signals', splice),
         ('enable_intron_length', intron)]
        for quality in flags
        for splice in flags
        for intron in flags]
56
57
58 def createRuns():
59 # specify n for n-fold cross validation
60 numFolds=5
61
62 # the main directory where all results are stored
63 experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
64
65 assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
66
67 # list of regularization parameters and additional flags for different runs
68 # for example:
69 # - with quality scores
70 # - without quality scores
71 #
72 bool2str = ['-','+']
73
74 allRuns = []
75
76 dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_2nd'
77
78 dataset_size = 400000
79 num_nodes = 10
80
81 ctr = 1
82
83 #all_instances = get_scoring_instances()
84 all_instances = get_dataset_splitting_instances(dataset_size,num_nodes)
85
86 for parameters in all_instances:
87 # create a new Run object
88 currentRun = Run()
89 currentName = 'run_'
90
91 for param_name,param in parameters:
92 currentRun[param_name] = param
93 currentName += '%s_%s_' % (str(param_name),str(param))
94 print param_name,param
95 print currentName
96
97 # global settings for all runs
98 currentRun['anzpath'] = Conf.anzpath
99 currentRun['iter_steps'] = Conf.iter_steps
100 currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
101 currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
102 currentRun['mode'] = Conf.mode
103 currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound
104
105 currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
106 currentRun['print_matrix'] = Conf.print_matrix
107 currentRun['read_size'] = Conf.read_size
108
109 currentRun['numDonSuppPoints'] = 10
110 currentRun['numAccSuppPoints'] = 10
111
112 currentRun['numQualPlifs'] = Conf.numQualPlifs
113 currentRun['numQualSuppPoints'] = 10
114 currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
115
116 currentRun['enable_quality_scores'] = True
117 currentRun['enable_splice_signals'] = True
118 currentRun['enable_intron_length'] = True
119
120 # if we are not using an intron length model at all we do not need the support points
121 currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
122
123 if currentRun['enable_intron_length'] == False:
124 currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
125
126 currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
127 + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
128 + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
129 + currentRun['totalQualSuppPoints']
130
131 # run-specific settings
132 currentRun['training_begin'] = Conf.training_begin
133 currentRun['training_end'] = Conf.training_end
134 #currentRun['prediction_begin'] = Conf.prediction_begin
135 #currentRun['prediction_end'] = Conf.prediction_end
136
137 currentRun['C'] = 100
138
139 currentRun['name'] = currentName
140 currentRun['dataset_filename'] = dataset_filename
141 currentRun['experiment_path'] = experiment_dir
142
143 currentRun['min_intron_len'] = 20
144 currentRun['max_intron_len'] = 2000
145
146 currentRun['min_svm_score'] = 0.0
147 currentRun['max_svm_score'] = 1.0
148
149 currentRun['min_qual'] = -5
150 currentRun['max_qual'] = 40
151
152 currentRun['dna_flat_files'] = Conf.dna_flat_fn
153
154 currentRun['id'] = ctr
155 ctr += 1
156
157 allRuns.append(currentRun)
158
159
160 ###############################################################################
161 #
162 # check for valid paths / options etc
163 #
164 ###############################################################################
165
166
167 for currentRun in allRuns:
168
169 assert 0 < currentRun['anzpath'] < 100
170 assert 0 <= currentRun['training_begin'] < currentRun['training_end']
171 assert currentRun['training_begin'] < currentRun['training_end']
172 assert currentRun['prediction_begin'] < currentRun['prediction_end']
173
174 assert currentRun['iter_steps']
175
176 #assert currentRun['matchmatrixCols']
177 #assert currentRun['matchmatrixRows']
178
179 assert currentRun['mode'] in ['normal','using_quality_scores']
180
181 #assert currentRun['numConstraintsPerRound']
182
183 assert 0 < currentRun['numFeatures'] < 10000
184
185 # assert currentRun['numLengthSuppPoints']
186 # assert currentRun['numDonSuppPoints']
187 # assert currentRun['numAccSuppPoints']
188 #assert currentRun['numQualPlifs']
189 #assert currentRun['numQualSuppPoints']
190 #assert numQualPlifs >= 0
191 #assert numDonSuppPoints > 1
192 #assert numAccSuppPoints > 1
193 #assert numLengthSuppPoints > 1
194 #assert numQualSuppPoints > 1
195
196 assert currentRun['print_matrix'] in [True,False]
197 assert 0 < currentRun['read_size'] < 100
198 assert currentRun['remove_duplicate_scores'] in [True,False]
199
200 assert currentRun['enable_quality_scores'] in [True,False]
201 assert currentRun['enable_splice_signals'] in [True,False]
202 assert currentRun['enable_intron_length'] in [True,False]
203
204 #assert currentRun['totalQualSuppPoints']
205 assert os.path.exists(currentRun['dataset_filename'])
206 assert os.path.exists(currentRun['experiment_path'])
207
208 return allRuns
209
# Script entry point: build all Run configurations when executed directly.
if __name__ == '__main__':
    allRuns = createRuns()
    #pdb.set_trace()