+ minor changes in the paths
[qpalma.git] / scripts / Experiment.py
1 ###############################################################################
2 #
3 # This file contains settings for one experiment
4 #
5 # The general idea is as follows:
6 #
7 # Suppose you have a machine learning algorithm you want to perform model
8 # selection with. Then, for each different value of, for example, C for a C-SVM, this
9 # script generates a Run object (a subclass of dict) storing the parameters.
10 #
11 ###############################################################################
12
13 import QPalmaConfiguration as Conf
14 from Run import *
15 import pdb
16 import os
17 import os.path
18 import cPickle
19
def get_dataset(dataset_size,num_nodes):
    """
    Return the single, hard-coded prediction window.

    Both arguments are accepted but never read. The result is always a
    one-element list whose entry is the parameter list
    [('prediction_begin', 400000), ('prediction_end', 440000)].
    """
    window = [('prediction_begin', 400000), ('prediction_end', 440000)]
    return [window]
30
31
def get_dataset_splitting_instances(dataset_size,num_nodes):
    """
    Split the index range [0, dataset_size) into num_nodes consecutive chunks.

    Every chunk except the last spans dataset_size // num_nodes entries; the
    last chunk always ends at dataset_size and therefore absorbs the remainder.

    Returns a list with one entry per node. Each entry is the parameter list
    [('prediction_begin', begin), ('prediction_end', end)].

    Raises ZeroDivisionError if num_nodes is 0. A negative num_nodes yields an
    empty list.
    """
    # Floor division keeps the chunk boundaries integral even where '/' means
    # true division (Python 3 or "from __future__ import division").
    part = dataset_size // num_nodes

    all_instances = []
    begin = 0
    for idx in range(1, num_nodes + 1):
        if idx == num_nodes:
            # the last node takes everything that is left
            end = dataset_size
        else:
            end = begin + part

        all_instances.append([('prediction_begin', begin), ('prediction_end', end)])

        # the next chunk starts where this one stopped
        begin = end

    return all_instances
54
55
def get_training_instances():
    """
    Return the parameter instances used for training.

    The result holds exactly one instance: a list of (name, value) pairs in
    which quality scores, splice signals and intron length are all enabled.
    """
    flag_names = (
        'enable_quality_scores',
        'enable_splice_signals',
        'enable_intron_length',
    )
    enabled = [(flag, True) for flag in flag_names]
    return [enabled]
67
68
def get_scoring_instances():
    """
    Return one parameter instance for every on/off combination of the flags.

    The three flags (quality scores, splice signals, intron length) are each
    tried with True first and then False, the quality-score flag varying
    slowest and the intron-length flag fastest. This gives 8 instances, each a
    list of (name, value) pairs.
    """
    switch = (True, False)
    return [
        [
            ('enable_quality_scores', quality),
            ('enable_splice_signals', splice),
            ('enable_intron_length', intron),
        ]
        for quality in switch
        for splice in switch
        for intron in switch
    ]
82
83
def _check_run(run):
    """
    Sanity-check the settings of one run.

    Raises AssertionError if a value is out of range, a flag is not boolean or
    the alignment directory does not exist. The individual support point and
    plif counts are not checked.
    """
    assert 0 < run['anzpath'] < 100
    assert run['iter_steps']

    assert run['mode'] in ['normal','using_quality_scores']

    assert 0 < run['numFeatures'] < 10000

    assert run['print_matrix'] in [True,False]
    assert 0 < run['read_size'] < 100
    assert run['remove_duplicate_scores'] in [True,False]

    assert run['enable_quality_scores'] in [True,False]
    assert run['enable_splice_signals'] in [True,False]
    assert run['enable_intron_length'] in [True,False]

    assert os.path.exists(run['alignment_dir'])


def createRuns():
    """
    Build and validate one Run object per training parameter instance.

    The pickled configuration object at Conf.conf_object_path supplies the
    alignment directory; the remaining settings come from the Conf module or
    are fixed here. Each run is named 'run' followed by one '_<flag>_<+|->'
    suffix per instance parameter and receives a 1-based 'id'.

    Returns the list of Run objects.

    Raises AssertionError if any run fails the sanity checks in _check_run.
    """
    # load the configuration object; close the file even if unpickling fails
    # NOTE: unpickling can execute arbitrary code, so this path must point to
    # a trusted file.
    conf_file = open(Conf.conf_object_path)
    try:
        Config = cPickle.load(conf_file)
    finally:
        conf_file.close()

    # the main directory where all results are stored
    alignment_dir = Config['alignment_dir']

    # maps a boolean flag to the sign used in the run name
    bool2str = ['-','+']

    allRuns = []
    for idx, parameters in enumerate(get_training_instances()):
        # create a new Run object
        currentRun = Run()
        currentName = 'run'

        # instance-specific flags; each one is also encoded in the run name
        for param_name, param in parameters:
            currentRun[param_name] = param
            currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))

        # global settings for all runs
        currentRun['anzpath'] = Conf.anzpath
        currentRun['iter_steps'] = Conf.iter_steps
        currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
        currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
        currentRun['mode'] = Conf.mode
        currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound

        currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
        currentRun['print_matrix'] = Conf.print_matrix
        currentRun['read_size'] = Conf.read_size

        currentRun['numDonSuppPoints'] = 10
        currentRun['numAccSuppPoints'] = 10

        currentRun['numQualPlifs'] = Conf.numQualPlifs
        currentRun['numQualSuppPoints'] = 10
        currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']

        # Default the flags to True only where the instance did not set them.
        # Overwriting them unconditionally would let the stored flags disagree
        # with the run name built above and would make the reduced
        # intron-length setting below unreachable.
        # (relies on Run behaving like a dict, as the file header states)
        currentRun.setdefault('enable_quality_scores', True)
        currentRun.setdefault('enable_splice_signals', True)
        currentRun.setdefault('enable_intron_length', True)

        # if we are not using an intron length model at all we do not need the support points
        if currentRun['enable_intron_length']:
            currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
        else:
            currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints

        currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
            + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
            + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
            + currentRun['totalQualSuppPoints']

        # run-specific settings
        currentRun['C'] = 100

        currentRun['name'] = currentName
        currentRun['alignment_dir'] = alignment_dir

        currentRun['min_intron_len'] = 20
        currentRun['max_intron_len'] = 2000

        currentRun['min_svm_score'] = 0.0
        currentRun['max_svm_score'] = 1.0

        currentRun['min_qual'] = -5
        currentRun['max_qual'] = 40

        currentRun['dna_flat_files'] = Conf.dna_flat_fn

        # ids are 1-based
        currentRun['id'] = idx + 1

        allRuns.append(currentRun)

    # check for valid paths / options etc. only after every run has been built
    for currentRun in allRuns:
        _check_run(currentRun)

    return allRuns
210
if __name__ == '__main__':
    # Executed as a script: build the list of runs.
    allRuns = createRuns()
    # To inspect allRuns interactively, re-enable the debugger here:
    #pdb.set_trace()