+ added configuration file parsing and checking functions
[qpalma.git] / scripts / qpalma_pipeline.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 #
13 # This file contains the main interface to the QPalma pipeline.
14 #
15
16 import os
17 import os.path
18 import pdb
19 import sys
20
21 from optparse import OptionParser
22
23 from qpalma.gridtools import ApproximationTask,PreprocessingTask
24 from qpalma.gridtools import AlignmentTask,PostprocessingTask
25
26
# Usage string for the command line interface of this script.
Errormsg = """Usage is: python qpalma_pipeline.py <config filename>"""


# Disabled draft of a command line option parser. It is kept as an inert
# string literal and is never executed.
# NOTE(review): before re-enabling it, "-ci" and "-xx" have to be renamed --
# optparse only accepts short options of the form "-x" (one character).
"""
def create_option_parser():
    parser = OptionParser()

    #
    parser.add_option("-ci", "--check_and_init", help="check configuration and initialize directories")

    #
    parser.add_option("-r", "--run", help="write report to FILE", metavar="FILE")

    #
    parser.add_option("-xx", "--clear", action="store_false", dest="verbose", help="cleanup directories delete all created data")

    return parser
"""

# Short alias for os.path.join.
jp = os.path.join
47
def parseSettings(filename):
    """
    Read a configuration file and return its entries as a dictionary.

    Every line that is neither empty nor a comment (first non-blank character
    is '#') has to be of the form ``key = value``. All blanks are removed from
    such a line, so neither keys nor values can contain spaces. The line is
    split at the first '=' only, hence values may contain '=' themselves.
    Values are always returned as strings.

    Entries a configuration file may contain, for example:

        result_dir = /fml/ag-raetsch/...
        read_ascii_data_fn = /fml/ag-raetsch/...
        num_splits = 50
        global_log_fn = ~/qpalma.log

    filename -- path of the configuration file

    Returns a dictionary mapping each key to its value string.

    Raises ValueError if a line does not contain a '=' sign.
    """

    global_settings = {}

    # the with statement closes the file even if a malformed line is hit
    with open(filename) as settings_file:
        for line_no, raw_line in enumerate(settings_file, 1):
            line = raw_line.strip()

            # skip empty lines and comments, including indented comments
            if not line or line.startswith('#'):
                continue

            key, separator, val = line.replace(' ', '').partition('=')
            if not separator:
                raise ValueError('Error: Line %d of %s is not of the form key=value: %s'
                                 % (line_no, filename, line))

            global_settings[key] = val

    return global_settings
69
70
def makeSettings(global_settings):
    """
    Add the working directories to the settings and create them on disk.

    The entries 'approximation_dir', 'preproc_dir', 'postproc_dir',
    'prediction_dir' and 'training_dir' are added to global_settings; each of
    them points to a subdirectory of 'result_dir'. Subdirectories that do not
    exist yet are created, and the log file named by 'global_log_fn' is
    created as an empty file if it is missing. Problems while creating them
    are reported on stdout and do not abort the function.

    global_settings -- dictionary containing at least the keys 'result_dir'
                       and 'global_log_fn'; it is modified in place

    Returns the same, extended dictionary.

    Raises AssertionError if the result directory does not exist and KeyError
    if one of the two required keys is missing.
    """

    result_dir = global_settings['result_dir']

    # first check whether the top level result directory exists; the error is
    # raised explicitly so that the check also takes place under "python -O"
    if not os.path.exists(result_dir):
        raise AssertionError('Error: You have to specify a existing result directory!')

    # now create some subdirectories needed for the different steps performed by QPalma
    subdirectories = [
        ('approximation_dir', 'approximation'),
        ('preproc_dir', 'preprocessing'),
        ('postproc_dir', 'postprocessing'),
        ('prediction_dir', 'prediction'),
        ('training_dir', 'training'),
    ]

    for dir_name, subdir in subdirectories:
        global_settings[dir_name] = os.path.join(result_dir, subdir)

    for dir_name, _ in subdirectories:
        path = global_settings[dir_name]

        # a directory left over from an earlier run is not an error
        if os.path.isdir(path):
            continue

        try:
            os.mkdir(path)
        except OSError:
            print('Error: There was a problem generating the subdirectory: %s' % dir_name)

    log_fn = global_settings['global_log_fn']

    try:
        # the log is a regular file: append mode creates it if it is missing
        # and leaves the content of an existing log untouched
        open(log_fn, 'a').close()
    except (IOError, OSError):
        print('Error: There was a problem generating the logfile %s' % log_fn)

    return global_settings
100
101
def checkSettings(global_settings):
    """
    Check that all files and directories named in the settings exist.

    Every entry whose key ends in '_fn' or '_dir' is treated as a path; all
    other entries are ignored. Paths are only checked for existence.

    global_settings -- dictionary of settings

    Returns True if all paths exist.

    Raises AssertionError for the first entry whose path does not exist. The
    error is raised explicitly, so the check also takes place under
    "python -O", which removes assert statements.
    """

    for key, val in global_settings.items():
        if not key.endswith(('_fn', '_dir')):
            continue

        if not os.path.exists(val):
            raise AssertionError('Error: Path/File %s with value %s does not seem to exist!' % (key,val))

    return True
113
114
115
116
class System:
    """
    This class wraps the outer loop of the qpalma project

    It is responsible for:

    - loading and checking the config file(s)
    - setting up the different pipeline modules
    - run the experiment and report the results

    """

    def __init__(self,filename):
        """
        Initialize the system by loading and parsing the settings file to obtain
        all parameters.

        filename -- path of the configuration file

        The parsed and checked settings are stored in self.global_settings.
        """

        #parser = create_option_parser()
        #(options, args) = parser.parse_args()

        global_settings = parseSettings(filename)
        global_settings = makeSettings(global_settings)

        # The check is called outside of an assert statement: "python -O"
        # removes assert statements together with the calls inside them.
        if not checkSettings(global_settings):
            raise AssertionError('Check your settings some entries were invalid!')

        self.global_settings = global_settings

    def training(self):
        """
        This function is responsible for the whole training process. It first
        converts the data to the right format needed by QPalma for the training
        algorithm.
        """

        # local import, time is not among the imports at the top of the file
        import time

        # NOTE(review): TrainingPreprocessingTask is not among the names
        # imported from qpalma.gridtools at the top of this file -- confirm
        # where it is defined and import it.
        pre_task = TrainingPreprocessingTask(self.global_settings)
        pre_task.createJobs()
        pre_task.submit()

        # Poll every 20 seconds. The result is compared to False explicitly,
        # so any other return value (including None) ends the loop.
        while pre_task.checkIfTaskFinished() == False:
            time.sleep(20)


    def prediction(self):
        """
        This function encapsulates all steps needed to perform a prediction. Given
        the parameter of the training and paths to a prediction set it will
        generate several output files containing the spliced alignments
        """

        # The steps of the prediction in the order in which they are executed
        pipeline = (
            # Before creating a candidate spliced read dataset we have to first
            # filter the matches from the first seed finding run.
            ApproximationTask,

            # After filtering combine the filtered matches from the first run
            # and the found matches from the second run to a full dataset
            PreprocessingTask,

            # Now that we have a dataset we can perform the accurate alignments
            # for this data
            AlignmentTask,

            # The results of the above alignment step can be converted to a
            # data format needed for further postprocessing.
            PostprocessingTask,
        )

        for task_class in pipeline:
            task = task_class(self.global_settings)
            task.createJobs()
            task.submit()
            # NOTE(review): the return value is ignored here while training()
            # polls it -- confirm that this call waits for the jobs to finish.
            task.checkIfTaskFinished()

        print("Success!")
200
201
if __name__ == '__main__':
    # The configuration file is the only mandatory command line argument.
    # A missing argument or a non-existing file ends the script with the
    # usage message (written to stderr) and exit status 1.
    if len(sys.argv) < 2 or not os.path.exists(sys.argv[1]):
        sys.exit(Errormsg)

    filename = sys.argv[1]
    system_obj = System(filename)
    #system_obj.prediction()
    #system_obj.training()