scripts/qpalma_pipeline.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Written (W) 2008 Fabio De Bona
# Copyright (C) 2008 Max-Planck-Society

#
# This file contains the main interface to the QPalma pipeline.
#

import os
import os.path
import pdb
import sys

from qpalma.gridtools import ApproximationTask, PreprocessingTask
from qpalma.gridtools import AlignmentTask, PostprocessingTask
from qpalma.gridtools import TrainingTask

from qpalma.DatasetUtils import generatePredictionDataset, generateTrainingDataset

from qpalma.SettingsParser import parseSettings

from qpalma.utils import logwrite

Errormsg = """Usage is:

    python qpalma_pipeline.py train <config filename> <training data filename>
or
    python qpalma_pipeline.py predict <config filename> <parameter filename> <putative unspliced reads filename> <putative spliced reads filename>

"""


def printMessage(mes):
    """Print a banner-style progress message to stdout."""
    print '#'*80
    print '\t\t\t%s...\n' % mes
    print '#'*80


class System:
    """
    This class wraps the outer loop of the QPalma project.

    It is responsible for:

      - loading and checking the config file(s)
      - setting up the different pipeline modules
      - running the experiment and reporting the results

    """

    def __init__(self, filename):
        """
        Initialize the system by loading and parsing the settings file to
        obtain all parameters.
        """

        self.settings = parseSettings(filename)
        logwrite('Parsed settings; system set up.', self.settings)


    def training(self, training_data_fn):
        """
        This function is responsible for the whole training process. It first
        converts the data to the format needed by QPalma's training algorithm.
        """
        logwrite('Beginning of training.\n', self.settings)

        printMessage('Starting dataset generation')

        self.settings['training_data_fn'] = training_data_fn

        # Collect the data and create a pickled training set
        generateTrainingDataset(self.settings)

        printMessage('Starting training')

        # Now that we have a dataset we can perform the training
        train_task = TrainingTask(self.settings)
        train_task.CreateJobs()
        train_task.Submit()
        train_task.CheckIfTaskFinished()

        logwrite('End of training.\n', self.settings)


    def prediction(self, param_fn, unspliced_reads_fn, spliced_reads_fn):
        """
        This function encapsulates all steps needed to perform a prediction.
        Given the parameters obtained from training and the paths to a
        prediction set, it generates several output files containing the
        spliced alignments.
        """

        logwrite('Beginning of prediction.\n', self.settings)

        printMessage('Starting approximation')

        self.settings['prediction_param_fn'] = param_fn
        self.settings['unspliced_reads_fn'] = unspliced_reads_fn
        self.settings['spliced_reads_fn'] = spliced_reads_fn

        # Before creating a candidate spliced read dataset we first have to
        # filter the matches from the first seed finding run.

        approx_task = ApproximationTask(self.settings)
        approx_task.CreateJobs()
        approx_task.Submit()
        approx_task.CheckIfTaskFinished()

        # After filtering, combine the filtered matches from the first run with
        # the matches found in the second run into a full dataset.

        printMessage('Starting dataset generation')

        generatePredictionDataset(self.settings)

        printMessage('Starting alignments')

        # Now that we have a dataset we can perform the accurate alignments
        align_task = AlignmentTask(self.settings)
        align_task.CreateJobs()
        align_task.Submit()
        align_task.CheckIfTaskFinished()

        printMessage('Postprocessing')

        # The results of the above alignment step are converted to a data
        # format needed for further postprocessing.
        post_task = PostprocessingTask(self.settings)
        post_task.CreateJobs()
        post_task.Submit()
        post_task.CheckIfTaskFinished()

        logwrite('End of prediction.\n', self.settings)


if __name__ == '__main__':
    if len(sys.argv) != 4 and len(sys.argv) != 6:
        print 'Invalid number of arguments: %d given' % (len(sys.argv) - 1)
        print Errormsg
        sys.exit(1)

    mode = sys.argv[1]
    if mode not in ['predict', 'train']:
        print Errormsg
        sys.exit(1)

    filename = sys.argv[2]
    if not os.path.exists(filename):
        print 'Config file %s does not exist!' % filename
        print Errormsg
        sys.exit(1)

    # Create the system object from the settings file
    system_obj = System(filename)

    if mode == 'predict':
        param_fn = sys.argv[3]
        unspliced_reads_fn = sys.argv[4]
        spliced_reads_fn = sys.argv[5]
        system_obj.prediction(param_fn, unspliced_reads_fn, spliced_reads_fn)
    elif mode == 'train':
        training_data_fn = sys.argv[3]
        system_obj.training(training_data_fn)
    else:
        assert False