8c9e501dc754aa7c9e1ade42b49f9da3d8387e45
[qpalma.git] / scripts / qpalma_pipeline.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 #
13 # This file contains the main interface to the QPalma pipeline.
14 #
15
16 import os
17 import os.path
18 import pdb
19 import sys
20
21 from qpalma.gridtools import ApproximationTask,PreprocessingTask
22 from qpalma.gridtools import AlignmentTask,PostprocessingTask
23
24 from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
25
26 from qpalma.SettingsParser import parseSettings
27
28 from qpalma.utils import logwrite
29
30
# Usage message printed (followed by exit status 1) whenever the script is
# invoked with a wrong argument count, an unknown mode, or a missing config file.
Errormsg = """Usage is:

python qpalma_pipeline.py train <config filename> <training data filename>
or
python qpalma_pipeline.py predict <config filename> <parameter filename> <putative unspliced reads filename> <putative spliced reads filename>

"""
38
39
40 class System:
41 """
42 This class wraps the outer loop of the qpalma project
43
44 It is responsible for:
45
46 - loading and checking the config file(s)
47 - setting up the different pipeline modules
48 - run the experiment and report the results
49
50 """
51
52 def __init__(self,filename):
53 """
54 Inititalize the system by loading and parsing the settings file to obtain
55 all parameters.
56 """
57
58 self.settings = parseSettings(filename)
59 logwrite('Parsed settings system set up.',self.settings)
60
61
62 def training(self, training_data_fn):
63 """
64 This function is responsible for the whole training process. It first
65 converts the data to the right format needed by QPalma for the training
66 algorithm.
67 """
68 logwrite('Begin of training.\n',self.settings)
69
70 print '#'*80
71 print '\t\t\tStarting approximation...\n'
72 print '#'*80
73
74 self.settings['training_data_fn'] = training_data_fn
75
76 # Collect the data and create a pickled training set
77 generateTrainingDataset(self.settings)
78
79 # Now that we have a dataset we can perform training
80 train_task = TrainingTask(self.settings)
81 train_task.CreateJobs()
82 train_task.Submit()
83 train_task.CheckIfTaskFinished()
84
85 logwrite('End of training.\n',self.settings)
86
87
88 def prediction(self, param_fn, unspliced_reads_fn, spliced_reads_fn):
89 """
90 This function encapsulates all steps needed to perform a prediction. Given
91 the parameter of the training and paths to a prediction set it will
92 generate several output files containing the spliced alignments
93 """
94
95 logwrite('Begin of prediction.\n',self.settings)
96
97 print '#'*80
98 print '\t\t\tStarting approximation...\n'
99 print '#'*80
100
101 self.settings['prediction_param_fn'] = param_fn
102 self.settings['unspliced_reads_fn'] = unspliced_reads_fn
103 self.settings['spliced_reads_fn'] = spliced_reads_fn
104
105 # Before creating a candidate spliced read dataset we have to first filter
106 # the matches from the first seed finding run.
107
108 approx_task = ApproximationTask(self.settings)
109 approx_task.CreateJobs()
110 approx_task.Submit()
111 approx_task.CheckIfTaskFinished()
112
113 # After filtering combine the filtered matches from the first run and the
114 # found matches from the second run to a full dataset
115
116 print '#'*80
117 print '\t\t\tStarting dataset generation...\n'
118 print '#'*80
119
120 generatePredictionDataset(self.settings)
121
122 print '#'*80
123 print '\t\t\tStarting alignments...\n'
124 print '#'*80
125
126 # Now that we have a dataset we can perform accurate alignments
127 align_task = AlignmentTask(self.settings)
128 align_task.CreateJobs()
129 align_task.Submit()
130 align_task.CheckIfTaskFinished()
131
132 print '#'*80
133 print '\t\t\tPostprocessing...\n'
134 print '#'*80
135
136 # The results of the above alignment step can be converted to a data format
137 # needed for further postprocessing.
138 post_task = PostprocessingTask(self.settings)
139 post_task.CreateJobs()
140 post_task.Submit()
141 post_task.CheckIfTaskFinished()
142
143 logwrite('End of prediction.\n',self.settings)
144
145
146 if __name__ == '__main__':
147 if len(sys.argv) != 4 or len(sys.argv) != 6:
148 print Errormsg
149 sys.exit(1)
150
151 mode = sys.argv[1]
152 if not mode in ['predict','train']:
153 print Errormsg
154 sys.exit(1)
155
156 filename = sys.argv[2]
157 if not os.path.exists(filename):
158 print Errormsg
159 sys.exit(1)
160
161 # creating system object
162 system_obj = System(filename)
163
164 if mode == 'predict':
165 param_fn = sys.argv[3]
166 unspliced_reads_fn = sys.argv[4]
167 spliced_reads_fn = sys.argv[5]
168 system_obj.prediction(unspliced_reads_fn, spliced_reads_fn)
169 elif mode == 'train':
170 training_data_fn = sys.argv[3]
171 system_obj.training(training_data_fn)
172 else:
173 assert False