+ restructured test cases
[qpalma.git] / scripts / qpalma_pipeline.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12 #
13 # This file contains the main interface to the QPalma pipeline.
14 #
15
16 import os
17 import os.path
18 import pdb
19 import sys
20
21 from qpalma.gridtools import ApproximationTask,PreprocessingTask
22 from qpalma.gridtools import AlignmentTask,PostprocessingTask
23
24 from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
25
26 from qpalma.SettingsParser import parseSettings
27
28 from qpalma.utils import logwrite
29
30
# Usage text shown when the command line arguments are rejected.
Errormsg = 'Usage is: python qpalma_pipeline.py predict|train <config filename>'
32
33
class System:
    """
    This class wraps the outer loop of the qpalma project.

    It is responsible for:

    - loading and checking the config file(s)
    - setting up the different pipeline modules
    - running the experiment and reporting the results

    Every pipeline stage is a task object that is driven through the same
    three calls: ``CreateJobs()``, ``Submit()`` and ``CheckIfTaskFinished()``
    (see ``_run_task``).
    """

    def __init__(self, filename):
        """
        Initialize the system by loading and parsing the settings file to
        obtain all parameters.

        filename -- path of the configuration file passed to parseSettings().
        """
        self.settings = parseSettings(filename)
        logwrite('Parsed settings system set up.', self.settings)

    @staticmethod
    def _banner(title):
        """Print `title` (tab-indented) framed by two rules of 80 '#' characters."""
        rule = '#' * 80
        # A single parenthesised argument prints the same text whether this
        # runs as a Python 2 print statement or the Python 3 print function.
        print(rule)
        print('\t\t\t' + title + '\n')
        print(rule)

    @staticmethod
    def _run_task(task):
        """Drive one task: CreateJobs(), then Submit(), then CheckIfTaskFinished()."""
        task.CreateJobs()
        task.Submit()
        task.CheckIfTaskFinished()

    def training(self):
        """
        This function is responsible for the whole training process. It first
        converts the data to the right format needed by QPalma for the
        training algorithm, then builds the training dataset and finally runs
        the training task.
        """
        # The original code used both classes without importing them, so this
        # method always died with a NameError. They are imported locally so
        # that a wrong location cannot break prediction mode.
        # NOTE(review): assumed to live in qpalma.gridtools next to the other
        # task classes -- confirm.
        from qpalma.gridtools import TrainingPreprocessingTask, TrainingTask

        logwrite('Begin of training.\n', self.settings)

        # NOTE(review): banner text kept unchanged; it says "approximation"
        # although this stage is the training preprocessing -- confirm wording.
        self._banner('Starting approximation...')

        # When we are given only genomic reads we first generate artificially
        # spliced ones in order to generate a training set.
        # The task is driven through the same capitalised method names as every
        # other task in this file (the original used createJobs/submit/
        # checkIfTaskFinished here only).
        self._run_task(TrainingPreprocessingTask(self.settings))

        # Collect the data and create a pickled training set
        generateTrainingDataset(self.settings)

        # Now that we have a dataset we can perform training
        self._run_task(TrainingTask(self.settings))

        logwrite('End of training.\n', self.settings)

    def prediction(self):
        """
        This function encapsulates all steps needed to perform a prediction.
        Given the parameters of the training and paths to a prediction set it
        will generate several output files containing the spliced alignments.
        """
        logwrite('Begin of prediction.\n', self.settings)

        self._banner('Starting approximation...')

        # Before creating a candidate spliced read dataset we have to first
        # filter the matches from the first seed finding run.
        self._run_task(ApproximationTask(self.settings))

        # After filtering combine the filtered matches from the first run and
        # the found matches from the second run to a full dataset
        self._banner('Starting dataset generation...')
        generatePredictionDataset(self.settings)

        # Now that we have a dataset we can perform accurate alignments
        self._banner('Starting alignments...')
        self._run_task(AlignmentTask(self.settings))

        # The results of the above alignment step can be converted to a data
        # format needed for further postprocessing.
        self._banner('Postprocessing...')
        self._run_task(PostprocessingTask(self.settings))

        logwrite('End of prediction.\n', self.settings)
139
140
def parse_command_line(argv):
    """
    Validate the command line of the pipeline script.

    argv -- full argument vector, program name first (as in sys.argv).

    Returns the tuple (mode, filename) when at least two arguments are given,
    mode is 'predict' or 'train' and filename exists; returns None otherwise.
    Arguments beyond the second one are ignored, as before.
    """
    if len(argv) < 3:
        return None

    mode, filename = argv[1], argv[2]
    if mode not in ('predict', 'train'):
        return None
    if not os.path.exists(filename):
        return None

    return mode, filename


def main(argv=None):
    """
    Run the pipeline in the mode requested on the command line.

    argv -- argument vector to use; defaults to sys.argv.

    On an invalid command line the usage text is written to stderr and the
    process exits with status 1 (sys.exit with a string argument). Explicit
    checks are used instead of assert because asserts are removed under -O.
    """
    if argv is None:
        argv = sys.argv

    parsed = parse_command_line(argv)
    if parsed is None:
        sys.exit(Errormsg)
    mode, filename = parsed

    # creating system object
    system_obj = System(filename)

    # parse_command_line() only lets 'predict' and 'train' through.
    if mode == 'predict':
        system_obj.prediction()
    else:
        system_obj.training()


if __name__ == '__main__':
    main()