+ set proper logging in optimizer code
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import sys
16 import math
17 import numpy
18 from numpy import inf
19 import os.path
20 import shutil
21 import pdb
22 import array
23 import unittest
24
25 from qpalma.qpalma_main import QPalma,preprocessExample
26 from qpalma.utils import print_prediction
27
28 from qpalma.Run import Run
29
30 from qpalma.SettingsParser import parseSettings
31 from qpalma.DatasetUtils import processQuality
32 from qpalma.OutputFormat import alignment_reconstruct
33 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo
34
35 from qpalma.Lookup import LookupTable
36
37 jp = os.path.join
38
39
40 def createTrainingSet():
41
42 dataset = {}
43
44 id = 1111
45 chromo = 1
46 strand = '+'
47 up_cut = 117
48 down_cut = 383
49
50 currentSeqInfo = (id,chromo,strand,up_cut,down_cut)
51
52 # tcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcattt
53 # tcaatacaaatcctatttctt ctatggatggtttatcttcattt
54
55 originalRead = 'tcaatacaaatcctatttcttctatggatggtttatcttcattt'
56 raw_qualities = 'h'*len(originalRead)
57 prb_offset = 64
58 quality_interval = (-5,40)
59 perform_checks = True
60 quality = processQuality(raw_qualities,prb_offset,quality_interval,perform_checks)
61 currentQualities = [quality]
62
63 exons = numpy.mat([217,238,261,284]).reshape((2,2))
64
65 dataset[id] = (currentSeqInfo,originalRead,currentQualities,exons)
66
67 id = 2222
68 up_cut = 1560
69 down_cut = 1901
70
71 currentSeqInfo = (id,chromo,strand,up_cut,down_cut)
72
73 # tcctttaagattttgtttttataatgtgttcttccatccacatctatctccatatgatatggaccatatcatacatcatcatttgtccaaatgcatgaatgaatttggaaataggtacgagaatgccaacaatgacaagaa
74 # tcctttaagattttgtttttataatgt gtacgagaatgccaacaatgacaagaa
75
76 originalRead = 'tcctttaagattttgtttttataatgtgtacgagaatgccaacaatgacaagaa'
77 raw_qualities = 'h'*len(originalRead)
78 prb_offset = 64
79 quality_interval = (-5,40)
80 perform_checks = True
81 quality = processQuality(raw_qualities,prb_offset,quality_interval,perform_checks)
82 currentQualities = [quality]
83
84 exons = numpy.mat([1660,1687,1774,1801]).reshape((2,2))
85
86 #dataset[id] = (currentSeqInfo,originalRead,currentQualities,exons)
87
88 return dataset
89
90
91 def createPredictionSet():
92
93 dataset = {}
94
95 id = 1111
96 chromo = 1
97 strand = '+'
98 up_cut = 117
99 down_cut = 383
100
101 currentSeqInfo = (id,chromo,strand,up_cut,down_cut)
102
103 # tcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcattt
104 # tcaatacaaatcctatttctt ctatggatggtttatcttcattt
105
106 originalRead = 'tcaatacaaatcctatttcttctatggatggtttatcttcattt'
107 raw_qualities = 'h'*len(originalRead)
108 prb_offset = 64
109 quality_interval = (-5,40)
110 perform_checks = True
111 quality = processQuality(raw_qualities,prb_offset,quality_interval,perform_checks)
112 currentQualities = [quality]
113
114 exons = numpy.mat([217,238,261,284]).reshape((2,2))
115
116 dataset.setdefault(id, []).append((currentSeqInfo,originalRead,currentQualities))
117
118 id = 2222
119 up_cut = 1560
120 down_cut = 1901
121
122 currentSeqInfo = (id,chromo,strand,up_cut,down_cut)
123
124 # tcctttaagattttgtttttataatgtgttcttccatccacatctatctccatatgatatggaccatatcatacatcatcatttgtccaaatgcatgaatgaatttggaaataggtacgagaatgccaacaatgacaagaa
125 # tcctttaagattttgtttttataatgt gtacgagaatgccaacaatgacaagaa
126
127 originalRead = 'tcctttaagattttgtttttataatgtgtacgagaatgccaacaatgacaagaa'
128 raw_qualities = 'h'*len(originalRead)
129 prb_offset = 64
130 quality_interval = (-5,40)
131 perform_checks = True
132 quality = processQuality(raw_qualities,prb_offset,quality_interval,perform_checks)
133 currentQualities = [quality]
134
135 exons = numpy.mat([1660,1687,1774,1801]).reshape((2,2))
136
137 dataset.setdefault(id, []).append((currentSeqInfo,originalRead,currentQualities))
138
139
140 return dataset
141
142
143 class TestQPalmaTraining(unittest.TestCase):
144 """
145 """
146
147 def setUp(self):
148 dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
149 if os.path.exists(dir):
150 shutil.rmtree(dir)
151 os.mkdir(dir)
152
153 self.settings = parseSettings('train_testcase.conf')
154
155 accessWrapper = DataAccessWrapper(self.settings)
156 self.seqInfo = SeqSpliceInfo(accessWrapper,self.settings['allowed_fragments'])
157
158 self.training_set = createTrainingSet()
159
160
161 def testInitialization(self):
162 pass
163
164
165 def _test_preprocessExample(self):
166 """
167 """
168
169 id = 2222
170 (currentSeqInfo,originalRead,currentQualities,exons) = self.training_set[id]
171 print originalRead
172
173 try:
174 preprocessExample(self.training_set,id,self.seqInfo,self.settings)
175 except:
176 print sys.exc_info()
177
178
179 def _test_performAlignment(self):
180 """
181 """
182 #performAlignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode,settings)
183 pass
184
185
186 def test_train(self):
187 """
188 This function
189 """
190
191 set_name = 'test_train'
192
193 qp = QPalma(self.seqInfo,True)
194 qp.train(self.training_set,self.settings,set_name)
195
196
197
198 class TestQPalmaPrediction(unittest.TestCase):
199 """
200 This class...
201 """
202
203
204 def setUp(self):
205 """
206 """
207
208 dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
209 if os.path.exists(dir):
210 shutil.rmtree(dir)
211 os.mkdir(dir)
212
213 self.settings = parseSettings('train_testcase.conf')
214
215 accessWrapper = DataAccessWrapper(self.settings)
216 self.seqInfo = SeqSpliceInfo(accessWrapper,self.settings['allowed_fragments'])
217
218 self.prediction_set = createPredictionSet()
219
220
221 def test_predictionFromTrainedParameters(self):
222 self.prediction_set
223
224 qp = QPalma(self.seqInfo,True)
225
226 self.settings['prediction_param_fn'] = 'param_0.pickle'
227
228 allPredictions = qp.predict(self.prediction_set,self.settings)
229
230 for current_prediction in allPredictions:
231 align_str = print_prediction(current_prediction)
232 print align_str
233
234 id = current_prediction['id']
235 seq = current_prediction['read']
236 dna = current_prediction['dna']
237 chromo = current_prediction['chr']
238 strand = current_prediction['strand']
239 start_pos = current_prediction['start_pos']
240 predExons = current_prediction['predExons']
241
242 numExons = int(math.ceil(len(predExons) / 2))
243
244 print alignment_reconstruct(current_prediction,numExons)
245 print id,start_pos,predExons
246
247 ## chr1 + 20-120
248 #read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
249 #currentQualities = [[40]*len(read)]
250
251 #id = 3
252 #chromo = 1
253 #strand = '+'
254
255 #genomicSeq_start = 3500
256 #genomicSeq_stop = 6500
257 #print 'Position: ',
258 #print genomicSeq_start,genomicSeq_stop
259
260 #currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
261
262 #example = (currentSeqInfo,read,currentQualities)
263 #self.prediction_set[id] = [example]
264
265 ## chr1 - 5000-5100
266 #read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
267 #currentQualities = [[40]*len(read)]
268
269 #id = 4
270 #chromo = 1
271 #strand = '-'
272
273 #total_size = 30432563
274
275 #genomicSeq_start = total_size - 6500
276 #genomicSeq_stop = total_size - 3500
277 #print 'Position: ',
278 #print genomicSeq_start,genomicSeq_stop
279
280 #currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
281
282 #example = (currentSeqInfo,read,currentQualities)
283 #self.prediction_set[id] = [example]
284
285
286 def _testAlignments(self):
287
288 settings = parseSettings('testcase.conf')
289
290 print self.prediction_set
291 for example_key in self.prediction_set.keys():
292 print 'Current example %d' % example_key
293
294 for example in self.prediction_set[example_key]:
295 print example
296 print 'size'
297 print len(example)
298
299 accessWrapper = DataAccessWrapper(settings)
300 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
301
302 qp = QPalma(seqInfo,True)
303 allPredictions = qp.predict(self.prediction_set,settings)
304
305 for current_prediction in allPredictions:
306 align_str = print_prediction(current_prediction)
307 print align_str
308
309 id = current_prediction['id']
310 seq = current_prediction['read']
311 dna = current_prediction['dna']
312 chromo = current_prediction['chr']
313 strand = current_prediction['strand']
314 start_pos = current_prediction['start_pos']
315 predExons = current_prediction['predExons']
316
317 numExons = int(math.ceil(len(predExons) / 2))
318
319 print alignment_reconstruct(current_prediction,numExons)
320 print id,start_pos,predExons
321
322 print 'Problem counter is %d' % qp.problem_ctr
323
324
325
326
327 def _testAlignments(self):
328 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
329
330 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
331 run['name'] = 'test_run'
332 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
333
334 param_fn = jp(run_dir,'param_526.pickle')
335 param = cPickle.load(open(param_fn))
336
337 print self.prediction_set
338 for example_key in self.prediction_set.keys():
339 print 'Current example %d' % example_key
340
341 for example in self.prediction_set[example_key]:
342 print example
343 print 'size'
344 print len(example)
345
346 # fetch the data needed
347 settings = {}
348
349 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
350 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
351 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
352
353 settings['genome_file_fmt'] = 'chr%d.dna.flat'
354 settings['splice_score_file_fmt']= 'contig_%d%s'
355
356 allowed_fragments = [1]
357 settings['allowed_fragments'] = allowed_fragments
358
359 accessWrapper = DataAccessWrapper(settings)
360 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
361
362 qp = QPalma(run,seqInfo,True)
363 allPredictions = qp.predict(self.prediction_set,param)
364
365 for current_prediction in allPredictions:
366 align_str = print_prediction(current_prediction)
367 print align_str
368
369 id = current_prediction['id']
370 seq = current_prediction['read']
371 dna = current_prediction['dna']
372 chromo = current_prediction['chr']
373 strand = current_prediction['strand']
374 start_pos = current_prediction['start_pos']
375 predExons = current_prediction['predExons']
376
377 numExons = int(math.ceil(len(predExons) / 2))
378
379 print alignment_reconstruct(current_prediction,numExons)
380 print id,start_pos,predExons
381
382 print 'Problem counter is %d' % qp.problem_ctr
383
384
385 if __name__ == '__main__':
386 train_suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaTraining)
387 predict_suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
388 all_suites = unittest.TestSuite([train_suite, predict_suite])
389 unittest.TextTestRunner(verbosity=2).run(all_suites)