ba87a16f3cb93a6f78ebb3b511f780068a043b0e
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import math
16 import numpy
17 from numpy import inf
18 import os.path
19 import pdb
20 import array
21 import unittest
22
23 from qpalma.qpalma_main import QPalma
24 from qpalma.utils import print_prediction
25
26 from qpalma.Run import Run
27
28 from qpalma.SettingsParser import parseSettings
29 from qpalma.OutputFormat import alignment_reconstruct
30 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
31
32 from qpalma.Lookup import LookupTable
33
34
35 jp = os.path.join
36
37
38 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
39 """
40 Given a genomic sequence this function calculates random scores for all
41 ocurring splice sites.
42 """
43
44 acc_pos = []
45 don_pos = []
46
47 total_size = len(full_chromo_seq)
48
49 # the first/last 205 pos do not have any predictions
50 for pos,elem in enumerate(full_chromo_seq[205:-205]):
51 if full_chromo_seq[pos-2:pos] == 'ag':
52 acc_pos.append(pos)
53 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
54 don_pos.append(pos)
55
56 # make pos 1-based
57 acc_pos = [e+1 for e in acc_pos]
58 don_pos = [e+1 for e in don_pos]
59
60 acc_scores = [0.0]*len(acc_pos)
61 don_scores = [0.0]*len(don_pos)
62
63 for idx in range(len(acc_pos)):
64 acc_scores[idx] = random.uniform(0.1,1.0)
65
66 for idx in range(len(don_pos)):
67 don_scores[idx] = random.uniform(0.1,1.0)
68
69 # recalculate indices and reverse them in order to have positions relative
70 # to positive strand
71 if reverse_strand:
72 acc_pos = [total_size-1-e for e in acc_pos]
73 acc_pos.reverse()
74 acc_scores.reverse()
75
76 don_pos = [total_size-1-e for e in don_pos]
77 don_pos.reverse()
78 don_scores.reverse()
79
80 acc_pos = array.array('I',acc_pos)
81 acc_scores = array.array('f',acc_scores)
82
83 don_pos = array.array('I',don_pos)
84 don_scores = array.array('f',don_scores)
85
86 return acc_pos,acc_scores,don_pos,don_scores
87
88
89 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
90 """
91 """
92
93 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
94 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
95 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
96 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
97
98 acc_scores.tofile(open(acc_score_fn, 'wb'))
99 acc_pos.tofile(open(acc_pos_fn, 'wb'))
100
101 don_scores.tofile(open(don_score_fn, 'wb'))
102 don_pos.tofile(open(don_pos_fn, 'wb'))
103
104
105 def create_mini_chromosome():
106
107 chromo_fn = 'test_data/chromo1.flat'
108
109 chromo_fh = open(chromo_fn)
110 full_chromo_seq = chromo_fh.read()
111 full_chromo_seq = full_chromo_seq.strip()
112
113 print full_chromo_seq[:200]
114
115 # create data for forward strand
116 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
117 print acc_pos[:5]
118 print don_pos[:5]
119 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
120
121 # create data for reverse strand
122 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
123
124 total_size = len(full_chromo_seq_rev)
125
126 print full_chromo_seq_rev[:200]
127 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
128 acc_pos = [total_size-1-e for e in acc_pos]
129 acc_pos.reverse()
130 print acc_pos[:5]
131 don_pos = [total_size-1-e for e in don_pos]
132 don_pos.reverse()
133 print don_pos[:5]
134
135 #
136 # Remember: The positions are always saved one-based.
137 #
138
139 #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
140
141 a = 103
142 b = 255
143 print 'pos: %s'%full_chromo_seq[a:b]
144 print 'neg: %s'%full_chromo_seq_rev[a:b]
145
146 total_size = len(full_chromo_seq)
147 ap = total_size-b
148 bp = total_size-a
149 print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
150
151
152 def test_rev_comp():
153 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
154
155 class TestQPalmaPrediction(unittest.TestCase):
156 """
157 This class...
158 """
159
160 def _setUp(self):
161 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
162
163 self.prediction_set = cPickle.load(open(data_fn))
164
165
166
167 def _setUp(self):
168 print
169 self.prediction_set = {}
170
171 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
172 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
173
174 read = 'tattttggaagttccaattcccttaatacatctcacag'
175 currentQualities = [[30]*len(read)]
176
177 id = 1
178 chromo = 3
179 strand = '-'
180
181 tsize = 23470805
182
183 genomicSeq_start = tsize - (2038+10)
184 genomicSeq_stop = tsize - 1900 + 10
185 print 'Position: ',
186 print genomicSeq_start,genomicSeq_stop
187
188 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
189
190 example = (currentSeqInfo,read,currentQualities)
191 self.prediction_set[id] = [example]
192
193 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
194 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
195
196 read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
197 currentQualities = [[40]*30+[22]*8]
198
199 id = 2
200 chromo = 3
201 strand = '+'
202
203 tsize = 23470805
204
205 genomicSeq_start = tsize - (2038+10)
206 genomicSeq_stop = tsize - 1900 + 10
207 print 'Position: ',
208 print genomicSeq_start,genomicSeq_stop
209
210 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
211
212 example = (currentSeqInfo,read,currentQualities)
213 self.prediction_set[id] = [example]
214
215
216 def setUp(self):
217 self.prediction_set = {}
218
219 # chr1 + 20-120
220 read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
221 currentQualities = [[40]*len(read)]
222
223 id = 3
224 chromo = 1
225 strand = '+'
226
227 genomicSeq_start = 3500
228 genomicSeq_stop = 6500
229 print 'Position: ',
230 print genomicSeq_start,genomicSeq_stop
231
232 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
233
234 example = (currentSeqInfo,read,currentQualities)
235 self.prediction_set[id] = [example]
236
237 # chr1 - 5000-5100
238 read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
239 currentQualities = [[40]*len(read)]
240
241 id = 4
242 chromo = 1
243 strand = '-'
244
245 total_size = 30432563
246
247 genomicSeq_start = total_size - 6500
248 genomicSeq_stop = total_size - 3500
249 print 'Position: ',
250 print genomicSeq_start,genomicSeq_stop
251
252 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
253
254 example = (currentSeqInfo,read,currentQualities)
255 self.prediction_set[id] = [example]
256
257
258 def testAlignments(self):
259
260 settings = parseSettings('testcase.conf')
261
262 print self.prediction_set
263 for example_key in self.prediction_set.keys():
264 print 'Current example %d' % example_key
265
266 for example in self.prediction_set[example_key]:
267 print example
268 print 'size'
269 print len(example)
270
271 accessWrapper = DataAccessWrapper(settings)
272 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
273
274 qp = QPalma(seqInfo,True)
275 allPredictions = qp.predict(self.prediction_set,settings)
276
277 for current_prediction in allPredictions:
278 align_str = print_prediction(current_prediction)
279 print align_str
280
281 id = current_prediction['id']
282 seq = current_prediction['read']
283 dna = current_prediction['dna']
284 chromo = current_prediction['chr']
285 strand = current_prediction['strand']
286 start_pos = current_prediction['start_pos']
287 predExons = current_prediction['predExons']
288
289 numExons = int(math.ceil(len(predExons) / 2))
290
291 print alignment_reconstruct(current_prediction,numExons)
292 print id,start_pos,predExons
293
294 print 'Problem counter is %d' % qp.problem_ctr
295
296
297 def _testAlignments(self):
298 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
299
300 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
301 run['name'] = 'test_run'
302 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
303
304 param_fn = jp(run_dir,'param_526.pickle')
305 param = cPickle.load(open(param_fn))
306
307 print self.prediction_set
308 for example_key in self.prediction_set.keys():
309 print 'Current example %d' % example_key
310
311 for example in self.prediction_set[example_key]:
312 print example
313 print 'size'
314 print len(example)
315
316 # fetch the data needed
317 settings = {}
318
319 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
320 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
321 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
322
323 settings['genome_file_fmt'] = 'chr%d.dna.flat'
324 settings['splice_score_file_fmt']= 'contig_%d%s'
325
326 allowed_fragments = [1]
327 settings['allowed_fragments'] = allowed_fragments
328
329 accessWrapper = DataAccessWrapper(settings)
330 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
331
332 qp = QPalma(run,seqInfo,True)
333 allPredictions = qp.predict(self.prediction_set,param)
334
335 for current_prediction in allPredictions:
336 align_str = print_prediction(current_prediction)
337 print align_str
338
339 id = current_prediction['id']
340 seq = current_prediction['read']
341 dna = current_prediction['dna']
342 chromo = current_prediction['chr']
343 strand = current_prediction['strand']
344 start_pos = current_prediction['start_pos']
345 predExons = current_prediction['predExons']
346
347 numExons = int(math.ceil(len(predExons) / 2))
348
349 print alignment_reconstruct(current_prediction,numExons)
350 print id,start_pos,predExons
351
352 print 'Problem counter is %d' % qp.problem_ctr
353
354 def check_reverse_strand_calculation(id,b,e,seqInfo):
355
356 seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
357
358 total_size = seqInfo.getFragmentSize(1)
359 bp = total_size - e
360 ep = total_size - b
361 seqp,acc,don = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
362 seqp = reverse_complement(seqp)
363
364 return (seq == seqp)
365
366
367 def simple_check(settings,seqInfo,lt1):
368
369 print 'Checking sequences for some intervals...'
370
371 intervals = [(0,10000),(545,874),(999,1234)]
372
373 for (b,e) in intervals:
374 print check_reverse_strand_calculation(1,b,e,seqInfo)
375
376 for (b,e) in [(206,874),(545,874),(999,1234)]:
377 lt1 = LookupTable(settings)
378 _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
379 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
380
381 print dna == _dna
382 print acc == _acc
383 print don == _don
384
385 #print [p for p,e in enumerate(acc) if e != -inf]
386 #print [p for p,e in enumerate(_acc) if e != -inf]
387
388
389 def checks():
390 settings = {}
391
392 settings['genome_dir'] = 'test_data/'
393 settings['acceptor_scores_loc'] = 'test_data/acc'
394 settings['donor_scores_loc'] = 'test_data/don'
395
396 settings['genome_file_fmt'] = 'chromo%d.flat'
397 settings['splice_score_file_fmt']= 'chromo_%d%s'
398
399 allowed_fragments = [1]
400 settings['allowed_fragments'] = allowed_fragments
401
402 accessWrapper = DataAccessWrapper(settings)
403 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
404 lt1 = LookupTable(settings)
405
406 print 'Checking with toy data...'
407 simple_check(settings,seqInfo,lt1)
408
409 settings = {}
410
411 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
412 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
413 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
414
415 settings['genome_file_fmt'] = 'chr%d.dna.flat'
416 settings['splice_score_file_fmt']= 'contig_%d%s'
417
418 allowed_fragments = [1]
419 settings['allowed_fragments'] = allowed_fragments
420
421 accessWrapper = DataAccessWrapper(settings)
422 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
423 lt1 = LookupTable(settings)
424
425 print 'Checking with real data...'
426 simple_check(settings,seqInfo,lt1)
427
428
429 def run():
430 print 'Creating some artifical data...'
431 create_mini_chromosome()
432 print 'Performing some checks...'
433 checks()
434
435 if __name__ == '__main__':
436 run()
437 #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
438 #unittest.TextTestRunner(verbosity=2).run(suite)