e585a750077a1251de07ac5eee9f2e3352380c83
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import math
16 import numpy
17 from numpy import inf
18 import os.path
19 import pdb
20 import array
21 import unittest
22
23 from qpalma.qpalma_main import QPalma
24 from qpalma.utils import print_prediction
25
26 from qpalma.Run import Run
27
28 from qpalma.SettingsParser import parseSettings
29 from qpalma.OutputFormat import alignment_reconstruct
30 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
31
32 from qpalma.Lookup import LookupTable
33
34 jp = os.path.join
35
36 pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
37
38 neg_chr1 = ''
39
40 acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
41 don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
42
43 acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
44 don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
45
46
47 def check_createScoresForSequence():
48 print 'Positive strand:'
49 createScoresForSequence(pos_chr1,reverse_strand=False)
50 print acc_p
51 print don_p
52
53 print 'Negative strand:'
54 filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
55 neg_chr1 = open(filename).read().strip()
56 print len(neg_chr1)
57 createScoresForSequence(neg_chr1,reverse_strand=True)
58 print acc_n
59 print don_n
60
61
62 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
63 """
64 Given a genomic sequence this function calculates random scores for all
65 ocurring splice sites.
66 """
67
68 acc_pos = []
69 don_pos = []
70
71 total_size = len(full_chromo_seq)
72
73 # the first/last 205 pos do not have any predictions
74 for pos,elem in enumerate(full_chromo_seq):
75 if pos < 205 or pos > total_size-205:
76 continue
77
78 if full_chromo_seq[pos-2:pos] == 'ag':
79 acc_pos.append(pos)
80 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
81 don_pos.append(pos)
82
83 acc_scores = [0.0]*len(acc_pos)
84 don_scores = [0.0]*len(don_pos)
85
86 for idx in range(len(acc_pos)):
87 acc_scores[idx] = random.uniform(0.1,1.0)
88
89 for idx in range(len(don_pos)):
90 don_scores[idx] = random.uniform(0.1,1.0)
91
92 # recalculate indices and reverse them in order to have positions relative
93 # to positive strand
94 if reverse_strand:
95 acc_pos = [total_size-1-e for e in acc_pos]
96 acc_pos.reverse()
97 acc_scores.reverse()
98
99 don_pos = [total_size-1-e for e in don_pos]
100 don_pos.reverse()
101 don_scores.reverse()
102
103 # make pos 1-based
104 acc_pos = [e+1 for e in acc_pos]
105 don_pos = [e+1 for e in don_pos]
106
107 #print acc_pos[:10]
108 #print don_pos[:10]
109
110 acc_pos = array.array('I',acc_pos)
111 acc_scores = array.array('f',acc_scores)
112
113 don_pos = array.array('I',don_pos)
114 don_scores = array.array('f',don_scores)
115
116 return acc_pos,acc_scores,don_pos,don_scores
117
118
119 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
120 """
121 """
122
123 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
124 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
125 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
126 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
127
128 acc_scores.tofile(open(acc_score_fn, 'wb'))
129 acc_pos.tofile(open(acc_pos_fn, 'wb'))
130
131 don_scores.tofile(open(don_score_fn, 'wb'))
132 don_pos.tofile(open(don_pos_fn, 'wb'))
133
134
135 def create_mini_chromosome():
136
137 chromo_fn = 'test_data/chromo1.flat'
138
139 chromo_fh = open(chromo_fn)
140 full_chromo_seq = chromo_fh.read()
141 full_chromo_seq = full_chromo_seq.strip()
142
143 print full_chromo_seq[:200]
144
145 # create data for forward strand
146 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
147 print acc_pos[:5]
148 print don_pos[:5]
149 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
150
151 # create data for reverse strand
152 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
153
154 total_size = len(full_chromo_seq_rev)
155
156 print full_chromo_seq_rev[:200]
157 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
158 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
159
160
161 def test_rev_comp():
162 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
163
164 class TestQPalmaPrediction(unittest.TestCase):
165 """
166 This class...
167 """
168
169
170 def _setUp(self):
171 self.prediction_set = {}
172
173 # chr1 + 20-120
174 read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
175 currentQualities = [[40]*len(read)]
176
177 id = 3
178 chromo = 1
179 strand = '+'
180
181 genomicSeq_start = 3500
182 genomicSeq_stop = 6500
183 print 'Position: ',
184 print genomicSeq_start,genomicSeq_stop
185
186 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
187
188 example = (currentSeqInfo,read,currentQualities)
189 self.prediction_set[id] = [example]
190
191 # chr1 - 5000-5100
192 read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
193 currentQualities = [[40]*len(read)]
194
195 id = 4
196 chromo = 1
197 strand = '-'
198
199 total_size = 30432563
200
201 genomicSeq_start = total_size - 6500
202 genomicSeq_stop = total_size - 3500
203 print 'Position: ',
204 print genomicSeq_start,genomicSeq_stop
205
206 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
207
208 example = (currentSeqInfo,read,currentQualities)
209 self.prediction_set[id] = [example]
210
211
212 def testAlignments(self):
213
214 settings = parseSettings('testcase.conf')
215
216 print self.prediction_set
217 for example_key in self.prediction_set.keys():
218 print 'Current example %d' % example_key
219
220 for example in self.prediction_set[example_key]:
221 print example
222 print 'size'
223 print len(example)
224
225 accessWrapper = DataAccessWrapper(settings)
226 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
227
228 qp = QPalma(seqInfo,True)
229 allPredictions = qp.predict(self.prediction_set,settings)
230
231 for current_prediction in allPredictions:
232 align_str = print_prediction(current_prediction)
233 print align_str
234
235 id = current_prediction['id']
236 seq = current_prediction['read']
237 dna = current_prediction['dna']
238 chromo = current_prediction['chr']
239 strand = current_prediction['strand']
240 start_pos = current_prediction['start_pos']
241 predExons = current_prediction['predExons']
242
243 numExons = int(math.ceil(len(predExons) / 2))
244
245 print alignment_reconstruct(current_prediction,numExons)
246 print id,start_pos,predExons
247
248 print 'Problem counter is %d' % qp.problem_ctr
249
250
251 def _testAlignments(self):
252 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
253
254 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
255 run['name'] = 'test_run'
256 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
257
258 param_fn = jp(run_dir,'param_526.pickle')
259 param = cPickle.load(open(param_fn))
260
261 print self.prediction_set
262 for example_key in self.prediction_set.keys():
263 print 'Current example %d' % example_key
264
265 for example in self.prediction_set[example_key]:
266 print example
267 print 'size'
268 print len(example)
269
270 # fetch the data needed
271 settings = {}
272
273 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
274 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
275 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
276
277 settings['genome_file_fmt'] = 'chr%d.dna.flat'
278 settings['splice_score_file_fmt']= 'contig_%d%s'
279
280 allowed_fragments = [1]
281 settings['allowed_fragments'] = allowed_fragments
282
283 accessWrapper = DataAccessWrapper(settings)
284 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
285
286 qp = QPalma(run,seqInfo,True)
287 allPredictions = qp.predict(self.prediction_set,param)
288
289 for current_prediction in allPredictions:
290 align_str = print_prediction(current_prediction)
291 print align_str
292
293 id = current_prediction['id']
294 seq = current_prediction['read']
295 dna = current_prediction['dna']
296 chromo = current_prediction['chr']
297 strand = current_prediction['strand']
298 start_pos = current_prediction['start_pos']
299 predExons = current_prediction['predExons']
300
301 numExons = int(math.ceil(len(predExons) / 2))
302
303 print alignment_reconstruct(current_prediction,numExons)
304 print id,start_pos,predExons
305
306 print 'Problem counter is %d' % qp.problem_ctr
307
308 def check_reverse_strand_calculation(id,b,e,seqInfo):
309 seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
310
311 total_size = seqInfo.getFragmentSize(1)
312 bp = total_size - e
313 ep = total_size - b
314 seqp,acc,don = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
315 seqp = reverse_complement(seqp)
316
317 return (seq == seqp)
318
319 def check_example(chromo,strand,b,e,seqInfo,lt1):
320 dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
321 #_dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
322 _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
323
324 print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
325 print 'Results for dna,acc,don:'
326 print '%s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
327 print [p for p,e in enumerate(acc) if e != -inf][:10]
328 print [p for p,e in enumerate(_acc) if e != -inf][:10]
329 print [p for p,e in enumerate(don) if e != -inf][:10]
330 print [p for p,e in enumerate(_don) if e != -inf][:10]
331
332
333 def simple_check(settings,seqInfo,lt1):
334
335 print 'Checking sequences for some intervals...'
336
337 intervals = [(0,10000),(545,874),(999,1234)]
338
339 chromo = 1
340 total_size = seqInfo.getFragmentSize(chromo)
341
342 for strand in ['+','-']:
343 for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3)]:
344 check_example(chromo,strand,b,e,seqInfo,lt1)
345
346 for (b,e) in intervals:
347 print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
348
349
350 def checks():
351 settings = {}
352
353 settings['genome_dir'] = 'test_data/'
354 settings['acceptor_scores_loc'] = 'test_data/acc'
355 settings['donor_scores_loc'] = 'test_data/don'
356
357 settings['genome_file_fmt'] = 'chromo%d.flat'
358 settings['splice_score_file_fmt']= 'chromo_%d%s'
359
360 allowed_fragments = [1]
361 settings['allowed_fragments'] = allowed_fragments
362
363 accessWrapper = DataAccessWrapper(settings)
364 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
365
366 lt1 = None
367 lt1 = LookupTable(settings)
368
369 print 'Checking with toy data...'
370 simple_check(settings,seqInfo,lt1)
371
372 settings = {}
373
374 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
375 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
376 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
377
378 settings['genome_file_fmt'] = 'chr%d.dna.flat'
379 settings['splice_score_file_fmt']= 'contig_%d%s'
380
381 allowed_fragments = [1]
382 settings['allowed_fragments'] = allowed_fragments
383
384 accessWrapper = DataAccessWrapper(settings)
385 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
386
387 lt1 = None
388 lt1 = LookupTable(settings)
389
390 print 'Checking with real data...'
391 simple_check(settings,seqInfo,lt1)
392
393
394 def run():
395 print 'Creating some artifical data...'
396 create_mini_chromosome()
397 print 'Performing some checks...'
398 checks()
399
400 if __name__ == '__main__':
401 run()
402 #check_createScoresForSequence()
403 #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
404 #unittest.TextTestRunner(verbosity=2).run(suite)