+ extended manual to describe output file format
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import sys
16 import math
17 import numpy
18 from numpy import inf
19 import os.path
20 import pdb
21 import array
22 import unittest
23
24 from qpalma.qpalma_main import QPalma
25 from qpalma.utils import print_prediction
26
27 from qpalma.Run import Run
28
29 from qpalma.SettingsParser import parseSettings
30 from qpalma.OutputFormat import alignment_reconstruct
31 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
32
33 from qpalma.Lookup import LookupTable
34
35 jp = os.path.join
36
37 pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
38
39 neg_chr1 = ''
40
41 acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
42 don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
43
44 acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
45 don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
46
47
48 def check_createScoresForSequence():
49 print 'Positive strand:'
50 createScoresForSequence(pos_chr1,reverse_strand=False)
51 print acc_p
52 print don_p
53
54 print 'Negative strand:'
55 filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
56 neg_chr1 = open(filename).read().strip()
57 print len(neg_chr1)
58 createScoresForSequence(neg_chr1,reverse_strand=True)
59 print acc_n
60 print don_n
61
62
63 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
64 """
65 Given a genomic sequence this function calculates random scores for all
66 ocurring splice sites.
67 """
68
69 acc_pos = []
70 don_pos = []
71
72 total_size = len(full_chromo_seq)
73
74 # the first/last 205 pos do not have any predictions
75 for pos,elem in enumerate(full_chromo_seq):
76 if pos < 205 or pos > total_size-205:
77 continue
78
79 if full_chromo_seq[pos-2:pos] == 'ag':
80 acc_pos.append(pos)
81 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
82 don_pos.append(pos)
83
84 acc_scores = [0.0]*len(acc_pos)
85 don_scores = [0.0]*len(don_pos)
86
87 for idx in range(len(acc_pos)):
88 acc_scores[idx] = random.uniform(0.1,1.0)
89
90 for idx in range(len(don_pos)):
91 don_scores[idx] = random.uniform(0.1,1.0)
92
93 # recalculate indices and reverse them in order to have positions relative
94 # to positive strand
95 if reverse_strand:
96 acc_pos = [total_size-1-e for e in acc_pos]
97 acc_pos.reverse()
98 acc_scores.reverse()
99
100 don_pos = [total_size-1-e for e in don_pos]
101 don_pos.reverse()
102 don_scores.reverse()
103
104 # make pos 1-based
105 acc_pos = [e+1 for e in acc_pos]
106 don_pos = [e+1 for e in don_pos]
107
108 #print acc_pos[:10]
109 #print don_pos[:10]
110
111 acc_pos = array.array('I',acc_pos)
112 acc_scores = array.array('f',acc_scores)
113
114 don_pos = array.array('I',don_pos)
115 don_scores = array.array('f',don_scores)
116
117 return acc_pos,acc_scores,don_pos,don_scores
118
119
120 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
121 """
122 """
123
124 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
125 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
126 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
127 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
128
129 acc_scores.tofile(open(acc_score_fn, 'wb'))
130 acc_pos.tofile(open(acc_pos_fn, 'wb'))
131
132 don_scores.tofile(open(don_score_fn, 'wb'))
133 don_pos.tofile(open(don_pos_fn, 'wb'))
134
135
136 def create_mini_chromosome():
137
138 chromo_fn = 'test_data/chromo1.flat'
139
140 chromo_fh = open(chromo_fn)
141 full_chromo_seq = chromo_fh.read()
142 full_chromo_seq = full_chromo_seq.strip()
143
144 print full_chromo_seq[:200]
145
146 # create data for forward strand
147 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
148 print acc_pos[:5]
149 print don_pos[:5]
150 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
151
152 # create data for reverse strand
153 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
154
155 total_size = len(full_chromo_seq_rev)
156
157 print full_chromo_seq_rev[:200]
158 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
159 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
160
161
162 def test_rev_comp():
163 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
164
165 class TestQPalmaPrediction(unittest.TestCase):
166 """
167 This class...
168 """
169
170
171 def _setUp(self):
172 self.prediction_set = {}
173
174 # chr1 + 20-120
175 read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
176 currentQualities = [[40]*len(read)]
177
178 id = 3
179 chromo = 1
180 strand = '+'
181
182 genomicSeq_start = 3500
183 genomicSeq_stop = 6500
184 print 'Position: ',
185 print genomicSeq_start,genomicSeq_stop
186
187 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
188
189 example = (currentSeqInfo,read,currentQualities)
190 self.prediction_set[id] = [example]
191
192 # chr1 - 5000-5100
193 read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
194 currentQualities = [[40]*len(read)]
195
196 id = 4
197 chromo = 1
198 strand = '-'
199
200 total_size = 30432563
201
202 genomicSeq_start = total_size - 6500
203 genomicSeq_stop = total_size - 3500
204 print 'Position: ',
205 print genomicSeq_start,genomicSeq_stop
206
207 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
208
209 example = (currentSeqInfo,read,currentQualities)
210 self.prediction_set[id] = [example]
211
212
213 def testAlignments(self):
214
215 settings = parseSettings('testcase.conf')
216
217 print self.prediction_set
218 for example_key in self.prediction_set.keys():
219 print 'Current example %d' % example_key
220
221 for example in self.prediction_set[example_key]:
222 print example
223 print 'size'
224 print len(example)
225
226 accessWrapper = DataAccessWrapper(settings)
227 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
228
229 qp = QPalma(seqInfo,True)
230 allPredictions = qp.predict(self.prediction_set,settings)
231
232 for current_prediction in allPredictions:
233 align_str = print_prediction(current_prediction)
234 print align_str
235
236 id = current_prediction['id']
237 seq = current_prediction['read']
238 dna = current_prediction['dna']
239 chromo = current_prediction['chr']
240 strand = current_prediction['strand']
241 start_pos = current_prediction['start_pos']
242 predExons = current_prediction['predExons']
243
244 numExons = int(math.ceil(len(predExons) / 2))
245
246 print alignment_reconstruct(current_prediction,numExons)
247 print id,start_pos,predExons
248
249 print 'Problem counter is %d' % qp.problem_ctr
250
251
252 def _testAlignments(self):
253 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
254
255 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
256 run['name'] = 'test_run'
257 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
258
259 param_fn = jp(run_dir,'param_526.pickle')
260 param = cPickle.load(open(param_fn))
261
262 print self.prediction_set
263 for example_key in self.prediction_set.keys():
264 print 'Current example %d' % example_key
265
266 for example in self.prediction_set[example_key]:
267 print example
268 print 'size'
269 print len(example)
270
271 # fetch the data needed
272 settings = {}
273
274 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
275 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
276 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
277
278 settings['genome_file_fmt'] = 'chr%d.dna.flat'
279 settings['splice_score_file_fmt']= 'contig_%d%s'
280
281 allowed_fragments = [1]
282 settings['allowed_fragments'] = allowed_fragments
283
284 accessWrapper = DataAccessWrapper(settings)
285 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
286
287 qp = QPalma(run,seqInfo,True)
288 allPredictions = qp.predict(self.prediction_set,param)
289
290 for current_prediction in allPredictions:
291 align_str = print_prediction(current_prediction)
292 print align_str
293
294 id = current_prediction['id']
295 seq = current_prediction['read']
296 dna = current_prediction['dna']
297 chromo = current_prediction['chr']
298 strand = current_prediction['strand']
299 start_pos = current_prediction['start_pos']
300 predExons = current_prediction['predExons']
301
302 numExons = int(math.ceil(len(predExons) / 2))
303
304 print alignment_reconstruct(current_prediction,numExons)
305 print id,start_pos,predExons
306
307 print 'Problem counter is %d' % qp.problem_ctr
308
309
310 def check_reverse_strand_calculation(id,b,e,seqInfo):
311 total_size = seqInfo.getFragmentSize(1)
312 bp = total_size - e
313 ep = total_size - b
314
315 seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
316 seqp,acc,don = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
317 seqp = reverse_complement(seqp)
318
319 res1 = (seq == seqp)
320 #print seq
321 #print seqp
322
323 seq,acc,don = seqInfo.get_seq_and_scores(id,'+',b,e,only_seq=False,perform_checks=False)
324 seq,acc,don = seqInfo.get_seq_and_scores(id,'-',bp,ep,only_seq=False,perform_checks=False)
325 #seqp = reverse_complement(seq)
326
327 res2 = (seq == seqp)
328 print seq
329 print seqp
330
331 return res1,res2
332
333
334 def check_example(chromo,strand,b,e,seqInfo,lt1):
335 dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
336
337 if lt1 != None:
338 _dna,_acc,_don= lt1.get_seq_and_scores(1,strand,b,e)
339 else:
340 _dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
341
342 print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
343 print 'Results for dna,acc,don: %s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
344
345 if dna != _dna:
346 print dna[:20]
347 print _dna[:20]
348
349 if acc != _acc or don != _don:
350 print [p for p,e in enumerate(acc) if e != -inf][:10]
351 print [p for p,e in enumerate(_acc) if e != -inf][:10]
352 print [p for p,e in enumerate(don) if e != -inf][:10]
353 print [p for p,e in enumerate(_don) if e != -inf][:10]
354
355
356 def simple_check(settings,seqInfo,lt1):
357
358 print 'Checking sequences for some intervals...'
359
360 intervals = [(0,10000),(545,874),(999,1234)]
361
362 chromo = 1
363 total_size = seqInfo.getFragmentSize(chromo)
364
365 for strand in ['+','-']:
366 for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3),(0,total_size)]:
367 check_example(chromo,strand,b,e,seqInfo,lt1)
368
369 #for (b,e) in intervals:
370 # r1,r2 = check_reverse_strand_calculation(1,b,e,seqInfo)
371 # print b,e
372 # print 'Rev strand calculation: %s %s'%(str(r1),str(r2))
373
374
375 def checks():
376 settings = {}
377
378 settings['genome_dir'] = 'test_data/'
379 settings['acceptor_scores_loc'] = 'test_data/acc'
380 settings['donor_scores_loc'] = 'test_data/don'
381
382 settings['genome_file_fmt'] = 'chromo%d.flat'
383 settings['splice_score_file_fmt']= 'chromo_%d%s'
384
385 allowed_fragments = [1]
386 settings['allowed_fragments'] = allowed_fragments
387
388 accessWrapper = DataAccessWrapper(settings)
389 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
390
391 lt1 = None
392 lt1 = LookupTable(settings)
393
394 print 'Checking with toy data...'
395 #simple_check(settings,seqInfo,lt1)
396
397 settings = {}
398
399 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
400 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
401 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
402
403 settings['genome_file_fmt'] = 'chr%d.dna.flat'
404 settings['splice_score_file_fmt']= 'contig_%d%s'
405
406 allowed_fragments = [1]
407 settings['allowed_fragments'] = allowed_fragments
408
409 accessWrapper = DataAccessWrapper(settings)
410 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
411
412 lt1 = None
413 #lt1 = LookupTable(settings)
414
415 #print 'Checking with real data...'
416 simple_check(settings,seqInfo,lt1)
417
418
419 def run():
420 print 'Creating some artifical data...'
421 create_mini_chromosome()
422 print 'Performing some checks...'
423 checks()
424
425 if __name__ == '__main__':
426 run()
427 #check_createScoresForSequence()
428 #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
429 #unittest.TextTestRunner(verbosity=2).run(suite)