+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import math
16 import numpy
17 import os.path
18 import pdb
19 import array
20 import unittest
21
22 from qpalma.qpalma_main import QPalma
23 from qpalma.utils import print_prediction
24
25 from qpalma.Run import Run
26
27 from qpalma.SettingsParser import parseSettings
28 from qpalma.OutputFormat import alignment_reconstruct
29 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
30
31 from qpalma.Lookup import LookupTable
32
33
34 jp = os.path.join
35
36
37 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
38 """
39 Given a genomic sequence this function calculates random scores for all
40 ocurring splice sites.
41 """
42
43 acc_pos = []
44 don_pos = []
45
46 total_size = len(full_chromo_seq)
47
48 # the first/last 205 pos do not have any predictions
49 for pos,elem in enumerate(full_chromo_seq[205:-205]):
50 if full_chromo_seq[pos-2:pos] == 'ag':
51 acc_pos.append(pos)
52 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
53 don_pos.append(pos)
54
55 # make pos 1-based
56 acc_pos = [e+1 for e in acc_pos]
57 don_pos = [e+1 for e in don_pos]
58
59 acc_scores = [0.0]*len(acc_pos)
60 don_scores = [0.0]*len(don_pos)
61
62 for idx in range(len(acc_pos)):
63 acc_scores[idx] = random.uniform(0.1,1.0)
64
65 for idx in range(len(don_pos)):
66 don_scores[idx] = random.uniform(0.1,1.0)
67
68 # recalculate indices and reverse them in order to have positions relative
69 # to positive strand
70 if reverse_strand:
71 acc_pos = [total_size-1-e for e in acc_pos]
72 acc_pos.reverse()
73 acc_scores.reverse()
74
75 don_pos = [total_size-1-e for e in don_pos]
76 don_pos.reverse()
77 don_scores.reverse()
78
79 acc_pos = array.array('I',acc_pos)
80 acc_scores = array.array('f',acc_scores)
81
82 don_pos = array.array('I',don_pos)
83 don_scores = array.array('f',don_scores)
84
85 return acc_pos,acc_scores,don_pos,don_scores
86
87
88 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
89 """
90 """
91
92 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
93 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
94 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
95 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
96
97 acc_scores.tofile(open(acc_score_fn, 'wb'))
98 acc_pos.tofile(open(acc_pos_fn, 'wb'))
99
100 don_scores.tofile(open(don_score_fn, 'wb'))
101 don_pos.tofile(open(don_pos_fn, 'wb'))
102
103
104 def create_mini_chromosome():
105
106 chromo_fn = 'test_data/chromo1.flat'
107
108 chromo_fh = open(chromo_fn)
109 full_chromo_seq = chromo_fh.read()
110 full_chromo_seq = full_chromo_seq.strip()
111
112 print full_chromo_seq[:200]
113
114 # create data for forward strand
115 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
116 print acc_pos[:5]
117 print don_pos[:5]
118 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
119
120 # create data for reverse strand
121 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
122
123 total_size = len(full_chromo_seq_rev)
124
125 print full_chromo_seq_rev[:200]
126 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
127 acc_pos = [total_size-1-e for e in acc_pos]
128 acc_pos.reverse()
129 print acc_pos[:5]
130 don_pos = [total_size-1-e for e in don_pos]
131 don_pos.reverse()
132 print don_pos[:5]
133
134 #
135 # Remember: The positions are always saved one-based.
136 #
137
138 #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
139
140 a = 103
141 b = 255
142 print 'pos: %s'%full_chromo_seq[a:b]
143 print 'neg: %s'%full_chromo_seq_rev[a:b]
144
145 total_size = len(full_chromo_seq)
146 ap = total_size-b
147 bp = total_size-a
148 print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
149
150
151 def test_rev_comp():
152 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
153
154 class TestQPalmaPrediction(unittest.TestCase):
155 """
156 This class...
157 """
158
159 def _setUp(self):
160 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
161
162 self.prediction_set = cPickle.load(open(data_fn))
163
164
165
166 def _setUp(self):
167 print
168 self.prediction_set = {}
169
170 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
171 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
172
173 read = 'tattttggaagttccaattcccttaatacatctcacag'
174 currentQualities = [[30]*len(read)]
175
176 id = 1
177 chromo = 3
178 strand = '-'
179
180 tsize = 23470805
181
182 genomicSeq_start = tsize - (2038+10)
183 genomicSeq_stop = tsize - 1900 + 10
184 print 'Position: ',
185 print genomicSeq_start,genomicSeq_stop
186
187 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
188
189 example = (currentSeqInfo,read,currentQualities)
190 self.prediction_set[id] = [example]
191
192 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
193 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
194
195 read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
196 currentQualities = [[40]*30+[22]*8]
197
198 id = 2
199 chromo = 3
200 strand = '+'
201
202 tsize = 23470805
203
204 genomicSeq_start = tsize - (2038+10)
205 genomicSeq_stop = tsize - 1900 + 10
206 print 'Position: ',
207 print genomicSeq_start,genomicSeq_stop
208
209 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
210
211 example = (currentSeqInfo,read,currentQualities)
212 self.prediction_set[id] = [example]
213
214
215 def setUp(self):
216 self.prediction_set = {}
217
218 # chr1 + 20-120
219 read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
220 currentQualities = [[40]*len(read)]
221
222 id = 3
223 chromo = 1
224 strand = '+'
225
226 genomicSeq_start = 3500
227 genomicSeq_stop = 6500
228 print 'Position: ',
229 print genomicSeq_start,genomicSeq_stop
230
231 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
232
233 example = (currentSeqInfo,read,currentQualities)
234 self.prediction_set[id] = [example]
235
236 # chr1 - 5000-5100
237 read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
238 currentQualities = [[40]*len(read)]
239
240 id = 4
241 chromo = 1
242 strand = '-'
243
244 total_size = 30432563
245
246 genomicSeq_start = total_size - 6500
247 genomicSeq_stop = total_size - 3500
248 print 'Position: ',
249 print genomicSeq_start,genomicSeq_stop
250
251 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
252
253 example = (currentSeqInfo,read,currentQualities)
254 self.prediction_set[id] = [example]
255
256
257 def testAlignments(self):
258
259 settings = parseSettings('testcase.conf')
260
261 print self.prediction_set
262 for example_key in self.prediction_set.keys():
263 print 'Current example %d' % example_key
264
265 for example in self.prediction_set[example_key]:
266 print example
267 print 'size'
268 print len(example)
269
270 accessWrapper = DataAccessWrapper(settings)
271 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
272
273 qp = QPalma(seqInfo,True)
274 allPredictions = qp.predict(self.prediction_set,settings)
275
276 for current_prediction in allPredictions:
277 align_str = print_prediction(current_prediction)
278 print align_str
279
280 id = current_prediction['id']
281 seq = current_prediction['read']
282 dna = current_prediction['dna']
283 chromo = current_prediction['chr']
284 strand = current_prediction['strand']
285 start_pos = current_prediction['start_pos']
286 predExons = current_prediction['predExons']
287
288 numExons = int(math.ceil(len(predExons) / 2))
289
290 print alignment_reconstruct(current_prediction,numExons)
291 print id,start_pos,predExons
292
293 print 'Problem counter is %d' % qp.problem_ctr
294
295
296 def _testAlignments(self):
297 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
298
299 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
300 run['name'] = 'test_run'
301 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
302
303 param_fn = jp(run_dir,'param_526.pickle')
304 param = cPickle.load(open(param_fn))
305
306 print self.prediction_set
307 for example_key in self.prediction_set.keys():
308 print 'Current example %d' % example_key
309
310 for example in self.prediction_set[example_key]:
311 print example
312 print 'size'
313 print len(example)
314
315 # fetch the data needed
316 settings = {}
317
318 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
319 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
320 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
321
322 settings['genome_file_fmt'] = 'chr%d.dna.flat'
323 settings['splice_score_file_fmt']= 'contig_%d%s'
324
325 allowed_fragments = [1]
326 settings['allowed_fragments'] = allowed_fragments
327
328 accessWrapper = DataAccessWrapper(settings)
329 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
330
331 qp = QPalma(run,seqInfo,True)
332 allPredictions = qp.predict(self.prediction_set,param)
333
334 for current_prediction in allPredictions:
335 align_str = print_prediction(current_prediction)
336 print align_str
337
338 id = current_prediction['id']
339 seq = current_prediction['read']
340 dna = current_prediction['dna']
341 chromo = current_prediction['chr']
342 strand = current_prediction['strand']
343 start_pos = current_prediction['start_pos']
344 predExons = current_prediction['predExons']
345
346 numExons = int(math.ceil(len(predExons) / 2))
347
348 print alignment_reconstruct(current_prediction,numExons)
349 print id,start_pos,predExons
350
351 print 'Problem counter is %d' % qp.problem_ctr
352
353
354 def simple_check():
355 # fetch the data needed
356 settings = {}
357
358 #settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
359 #settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
360 #settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
361
362 #settings['genome_file_fmt'] = 'chr%d.dna.flat'
363 #settings['splice_score_file_fmt']= 'contig_%d%s'
364
365 #allowed_fragments = [1]
366 #settings['allowed_fragments'] = allowed_fragments
367
368 settings['genome_dir'] = 'test_data/'
369 settings['acceptor_scores_loc'] = 'test_data/acc'
370 settings['donor_scores_loc'] = 'test_data/don'
371
372 settings['genome_file_fmt'] = 'chromo%d.flat'
373 settings['splice_score_file_fmt']= 'chromo_%d%s'
374
375 allowed_fragments = [1]
376 settings['allowed_fragments'] = allowed_fragments
377
378 accessWrapper = DataAccessWrapper(settings)
379 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
380
381 b = 0
382 e = 10000
383 seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
384
385 total_size = seqInfo.getFragmentSize(1)
386 bp = total_size - e
387 ep = total_size - b
388 seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
389 seqp = reverse_complement(seqp)
390
391 print seq[0:100] == seqp[0:100]
392 print seq[534:1652] == seqp[534:1652]
393 print seq[-100:] == seqp[-100:]
394
395 lt1 = LookupTable(settings)
396 dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
397
398 print seq[:100] == seqp[:100] == dna[:100]
399 print seq[-100:] == seqp[-100:] == dna[-100:]
400
401 b = 206
402 e = 300
403 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
404
405 lt1 = LookupTable(settings)
406 _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
407
408
409 print dna == _dna
410 print acc == _acc
411 print don == _don
412
413 from numpy import inf
414 print [p for p,e in enumerate(acc) if e != -inf]
415 print [p for p,e in enumerate(_acc) if e != -inf]
416
417
418
419 if __name__ == '__main__':
420 #create_mini_chromosome()
421 simple_check()
422 #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
423 #unittest.TextTestRunner(verbosity=2).run(suite)