1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This program is free software; you can redistribute it and/or modify
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
13 import cPickle
14 import random
15 import math
16 import numpy
17 from numpy import inf
18 import os.path
19 import pdb
20 import array
21 import unittest
23 from qpalma.qpalma_main import QPalma
24 from qpalma.utils import print_prediction
26 from qpalma.Run import Run
28 from qpalma.SettingsParser import parseSettings
29 from qpalma.OutputFormat import alignment_reconstruct
30 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
32 from qpalma.Lookup import LookupTable
35 jp = os.path.join
38 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
39 """
40 Given a genomic sequence this function calculates random scores for all
41 ocurring splice sites.
42 """
44 acc_pos = []
45 don_pos = []
47 total_size = len(full_chromo_seq)
49 # the first/last 205 pos do not have any predictions
50 for pos,elem in enumerate(full_chromo_seq[205:-205]):
51 if full_chromo_seq[pos-2:pos] == 'ag':
52 acc_pos.append(pos)
53 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
54 don_pos.append(pos)
56 # make pos 1-based
57 acc_pos = [e+1 for e in acc_pos]
58 don_pos = [e+1 for e in don_pos]
60 acc_scores = [0.0]*len(acc_pos)
61 don_scores = [0.0]*len(don_pos)
63 for idx in range(len(acc_pos)):
64 acc_scores[idx] = random.uniform(0.1,1.0)
66 for idx in range(len(don_pos)):
67 don_scores[idx] = random.uniform(0.1,1.0)
69 # recalculate indices and reverse them in order to have positions relative
70 # to positive strand
71 if reverse_strand:
72 acc_pos = [total_size-1-e for e in acc_pos]
73 acc_pos.reverse()
74 acc_scores.reverse()
76 don_pos = [total_size-1-e for e in don_pos]
77 don_pos.reverse()
78 don_scores.reverse()
80 acc_pos = array.array('I',acc_pos)
81 acc_scores = array.array('f',acc_scores)
83 don_pos = array.array('I',don_pos)
84 don_scores = array.array('f',don_scores)
86 return acc_pos,acc_scores,don_pos,don_scores
89 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
90 """
91 """
93 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
94 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
95 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
96 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
98 acc_scores.tofile(open(acc_score_fn, 'wb'))
99 acc_pos.tofile(open(acc_pos_fn, 'wb'))
101 don_scores.tofile(open(don_score_fn, 'wb'))
102 don_pos.tofile(open(don_pos_fn, 'wb'))
105 def create_mini_chromosome():
107 chromo_fn = 'test_data/chromo1.flat'
109 chromo_fh = open(chromo_fn)
111 full_chromo_seq = full_chromo_seq.strip()
113 print full_chromo_seq[:200]
115 # create data for forward strand
116 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
117 print acc_pos[:5]
118 print don_pos[:5]
119 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
121 # create data for reverse strand
122 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
124 total_size = len(full_chromo_seq_rev)
126 print full_chromo_seq_rev[:200]
127 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
128 acc_pos = [total_size-1-e for e in acc_pos]
129 acc_pos.reverse()
130 print acc_pos[:5]
131 don_pos = [total_size-1-e for e in don_pos]
132 don_pos.reverse()
133 print don_pos[:5]
135 #
136 # Remember: The positions are always saved one-based.
137 #
139 #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
141 a = 103
142 b = 255
143 print 'pos: %s'%full_chromo_seq[a:b]
144 print 'neg: %s'%full_chromo_seq_rev[a:b]
146 total_size = len(full_chromo_seq)
147 ap = total_size-b
148 bp = total_size-a
149 print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
152 def test_rev_comp():
153 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
155 class TestQPalmaPrediction(unittest.TestCase):
156 """
157 This class...
158 """
160 def _setUp(self):
161 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
167 def _setUp(self):
168 print
169 self.prediction_set = {}
171 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
172 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
177 id = 1
178 chromo = 3
179 strand = '-'
181 tsize = 23470805
183 genomicSeq_start = tsize - (2038+10)
184 genomicSeq_stop = tsize - 1900 + 10
185 print 'Position: ',
186 print genomicSeq_start,genomicSeq_stop
188 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
191 self.prediction_set[id] = [example]
193 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
194 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
197 currentQualities = [[40]*30+[22]*8]
199 id = 2
200 chromo = 3
201 strand = '+'
203 tsize = 23470805
205 genomicSeq_start = tsize - (2038+10)
206 genomicSeq_stop = tsize - 1900 + 10
207 print 'Position: ',
208 print genomicSeq_start,genomicSeq_stop
210 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
213 self.prediction_set[id] = [example]
216 def setUp(self):
217 self.prediction_set = {}
219 # chr1 + 20-120
223 id = 3
224 chromo = 1
225 strand = '+'
227 genomicSeq_start = 3500
228 genomicSeq_stop = 6500
229 print 'Position: ',
230 print genomicSeq_start,genomicSeq_stop
232 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
235 self.prediction_set[id] = [example]
237 # chr1 - 5000-5100
241 id = 4
242 chromo = 1
243 strand = '-'
245 total_size = 30432563
247 genomicSeq_start = total_size - 6500
248 genomicSeq_stop = total_size - 3500
249 print 'Position: ',
250 print genomicSeq_start,genomicSeq_stop
252 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
255 self.prediction_set[id] = [example]
258 def testAlignments(self):
260 settings = parseSettings('testcase.conf')
262 print self.prediction_set
263 for example_key in self.prediction_set.keys():
264 print 'Current example %d' % example_key
266 for example in self.prediction_set[example_key]:
267 print example
268 print 'size'
269 print len(example)
271 accessWrapper = DataAccessWrapper(settings)
272 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
274 qp = QPalma(seqInfo,True)
275 allPredictions = qp.predict(self.prediction_set,settings)
277 for current_prediction in allPredictions:
278 align_str = print_prediction(current_prediction)
279 print align_str
281 id = current_prediction['id']
283 dna = current_prediction['dna']
284 chromo = current_prediction['chr']
285 strand = current_prediction['strand']
286 start_pos = current_prediction['start_pos']
287 predExons = current_prediction['predExons']
289 numExons = int(math.ceil(len(predExons) / 2))
291 print alignment_reconstruct(current_prediction,numExons)
292 print id,start_pos,predExons
294 print 'Problem counter is %d' % qp.problem_ctr
297 def _testAlignments(self):
301 run['name'] = 'test_run'
302 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
304 param_fn = jp(run_dir,'param_526.pickle')
307 print self.prediction_set
308 for example_key in self.prediction_set.keys():
309 print 'Current example %d' % example_key
311 for example in self.prediction_set[example_key]:
312 print example
313 print 'size'
314 print len(example)
316 # fetch the data needed
317 settings = {}
319 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
320 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
321 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
323 settings['genome_file_fmt'] = 'chr%d.dna.flat'
324 settings['splice_score_file_fmt']= 'contig_%d%s'
326 allowed_fragments = [1]
327 settings['allowed_fragments'] = allowed_fragments
329 accessWrapper = DataAccessWrapper(settings)
330 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
332 qp = QPalma(run,seqInfo,True)
333 allPredictions = qp.predict(self.prediction_set,param)
335 for current_prediction in allPredictions:
336 align_str = print_prediction(current_prediction)
337 print align_str
339 id = current_prediction['id']
341 dna = current_prediction['dna']
342 chromo = current_prediction['chr']
343 strand = current_prediction['strand']
344 start_pos = current_prediction['start_pos']
345 predExons = current_prediction['predExons']
347 numExons = int(math.ceil(len(predExons) / 2))
349 print alignment_reconstruct(current_prediction,numExons)
350 print id,start_pos,predExons
352 print 'Problem counter is %d' % qp.problem_ctr
354 def check_reverse_strand_calculation(id,b,e,seqInfo):
356 seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
358 total_size = seqInfo.getFragmentSize(1)
359 bp = total_size - e
360 ep = total_size - b
361 seqp,acc,don = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
362 seqp = reverse_complement(seqp)
364 return (seq == seqp)
367 def simple_check(settings,seqInfo,lt1):
369 print 'Checking sequences for some intervals...'
371 intervals = [(0,10000),(545,874),(999,1234)]
373 for (b,e) in intervals:
374 print check_reverse_strand_calculation(1,b,e,seqInfo)
376 for (b,e) in [(206,874),(545,874),(999,1234)]:
377 lt1 = LookupTable(settings)
378 _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
379 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
381 print dna == _dna
382 print acc == _acc
383 print don == _don
385 #print [p for p,e in enumerate(acc) if e != -inf]
386 #print [p for p,e in enumerate(_acc) if e != -inf]
389 def checks():
390 settings = {}
392 settings['genome_dir'] = 'test_data/'
393 settings['acceptor_scores_loc'] = 'test_data/acc'
394 settings['donor_scores_loc'] = 'test_data/don'
396 settings['genome_file_fmt'] = 'chromo%d.flat'
397 settings['splice_score_file_fmt']= 'chromo_%d%s'
399 allowed_fragments = [1]
400 settings['allowed_fragments'] = allowed_fragments
402 accessWrapper = DataAccessWrapper(settings)
403 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
404 lt1 = LookupTable(settings)
406 print 'Checking with toy data...'
407 simple_check(settings,seqInfo,lt1)
409 settings = {}
411 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
412 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
413 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
415 settings['genome_file_fmt'] = 'chr%d.dna.flat'
416 settings['splice_score_file_fmt']= 'contig_%d%s'
418 allowed_fragments = [1]
419 settings['allowed_fragments'] = allowed_fragments
421 accessWrapper = DataAccessWrapper(settings)
422 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
423 lt1 = LookupTable(settings)
425 print 'Checking with real data...'
426 simple_check(settings,seqInfo,lt1)
429 def run():
430 print 'Creating some artifical data...'
431 create_mini_chromosome()
432 print 'Performing some checks...'
433 checks()
435 if __name__ == '__main__':
436 run()