5f62f40f9b4e36c777312eb14a82bb284c003da6
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This program is free software; you can redistribute it and/or modify
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
13 import cPickle
14 import random
15 import math
16 import numpy
17 import os.path
18 import pdb
19 import array
20 import unittest
22 from qpalma.qpalma_main import QPalma
23 from qpalma.utils import print_prediction
25 from qpalma.Run import Run
27 from qpalma.SettingsParser import parseSettings
28 from qpalma.OutputFormat import alignment_reconstruct
29 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
31 from qpalma.Lookup import LookupTable
34 jp = os.path.join
37 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
38 """
39 Given a genomic sequence this function calculates random scores for all
40 ocurring splice sites.
41 """
43 acc_pos = []
44 don_pos = []
46 total_size = len(full_chromo_seq)
48 # the first/last 205 pos do not have any predictions
49 for pos,elem in enumerate(full_chromo_seq[205:-205]):
50 if full_chromo_seq[pos-2:pos] == 'ag':
51 acc_pos.append(pos)
52 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
53 don_pos.append(pos)
55 # make pos 1-based
56 acc_pos = [e+1 for e in acc_pos]
57 don_pos = [e+1 for e in don_pos]
59 acc_scores = [0.0]*len(acc_pos)
60 don_scores = [0.0]*len(don_pos)
62 for idx in range(len(acc_pos)):
63 acc_scores[idx] = random.uniform(0.1,1.0)
65 for idx in range(len(don_pos)):
66 don_scores[idx] = random.uniform(0.1,1.0)
68 # recalculate indices and reverse them in order to have positions relative
69 # to positive strand
70 if reverse_strand:
71 acc_pos = [total_size-1-e for e in acc_pos]
72 acc_pos.reverse()
73 acc_scores.reverse()
75 don_pos = [total_size-1-e for e in don_pos]
76 don_pos.reverse()
77 don_scores.reverse()
79 acc_pos = array.array('I',acc_pos)
80 acc_scores = array.array('f',acc_scores)
82 don_pos = array.array('I',don_pos)
83 don_scores = array.array('f',don_scores)
85 return acc_pos,acc_scores,don_pos,don_scores
88 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
89 """
90 """
92 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
93 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
94 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
95 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
97 acc_scores.tofile(open(acc_score_fn, 'wb'))
98 acc_pos.tofile(open(acc_pos_fn, 'wb'))
100 don_scores.tofile(open(don_score_fn, 'wb'))
101 don_pos.tofile(open(don_pos_fn, 'wb'))
104 def create_mini_chromosome():
106 chromo_fn = 'test_data/chromo1.flat'
108 chromo_fh = open(chromo_fn)
110 full_chromo_seq = full_chromo_seq.strip()
112 print full_chromo_seq[:200]
114 # create data for forward strand
115 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
116 print acc_pos[:5]
117 print don_pos[:5]
118 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
120 # create data for reverse strand
121 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
123 total_size = len(full_chromo_seq_rev)
125 print full_chromo_seq_rev[:200]
126 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
127 acc_pos = [total_size-1-e for e in acc_pos]
128 acc_pos.reverse()
129 print acc_pos[:5]
130 don_pos = [total_size-1-e for e in don_pos]
131 don_pos.reverse()
132 print don_pos[:5]
134 #
135 # Remember: The positions are always saved one-based.
136 #
138 #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
140 a = 103
141 b = 255
142 print 'pos: %s'%full_chromo_seq[a:b]
143 print 'neg: %s'%full_chromo_seq_rev[a:b]
145 total_size = len(full_chromo_seq)
146 ap = total_size-b
147 bp = total_size-a
148 print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
151 def test_rev_comp():
152 get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
154 class TestQPalmaPrediction(unittest.TestCase):
155 """
156 This class...
157 """
159 def _setUp(self):
160 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
166 def _setUp(self):
167 print
168 self.prediction_set = {}
170 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
171 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
176 id = 1
177 chromo = 3
178 strand = '-'
180 tsize = 23470805
182 genomicSeq_start = tsize - (2038+10)
183 genomicSeq_stop = tsize - 1900 + 10
184 print 'Position: ',
185 print genomicSeq_start,genomicSeq_stop
187 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
190 self.prediction_set[id] = [example]
192 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
193 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
196 currentQualities = [[40]*30+[22]*8]
198 id = 2
199 chromo = 3
200 strand = '+'
202 tsize = 23470805
204 genomicSeq_start = tsize - (2038+10)
205 genomicSeq_stop = tsize - 1900 + 10
206 print 'Position: ',
207 print genomicSeq_start,genomicSeq_stop
209 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
212 self.prediction_set[id] = [example]
215 def setUp(self):
216 self.prediction_set = {}
218 # chr1 + 20-120
222 id = 3
223 chromo = 1
224 strand = '+'
226 genomicSeq_start = 3500
227 genomicSeq_stop = 6500
228 print 'Position: ',
229 print genomicSeq_start,genomicSeq_stop
231 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
234 self.prediction_set[id] = [example]
236 # chr1 - 5000-5100
240 id = 4
241 chromo = 1
242 strand = '-'
244 total_size = 30432563
246 genomicSeq_start = total_size - 6500
247 genomicSeq_stop = total_size - 3500
248 print 'Position: ',
249 print genomicSeq_start,genomicSeq_stop
251 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
254 self.prediction_set[id] = [example]
257 def testAlignments(self):
259 settings = parseSettings('testcase.conf')
261 print self.prediction_set
262 for example_key in self.prediction_set.keys():
263 print 'Current example %d' % example_key
265 for example in self.prediction_set[example_key]:
266 print example
267 print 'size'
268 print len(example)
270 accessWrapper = DataAccessWrapper(settings)
271 seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
273 qp = QPalma(seqInfo,True)
274 allPredictions = qp.predict(self.prediction_set,settings)
276 for current_prediction in allPredictions:
277 align_str = print_prediction(current_prediction)
278 print align_str
280 id = current_prediction['id']
282 dna = current_prediction['dna']
283 chromo = current_prediction['chr']
284 strand = current_prediction['strand']
285 start_pos = current_prediction['start_pos']
286 predExons = current_prediction['predExons']
288 numExons = int(math.ceil(len(predExons) / 2))
290 print alignment_reconstruct(current_prediction,numExons)
291 print id,start_pos,predExons
293 print 'Problem counter is %d' % qp.problem_ctr
296 def _testAlignments(self):
300 run['name'] = 'test_run'
301 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
303 param_fn = jp(run_dir,'param_526.pickle')
306 print self.prediction_set
307 for example_key in self.prediction_set.keys():
308 print 'Current example %d' % example_key
310 for example in self.prediction_set[example_key]:
311 print example
312 print 'size'
313 print len(example)
315 # fetch the data needed
316 settings = {}
318 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
319 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
320 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
322 settings['genome_file_fmt'] = 'chr%d.dna.flat'
323 settings['splice_score_file_fmt']= 'contig_%d%s'
325 allowed_fragments = [1]
326 settings['allowed_fragments'] = allowed_fragments
328 accessWrapper = DataAccessWrapper(settings)
329 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
331 qp = QPalma(run,seqInfo,True)
332 allPredictions = qp.predict(self.prediction_set,param)
334 for current_prediction in allPredictions:
335 align_str = print_prediction(current_prediction)
336 print align_str
338 id = current_prediction['id']
340 dna = current_prediction['dna']
341 chromo = current_prediction['chr']
342 strand = current_prediction['strand']
343 start_pos = current_prediction['start_pos']
344 predExons = current_prediction['predExons']
346 numExons = int(math.ceil(len(predExons) / 2))
348 print alignment_reconstruct(current_prediction,numExons)
349 print id,start_pos,predExons
351 print 'Problem counter is %d' % qp.problem_ctr
354 def simple_check():
355 # fetch the data needed
356 settings = {}
358 #settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
359 #settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
360 #settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
362 #settings['genome_file_fmt'] = 'chr%d.dna.flat'
363 #settings['splice_score_file_fmt']= 'contig_%d%s'
365 #allowed_fragments = [1]
366 #settings['allowed_fragments'] = allowed_fragments
368 settings['genome_dir'] = 'test_data/'
369 settings['acceptor_scores_loc'] = 'test_data/acc'
370 settings['donor_scores_loc'] = 'test_data/don'
372 settings['genome_file_fmt'] = 'chromo%d.flat'
373 settings['splice_score_file_fmt']= 'chromo_%d%s'
375 allowed_fragments = [1]
376 settings['allowed_fragments'] = allowed_fragments
378 accessWrapper = DataAccessWrapper(settings)
379 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
381 b = 0
382 e = 10000
383 seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
385 total_size = seqInfo.getFragmentSize(1)
386 bp = total_size - e
387 ep = total_size - b
388 seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
389 seqp = reverse_complement(seqp)
391 print seq[0:100] == seqp[0:100]
392 print seq[534:1652] == seqp[534:1652]
393 print seq[-100:] == seqp[-100:]
395 lt1 = LookupTable(settings)
396 dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
398 print seq[:100] == seqp[:100] == dna[:100]
399 print seq[-100:] == seqp[-100:] == dna[-100:]
401 b = 206
402 e = 300
403 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
405 lt1 = LookupTable(settings)
406 _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
409 print dna == _dna
410 print acc == _acc
411 print don == _don
413 from numpy import inf
414 print [p for p,e in enumerate(acc) if e != -inf]
415 print [p for p,e in enumerate(_acc) if e != -inf]
419 if __name__ == '__main__':
420 #create_mini_chromosome()
421 simple_check()