350e5a7e63bda46bb976c38f387a11733b687cb4
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # This program is free software; you can redistribute it and/or modify
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
13 import cPickle
14 import random
15 import math
16 import numpy
17 import os.path
18 import pdb
19 import array
20 import unittest
22 from qpalma.qpalma_main import QPalma
23 from qpalma.utils import print_prediction
25 from qpalma.Run import Run
27 from qpalma.OutputFormat import alignment_reconstruct
29 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
31 jp = os.path.join
34 def createScoresForSequence(full_chromo_seq):
35 """
36 Given a genomic sequence this function calculates random scores for all
37 ocurring splice sites.
38 """
40 acc_pos = []
41 don_pos = []
43 # the first/last 205 pos do not have any predictions
44 for pos,elem in enumerate(full_chromo_seq[205:-205]):
45 if full_chromo_seq[pos-2:pos] == 'ag':
46 acc_pos.append(pos)
47 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
48 don_pos.append(pos)
50 # make pos 1-based
51 acc_pos = [e+1 for e in acc_pos]
52 don_pos = [e+1 for e in don_pos]
54 acc_scores = [0.0]*len(acc_pos)
55 don_scores = [0.0]*len(don_pos)
57 for idx in range(len(acc_pos)):
58 acc_scores[idx] = random.uniform(0.1,1.0)
60 for idx in range(len(don_pos)):
61 don_scores[idx] = random.uniform(0.1,1.0)
63 acc_pos = array.array('I',acc_pos)
64 acc_scores = array.array('f',acc_scores)
66 don_pos = array.array('I',don_pos)
67 don_scores = array.array('f',don_scores)
69 return acc_pos,acc_scores,don_pos,don_scores
72 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
73 """
74 """
76 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
77 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
78 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
79 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
81 acc_scores.tofile(open(acc_score_fn, 'wb'))
82 acc_pos.tofile(open(acc_pos_fn, 'wb'))
84 don_scores.tofile(open(don_score_fn, 'wb'))
85 don_pos.tofile(open(don_pos_fn, 'wb'))
89 def create_mini_chromosome():
91 chromo_fn = 'test_data/chromo1.flat'
93 chromo_fh = open(chromo_fn)
95 full_chromo_seq = full_chromo_seq.strip()
97 # create data for forward strand
98 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq)
99 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
101 # create data for reverse strand
102 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
103 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev)
105 # positions are always stored relative to positive strand
106 #acc_scores.reverse()
107 #acc_scores.
109 don_scores.reverse()
111 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
115 class TestQPalmaPrediction(unittest.TestCase):
116 """
117 This class...
118 """
120 def _setUp(self):
121 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
127 def _setUp(self):
128 print
129 self.prediction_set = {}
131 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
132 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
137 id = 1
138 chromo = 3
139 strand = '-'
141 tsize = 23470805
143 genomicSeq_start = tsize - (2038+10)
144 genomicSeq_stop = tsize - 1900 + 10
145 print 'Position: ',
146 print genomicSeq_start,genomicSeq_stop
148 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
151 self.prediction_set[id] = [example]
153 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
154 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
157 currentQualities = [[40]*30+[22]*8]
159 id = 2
160 chromo = 3
161 strand = '+'
163 tsize = 23470805
165 genomicSeq_start = tsize - (2038+10)
166 genomicSeq_stop = tsize - 1900 + 10
167 print 'Position: ',
168 print genomicSeq_start,genomicSeq_stop
170 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
173 self.prediction_set[id] = [example]
176 def setUp(self):
177 self.prediction_set = {}
179 # chr1 + 20-120
183 id = 3
184 chromo = 1
185 strand = '+'
187 genomicSeq_start = 3500
188 genomicSeq_stop = 6500
189 print 'Position: ',
190 print genomicSeq_start,genomicSeq_stop
192 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
195 self.prediction_set[id] = [example]
197 # chr1 - 5000-5100
201 id = 4
202 chromo = 1
203 strand = '-'
205 total_size = 30432563
207 genomicSeq_start = total_size - 6500
208 genomicSeq_stop = total_size - 3500
209 print 'Position: ',
210 print genomicSeq_start,genomicSeq_stop
212 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
215 self.prediction_set[id] = [example]
218 def testAlignments(self):
222 run['name'] = 'test_run'
223 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
225 param_fn = jp(run_dir,'param_526.pickle')
228 print self.prediction_set
229 for example_key in self.prediction_set.keys():
230 print 'Current example %d' % example_key
232 for example in self.prediction_set[example_key]:
233 print example
234 print 'size'
235 print len(example)
237 # fetch the data needed
238 settings = {}
240 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
241 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
242 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
244 settings['genome_file_fmt'] = 'chr%d.dna.flat'
245 settings['splice_score_file_fmt']= 'contig_%d%s'
247 allowed_fragments = [1]
248 settings['allowed_fragments'] = allowed_fragments
250 accessWrapper = DataAccessWrapper(settings)
251 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
253 qp = QPalma(run,seqInfo,True)
254 allPredictions = qp.predict(self.prediction_set,param)
256 for current_prediction in allPredictions:
257 align_str = print_prediction(current_prediction)
258 print align_str
260 id = current_prediction['id']