350e5a7e63bda46bb976c38f387a11733b687cb4
[qpalma.git] / tests / test_qpalma.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # Written (W) 2008 Fabio De Bona
10 # Copyright (C) 2008 Max-Planck-Society
11
12
13 import cPickle
14 import random
15 import math
16 import numpy
17 import os.path
18 import pdb
19 import array
20 import unittest
21
22 from qpalma.qpalma_main import QPalma
23 from qpalma.utils import print_prediction
24
25 from qpalma.Run import Run
26
27 from qpalma.OutputFormat import alignment_reconstruct
28
29 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
30
31 jp = os.path.join
32
33
34 def createScoresForSequence(full_chromo_seq):
35 """
36 Given a genomic sequence this function calculates random scores for all
37 ocurring splice sites.
38 """
39
40 acc_pos = []
41 don_pos = []
42
43 # the first/last 205 pos do not have any predictions
44 for pos,elem in enumerate(full_chromo_seq[205:-205]):
45 if full_chromo_seq[pos-2:pos] == 'ag':
46 acc_pos.append(pos)
47 if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
48 don_pos.append(pos)
49
50 # make pos 1-based
51 acc_pos = [e+1 for e in acc_pos]
52 don_pos = [e+1 for e in don_pos]
53
54 acc_scores = [0.0]*len(acc_pos)
55 don_scores = [0.0]*len(don_pos)
56
57 for idx in range(len(acc_pos)):
58 acc_scores[idx] = random.uniform(0.1,1.0)
59
60 for idx in range(len(don_pos)):
61 don_scores[idx] = random.uniform(0.1,1.0)
62
63 acc_pos = array.array('I',acc_pos)
64 acc_scores = array.array('f',acc_scores)
65
66 don_pos = array.array('I',don_pos)
67 don_scores = array.array('f',don_scores)
68
69 return acc_pos,acc_scores,don_pos,don_scores
70
71
72 def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
73 """
74 """
75
76 acc_score_fn = 'test_data/acc/chromo_1%s.Conf'%strand
77 acc_pos_fn = 'test_data/acc/chromo_1%s.pos'%strand
78 don_score_fn = 'test_data/don/chromo_1%s.Conf'%strand
79 don_pos_fn = 'test_data/don/chromo_1%s.pos'%strand
80
81 acc_scores.tofile(open(acc_score_fn, 'wb'))
82 acc_pos.tofile(open(acc_pos_fn, 'wb'))
83
84 don_scores.tofile(open(don_score_fn, 'wb'))
85 don_pos.tofile(open(don_pos_fn, 'wb'))
86
87
88
89 def create_mini_chromosome():
90
91 chromo_fn = 'test_data/chromo1.flat'
92
93 chromo_fh = open(chromo_fn)
94 full_chromo_seq = chromo_fh.read()
95 full_chromo_seq = full_chromo_seq.strip()
96
97 # create data for forward strand
98 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq)
99 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
100
101 # create data for reverse strand
102 full_chromo_seq_rev = reverse_complement(full_chromo_seq)
103 acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev)
104
105 # positions are always stored relative to positive strand
106 #acc_scores.reverse()
107 #acc_scores.
108
109 don_scores.reverse()
110
111 saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
112
113
114
115 class TestQPalmaPrediction(unittest.TestCase):
116 """
117 This class...
118 """
119
120 def _setUp(self):
121 data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
122
123 self.prediction_set = cPickle.load(open(data_fn))
124
125
126
127 def _setUp(self):
128 print
129 self.prediction_set = {}
130
131 # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
132 # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
133
134 read = 'tattttggaagttccaattcccttaatacatctcacag'
135 currentQualities = [[30]*len(read)]
136
137 id = 1
138 chromo = 3
139 strand = '-'
140
141 tsize = 23470805
142
143 genomicSeq_start = tsize - (2038+10)
144 genomicSeq_stop = tsize - 1900 + 10
145 print 'Position: ',
146 print genomicSeq_start,genomicSeq_stop
147
148 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
149
150 example = (currentSeqInfo,read,currentQualities)
151 self.prediction_set[id] = [example]
152
153 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
154 # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
155
156 read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
157 currentQualities = [[40]*30+[22]*8]
158
159 id = 2
160 chromo = 3
161 strand = '+'
162
163 tsize = 23470805
164
165 genomicSeq_start = tsize - (2038+10)
166 genomicSeq_stop = tsize - 1900 + 10
167 print 'Position: ',
168 print genomicSeq_start,genomicSeq_stop
169
170 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
171
172 example = (currentSeqInfo,read,currentQualities)
173 self.prediction_set[id] = [example]
174
175
176 def setUp(self):
177 self.prediction_set = {}
178
179 # chr1 + 20-120
180 read = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
181 currentQualities = [[40]*len(read)]
182
183 id = 3
184 chromo = 1
185 strand = '+'
186
187 genomicSeq_start = 3500
188 genomicSeq_stop = 6500
189 print 'Position: ',
190 print genomicSeq_start,genomicSeq_stop
191
192 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
193
194 example = (currentSeqInfo,read,currentQualities)
195 self.prediction_set[id] = [example]
196
197 # chr1 - 5000-5100
198 read = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
199 currentQualities = [[40]*len(read)]
200
201 id = 4
202 chromo = 1
203 strand = '-'
204
205 total_size = 30432563
206
207 genomicSeq_start = total_size - 6500
208 genomicSeq_stop = total_size - 3500
209 print 'Position: ',
210 print genomicSeq_start,genomicSeq_stop
211
212 currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
213
214 example = (currentSeqInfo,read,currentQualities)
215 self.prediction_set[id] = [example]
216
217
218 def testAlignments(self):
219 run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
220
221 run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
222 run['name'] = 'test_run'
223 run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
224
225 param_fn = jp(run_dir,'param_526.pickle')
226 param = cPickle.load(open(param_fn))
227
228 print self.prediction_set
229 for example_key in self.prediction_set.keys():
230 print 'Current example %d' % example_key
231
232 for example in self.prediction_set[example_key]:
233 print example
234 print 'size'
235 print len(example)
236
237 # fetch the data needed
238 settings = {}
239
240 settings['genome_dir'] = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
241 settings['acceptor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
242 settings['donor_scores_loc'] = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
243
244 settings['genome_file_fmt'] = 'chr%d.dna.flat'
245 settings['splice_score_file_fmt']= 'contig_%d%s'
246
247 allowed_fragments = [1]
248 settings['allowed_fragments'] = allowed_fragments
249
250 accessWrapper = DataAccessWrapper(settings)
251 seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
252
253 qp = QPalma(run,seqInfo,True)
254 allPredictions = qp.predict(self.prediction_set,param)
255
256 for current_prediction in allPredictions:
257 align_str = print_prediction(current_prediction)
258 print align_str
259
260 id = current_prediction['id']
261 seq = current_prediction['read']
262 dna = current_prediction['dna']
263 chromo = current_prediction['chr']
264 strand = current_prediction['strand']
265 start_pos = current_prediction['start_pos']
266 predExons = current_prediction['predExons']
267
268 numExons = int(math.ceil(len(predExons) / 2))
269
270 print alignment_reconstruct(current_prediction,numExons)
271 print id,start_pos,predExons
272
273 print 'Problem counter is %d' % qp.problem_ctr
274
275
276 if __name__ == '__main__':
277 create_mini_chromosome()
278 #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
279 #unittest.TextTestRunner(verbosity=2).run(suite)