+ added some testcases for sequence_utils module
[qpalma.git] / tests / test_sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import unittest
6 import numpy
7
8 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
9 from qpalma.Lookup import LookupTable
10
11 class TestSequenceUtils(unittest.TestCase):
12
13
14 def setUp(self):
15 self.strands = ['+','']
16
17
18 def testThalianaData(self):
19 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
20 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
21 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
22
23 g_fmt = 'chr%d.dna.flat'
24 s_fmt = 'contig_%d%s'
25
26 num_chromo = 6
27
28 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
29 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
30
31 #for chromo in range(1,num_chromo):
32 # dna,acc,don = seqInfo.get_seq_and_scores(chromo,'+',0,1369)
33 # dna_,acc_,don_ = seqInfo.get_seq_and_scores(chromo,'-',0,1369)
34
35 # self.assertEqual(len(dna),len(dna_))
36 # self.assertEqual(dna,dna_)
37
38 dna,acc,don = seqInfo.get_seq_and_scores(1,'+',0,1369)
39 dna,acc,don = seqInfo.get_seq_and_scores(2,'+',0,1369)
40 dna,acc,don = seqInfo.get_seq_and_scores(3,'+',0,1369)
41 dna,acc,don = seqInfo.get_seq_and_scores(4,'+',0,1369)
42 dna,acc,don = seqInfo.get_seq_and_scores(5,'+',0,1369)
43 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',0,1369)
44 dna,acc,don = seqInfo.get_seq_and_scores(2,'-',0,1369)
45 dna,acc,don = seqInfo.get_seq_and_scores(3,'-',0,1369)
46 dna,acc,don = seqInfo.get_seq_and_scores(4,'-',0,1369)
47 dna,acc,don = seqInfo.get_seq_and_scores(5,'-',0,1369)
48
49
50 def testLyrataData(self):
51 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
52 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
53 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
54
55 g_fmt = 'contig%d.dna.flat'
56 s_fmt = 'contig_%d%s'
57
58 num_chromo = 1099
59
60 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
61 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
62
63 dna,acc,don = seqInfo.get_seq_and_scores(1,'+',0,1369)
64 dna,acc,don = seqInfo.get_seq_and_scores(2,'+',0,1369)
65 dna,acc,don = seqInfo.get_seq_and_scores(3,'+',0,1369)
66 dna,acc,don = seqInfo.get_seq_and_scores(4,'+',0,1369)
67 dna,acc,don = seqInfo.get_seq_and_scores(5,'+',0,1369)
68 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',0,1369)
69 dna,acc,don = seqInfo.get_seq_and_scores(2,'-',0,1369)
70 dna,acc,don = seqInfo.get_seq_and_scores(3,'-',0,1369)
71 dna,acc,don = seqInfo.get_seq_and_scores(4,'-',0,1369)
72 dna,acc,don = seqInfo.get_seq_and_scores(5,'-',0,1369)
73 dna,acc,don = seqInfo.get_seq_and_scores(45,'-',0,1369)
74 dna,acc,don = seqInfo.get_seq_and_scores(45,'+',0,1369)
75
76 print 'Finished'
77 #num_tests = 10
78 #for chromo in range(1,6):
79 # for strand in ['+','-']:
80 # for test_idx in range(num_tests):
81 # if strand == '-':
82 # size = seqInfo.chromo_sizes[chromo+7]
83 # else:
84 # size = seqInfo.chromo_sizes[chromo]
85 # begin = random.randint(0,size)
86 # end = random.randint(b,size)
87 # dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
88
89
90 def tearDown(self):
91 pass
92
93
94 class TestLookupTable(unittest.TestCase):
95
96 def setUp(self):
97 pass
98
99
100 def testTableThalianaData(self):
101 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
102 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
103 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
104
105 g_fmt = 'chr%d.dna.flat'
106 s_fmt = 'contig_%d%s'
107
108 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
109
110
111 def testTableLyrataData(self):
112 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
113 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
114 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
115
116 g_fmt = 'contig%d.dna.flat'
117 s_fmt = 'contig_%d%s'
118
119 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
120
121
122 def tearDown(self):
123 pass
124
125
126 def check_wrapper():
127 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
128 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
129 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
130
131 g_fmt = 'contig%d.dna.flat'
132 s_fmt = 'contig_%d%s.Conf_cum'
133
134 test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
135
136 for idx in range(1,100):
137 pos = test.get_genomic_fragment_fn(idx,'+')
138 neg = test.get_genomic_fragment_fn(idx,'-')
139 print pos,neg
140 assert os.path.exists(pos)
141 assert os.path.exists(neg)
142
143 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
144 print acc_fn,don_fn
145 assert os.path.exists(acc_fn)
146 assert os.path.exists(don_fn)
147 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
148 print acc_fn,don_fn
149 assert os.path.exists(acc_fn)
150 assert os.path.exists(don_fn)
151
152
153 def check_positions(dna,acc,don,offset=0):
154 first_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][:offset]
155 first_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][:offset]
156
157 first_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][:offset]
158 first_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][:offset]
159
160 last_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][-offset:]
161 last_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][-offset:]
162
163 last_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][-offset:]
164 last_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][-offset:]
165
166 assert first_gt_tuple_pos == first_don_scores
167 assert first_ag_tuple_pos == first_acc_scores
168
169 assert last_gt_tuple_pos == last_don_scores
170 assert last_ag_tuple_pos == last_acc_scores
171
172
173 def run():
174 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
175 chromo = 1
176 strand = '+'
177 begin = 0
178 end = 100000000
179 dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
180
181 check_positions(dna,acc,don,100)
182
183
184 def run2():
185 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
186 chromo = 8
187 strand = '-'
188 begin = 0
189 end = 100000000
190 dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
191
192 check_positions(dna,acc,don,100)
193
194 pdb.set_trace()
195
196
197 def run3():
198 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
199
200 begin = 0
201 end = 100000000
202 full_pos_dna = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
203
204 begin = 0
205 end = 500
206 dna = qpalma.sequence_utils.get_seq_and_scores(8,'+',begin,end,flat_files,True)
207
208 pos_dna = full_pos_dna[-500:]
209 r_dna = qpalma.sequence_utils.reverse_complement(dna)
210 pos_dna == r_dna
211 pdb.set_trace()
212
213 def run4():
214 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
215 seqInfo = SeqSpliceInfo(flat_files,range(1,6))
216 chromo = 3
217 strand = '+'
218 begin = 200
219 end = 1200
220 dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
221
222 check_positions(dna,acc,don)
223
224 pdb.set_trace()
225
226
227 def check_negative_strand(b,e):
228 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
229 chromo = 8
230 strand = '-'
231 begin = b
232 end = e
233 dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
234
235 pdb.set_trace()
236
237 check_positions(dna,acc,don)
238
239 print 'fine'
240
241 def check_positive_strand(b,e):
242 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
243 chromo = 1
244 strand = '+'
245 begin = b
246 end = e
247 dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
248
249 check_positions(dna,acc,don)
250 print 'fine'
251
252
253 if __name__ == '__main__':
254 #run()
255 #run2()
256 #run3(
257 #run4()
258 #perform_checks()
259 suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
260 unittest.TextTestRunner(verbosity=2).run(suite)
261