00e88703775a2743a087ce9960a1a1756a96d83e
[qpalma.git] / tests / test_sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import unittest
6 import numpy
7
8 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
9 from qpalma.Lookup import LookupTable
10
11 class TestSequenceUtils(unittest.TestCase):
12
13
14 def setUp(self):
15 self.strands = ['+','']
16
17 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
18 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
19 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
20
21 g_fmt = 'chr%d.dna.flat'
22 s_fmt = 'contig_%d%s'
23
24 num_chromo = 6
25
26 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
27 self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
28
29 print self.seqInfo.chromo_sizes
30
31 #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
32
33
34 def testThalianaDataExamples(self):
35 seq = 'TGAAAACAGGAACGGATTGGAGAAAGGCGTCTCGTCAT'.lower()
36 chromo = 3
37 strand = '+'
38 pos = 19246391
39
40 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
41 self.assertEqual(seq,dna)
42
43 seq = 'AGGCAATGAAACTGATGCATTGGACTTGACGGGTGTTG'.lower()
44 chromo = 5
45 strand = '+'
46 pos = 15394760
47 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
48 self.assertEqual(seq,dna)
49
50 seq = 'TCTTGGTGGAGGAGCTAACACCGTAGCTGACGGTTACA'.lower()
51 chromo = 4
52 strand = '+'
53 pos = 16709206
54 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
55 self.assertEqual(seq,dna)
56
57 seq = 'TTGGAAGACAGAGTCAACCATACCCTTGCCTCTGGTGA'.lower()
58 chromo = 2
59 strand = '+'
60 pos = 16579649
61 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
62 self.assertEqual(seq,dna)
63
64 seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
65 seq = reverse_complement(seq)
66 chromo = 1
67 strand = '-'
68 pos = 10475515
69 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
70 self.assertEqual(seq,dna)
71
72 seq = 'TTTTTCCCTTCTAGAAGACCGTAAAGGTAAACTTCTAA'.lower()
73 seq = reverse_complement(seq)
74 chromo = 3
75 strand = '-'
76 pos = 17143420
77 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
78 self.assertEqual(seq,dna)
79
80 seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
81 seq = reverse_complement(seq)
82 chromo = 4
83 strand = '-'
84 pos = 18083761
85 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
86 self.assertEqual(seq,dna)
87
88 window = 113
89 seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
90 seq = reverse_complement(seq)
91 chromo = 4
92 strand = '-'
93 pos = 18083761-window
94 dna,acc,don = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38+2*window,False)
95 self.assertEqual(seq,dna[window:-window])
96
97 #print dna
98 #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],acc))
99 #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],don))
100
101
102 def _testThalianaDataGeneric(self):
103
104 dna,acc,don = self.seqInfo.get_seq_and_scores(1,'+',1000,1369)
105 dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'+',1000,1369,'')
106
107 self.assertEqual(dna,dna_)
108 self.assertEqual(acc,acc_)
109 self.assertEqual(don,don_)
110
111 dna,acc,don = self.seqInfo.get_seq_and_scores(1,'-',1000,1369)
112 dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'-',1000,1369,'')
113
114 self.assertEqual(dna,dna_)
115 self.assertEqual(acc,acc_)
116 self.assertEqual(don,don_)
117
118 #dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1000,1369)
119 #dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1000,1369)
120 #dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1000,1369)
121 #dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1000,1369)
122
123 #dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1000,1369)
124 #dna_,acc_,don_ = lt1.get_seq_and_scores(1,'-',1000,1369,'')
125
126 #self.assertEqual(dna,dna_)
127 #self.assertEqual(acc,acc_)
128 #self.assertEqual(don,don_)
129
130
131 #dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1000,1369)
132 #dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1000,1369)
133 #dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1000,1369)
134 #dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1000,1369)
135
136
137 def _testLyrataData(self):
138 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
139 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
140 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
141
142 g_fmt = 'contig%d.dna.flat'
143 s_fmt = 'contig_%d%s'
144
145 num_chromo = 1099
146
147 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
148 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
149
150 dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
151 dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1,1369)
152 dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1,1369)
153 dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1,1369)
154 dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1,1369)
155 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1,1369)
156 dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1,1369)
157 dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1,1369)
158 dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1,1369)
159 dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1,1369)
160 dna,acc,don = seqInfo.get_seq_and_scores(45,'-',1,1369)
161 dna,acc,don = seqInfo.get_seq_and_scores(45,'+',1,1369)
162
163 print 'Finished'
164 #num_tests = 10
165 #for chromo in range(1,6):
166 # for strand in ['+','-']:
167 # for test_idx in range(num_tests):
168 # if strand == '-':
169 # size = seqInfo.chromo_sizes[chromo+7]
170 # else:
171 # size = seqInfo.chromo_sizes[chromo]
172 # begin = random.randint(0,size)
173 # end = random.randint(b,size)
174 # dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
175
176
177 def tearDown(self):
178 pass
179
180
181 class TestLookupTable(unittest.TestCase):
182
183 def setUp(self):
184 pass
185
186
187 def testTableThalianaData(self):
188 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
189 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
190 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
191
192 g_fmt = 'chr%d.dna.flat'
193 s_fmt = 'contig_%d%s'
194
195 num_chromo = 2
196
197 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,num_chromo))
198
199 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
200 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
201
202 seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
203 seq = reverse_complement(seq)
204 chromo = 1
205 strand = '-'
206 pos = 10475515
207 dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
208 self.assertEqual(seq,dna)
209
210 dna = lt1.get_seq_and_scores(chromo,strand,pos,pos+38,True)
211 pdb.set_trace()
212 self.assertEqual(seq,dna)
213
214 dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
215 dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
216
217
218
219 def _testTableLyrataData(self):
220 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
221 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
222 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
223
224 g_fmt = 'contig%d.dna.flat'
225 s_fmt = 'contig_%d%s'
226
227 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(0,1099))
228
229
230 def tearDown(self):
231 pass
232
233
234 def check_wrapper():
235 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
236 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
237 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
238
239 g_fmt = 'contig%d.dna.flat'
240 s_fmt = 'contig_%d%s.Conf_cum'
241
242 test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
243
244 for idx in range(1,100):
245 pos = test.get_genomic_fragment_fn(idx,'+')
246 neg = test.get_genomic_fragment_fn(idx,'-')
247 print pos,neg
248 assert os.path.exists(pos)
249 assert os.path.exists(neg)
250
251 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
252 print acc_fn,don_fn
253 assert os.path.exists(acc_fn)
254 assert os.path.exists(don_fn)
255 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
256 print acc_fn,don_fn
257 assert os.path.exists(acc_fn)
258 assert os.path.exists(don_fn)
259
260
261 if __name__ == '__main__':
262 #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
263 suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
264 unittest.TextTestRunner(verbosity=2).run(suite)