+ added some testcases
[qpalma.git] / tests / test_sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import pdb
5 import unittest
6 import numpy
7
8 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
9 from qpalma.Lookup import LookupTable
10
11 class TestSequenceUtils(unittest.TestCase):
12
13
14 def setUp(self):
15 self.strands = ['+','']
16
17 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
18 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
19 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
20
21 g_fmt = 'chr%d.dna.flat'
22 s_fmt = 'contig_%d%s'
23
24 num_chromo = 6
25
26 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
27 self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
28
29 #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
30
31
32 def testThalianaDataExamples(self):
33 seq = 'TGAAAACAGGAACGGATTGGAGAAAGGCGTCTCGTCAT'.lower()
34 chromo = 3
35 strand = '+'
36 pos = 19246391
37
38 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
39 self.assertEqual(seq,dna)
40
41 seq = 'AGGCAATGAAACTGATGCATTGGACTTGACGGGTGTTG'.lower()
42 chromo = 5
43 strand = '+'
44 pos = 15394760
45 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
46 self.assertEqual(seq,dna)
47
48 seq = 'TCTTGGTGGAGGAGCTAACACCGTAGCTGACGGTTACA'.lower()
49 chromo = 4
50 strand = '+'
51 pos = 16709206
52 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
53 self.assertEqual(seq,dna)
54
55 seq = 'TTGGAAGACAGAGTCAACCATACCCTTGCCTCTGGTGA'.lower()
56 chromo = 2
57 strand = '+'
58 pos = 16579649
59 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
60 self.assertEqual(seq,dna)
61
62 seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
63 seq = reverse_complement(seq)
64 chromo = 1
65 strand = '-'
66 pos = 10475515
67 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
68 self.assertEqual(seq,dna)
69
70 seq = 'TTTTTCCCTTCTAGAAGACCGTAAAGGTAAACTTCTAA'.lower()
71 seq = reverse_complement(seq)
72 chromo = 3
73 strand = '-'
74 pos = 17143420
75 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
76 self.assertEqual(seq,dna)
77
78 seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
79 seq = reverse_complement(seq)
80 chromo = 4
81 strand = '-'
82 pos = 18083761
83 dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
84 self.assertEqual(seq,dna)
85
86 window = 113
87 seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
88 seq = reverse_complement(seq)
89 chromo = 4
90 strand = '-'
91 pos = 18083761-window
92 dna,acc,don = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38+2*window,False)
93 self.assertEqual(seq,dna[window:-window])
94
95 #print dna
96 #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],acc))
97 #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],don))
98
99
100 def _testThalianaDataGeneric(self):
101
102 dna,acc,don = self.seqInfo.get_seq_and_scores(1,'+',1000,1369)
103 dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'+',1000,1369,'')
104
105 self.assertEqual(dna,dna_)
106 self.assertEqual(acc,acc_)
107 self.assertEqual(don,don_)
108
109 dna,acc,don = self.seqInfo.get_seq_and_scores(1,'-',1000,1369)
110 dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'-',1000,1369,'')
111
112 self.assertEqual(dna,dna_)
113 self.assertEqual(acc,acc_)
114 self.assertEqual(don,don_)
115
116 #dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1000,1369)
117 #dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1000,1369)
118 #dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1000,1369)
119 #dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1000,1369)
120
121 #dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1000,1369)
122 #dna_,acc_,don_ = lt1.get_seq_and_scores(1,'-',1000,1369,'')
123
124 #self.assertEqual(dna,dna_)
125 #self.assertEqual(acc,acc_)
126 #self.assertEqual(don,don_)
127
128
129 #dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1000,1369)
130 #dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1000,1369)
131 #dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1000,1369)
132 #dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1000,1369)
133
134
135 def _testLyrataData(self):
136 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
137 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
138 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
139
140 g_fmt = 'contig%d.dna.flat'
141 s_fmt = 'contig_%d%s'
142
143 num_chromo = 1099
144
145 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
146 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
147
148 dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
149 dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1,1369)
150 dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1,1369)
151 dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1,1369)
152 dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1,1369)
153 dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1,1369)
154 dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1,1369)
155 dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1,1369)
156 dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1,1369)
157 dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1,1369)
158 dna,acc,don = seqInfo.get_seq_and_scores(45,'-',1,1369)
159 dna,acc,don = seqInfo.get_seq_and_scores(45,'+',1,1369)
160
161 print 'Finished'
162 #num_tests = 10
163 #for chromo in range(1,6):
164 # for strand in ['+','-']:
165 # for test_idx in range(num_tests):
166 # if strand == '-':
167 # size = seqInfo.chromo_sizes[chromo+7]
168 # else:
169 # size = seqInfo.chromo_sizes[chromo]
170 # begin = random.randint(0,size)
171 # end = random.randint(b,size)
172 # dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
173
174
175 def tearDown(self):
176 pass
177
178
179 class TestLookupTable(unittest.TestCase):
180
181 def setUp(self):
182 pass
183
184
185 def _testTableThalianaData(self):
186 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
187 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
188 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
189
190 g_fmt = 'chr%d.dna.flat'
191 s_fmt = 'contig_%d%s'
192
193 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,100))
194
195
196 def testTableLyrataData(self):
197 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
198 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
199 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
200
201 g_fmt = 'contig%d.dna.flat'
202 s_fmt = 'contig_%d%s'
203
204 lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(0,1099))
205
206
207 def tearDown(self):
208 pass
209
210
211 def check_wrapper():
212 g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
213 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
214 don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
215
216 g_fmt = 'contig%d.dna.flat'
217 s_fmt = 'contig_%d%s.Conf_cum'
218
219 test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
220
221 for idx in range(1,100):
222 pos = test.get_genomic_fragment_fn(idx,'+')
223 neg = test.get_genomic_fragment_fn(idx,'-')
224 print pos,neg
225 assert os.path.exists(pos)
226 assert os.path.exists(neg)
227
228 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
229 print acc_fn,don_fn
230 assert os.path.exists(acc_fn)
231 assert os.path.exists(don_fn)
232 acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
233 print acc_fn,don_fn
234 assert os.path.exists(acc_fn)
235 assert os.path.exists(don_fn)
236
237
238 if __name__ == '__main__':
239 suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
240 #suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
241 unittest.TextTestRunner(verbosity=2).run(suite)