+ minor modifications for the prefetching the whole negative strand
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import os.path
13 import pdb
14 import re
15 import subprocess
16 import numpy
17
18 from numpy.matlib import inf
19
20 from Genefinding import *
21 from genome_utils import load_genomic
22
23 extended_alphabet = ['-','a','c','g','t','n','[',']']
24 alphabet = ['-','a','c','g','t','n']
25
26
27 class DataAccessionException:
28 pass
29
30
31 def get_flat_file_size(filename):
32 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
33 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
34 out,err = obj.communicate()
35
36 if err != '':
37 print 'Error occurred while trying to obtain file size'
38 raise DataAccessionException
39
40 return int(out)
41
42
43 def reverse_complement(seq):
44 """
45 This function takes a read in plain or bracket notation and returns the
46 reverse complement of it.
47 I.e.
48
49 est = cgt[ac][tg]a
50
51 leads to
52
53 rev_comp = t[ac][tg]acg
54 """
55
56 bpos = seq.find('[')
57 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
58
59 # check first whether seq contains no brackets at all
60 if bpos == -1:
61 ret_val = map(rc,seq)
62 ret_val.reverse()
63 ret_val = "".join(ret_val)
64 else:
65 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
66
67 # first_part is the part of the seq up to the first occurrence of a
68 # bracket
69 first_part = seq[:bpos]
70 first_part = map(rc,first_part)
71 first_part.reverse()
72 first_part = "".join(first_part)
73
74 # inside brackets has to be complemented but NOT reversed
75 inside_brackets = seq[bpos+1:bpos+3]
76 inside_brackets = "".join(map(rc,inside_brackets))
77
78 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
79
80 return ret_val
81
82
83 def unbracket_seq(seq):
84 """
85 This function takes a read in bracket notation and restores the read sequence from it.
86 I.e.
87
88 seq = cgt[ac][tg]aa
89
90 leads to
91
92 result = cgtcgaa
93
94 so the second entry within brackets is the base on the read whereas the first
95 entry is the base from the dna.
96 """
97
98 new_seq = ''
99 e = 0
100
101 while True:
102 if e >= len(seq):
103 break
104
105 if seq[e] == '[':
106 new_seq += seq[e+2]
107 e += 4
108 else:
109 new_seq += seq[e]
110 e += 1
111
112 return "".join(new_seq).lower()
113
114
115 def create_bracket_seq(dna_seq,read_seq):
116 """
117 This function takes a dna sequence and a read sequence and returns the
118 bracket format of the match/mismatches i.e.
119
120 dna : aaa
121 read: aac
122 is written in bracket notation: aa[ac]
123 """
124 assert len(dna_seq) == len(read_seq),pdb.set_trace()
125 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
126
127
128
129 def my_load_genomic(fname, strand, start, stop, one_based=True):
130 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
131 assert(len(start) == len(stop))
132 assert((start[1:]-stop[:-1]>0).all())
133 if strand == '+':
134 idx = xrange(len(start))
135 else:
136 idx = xrange(len(start)-1,-1,-1)
137
138 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
139 for ix in idx])
140 return seq
141
142 f=file(fname)
143 if one_based:
144 f.seek(start-1)
145 str=f.read(stop-start+1)
146 else:
147 f.seek(start)
148 str=f.read(stop-start)
149
150 if strand=='-':
151 return reverse_complement(str)
152 elif strand=='+':
153 return str
154 else:
155 print 'strand must be + or -'
156 raise KeyError
157
158
159 class DataAccessWrapper:
160 """
161 The purpose of this class is to wrap the genomic/splice site score data
162 access.
163
164
165
166 """
167
168
169 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
170 self.genomic_dir = genomic_data_dir
171 self.acc_score_dir = acc_score_dir
172 self.don_score_dir = don_score_dir
173 self.genomic_fmt = gen_fmt
174 self.sscore_fmt = sscore_fmt
175
176 assert os.path.isdir(genomic_data_dir)
177 assert os.path.isdir(acc_score_dir)
178 assert os.path.isdir(don_score_dir)
179
180
181 def get_genomic_fragment_fn(self,id,strand):
182 if strand == '+':
183 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
184 elif strand == '-':
185 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
186 else:
187 assert False
188
189
190 def get_splice_site_scores_fn(self,id,strand):
191 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
192 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
193 return acc_fn,don_fn
194
195
196
197 class SeqSpliceInfo():
198 """
199
200 """
201
202 def __init__(self,accessWrapper,id_list):
203 self.accessWrapper = accessWrapper
204
205 self.chromo_sizes = {}
206
207 print "Fetching fragment sizes..."
208 for chromo in id_list:
209 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
210 total_size = get_flat_file_size(filename)
211 self.chromo_sizes[chromo] = total_size
212
213 print "done"
214
215
216 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
217 """
218 Now we want to use interval_query to get the predicted splice scores trained
219 on the TAIR sequence and annotation.
220 """
221
222 size = intervalEnd-intervalBegin
223 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
224
225 acc = size*[0.0]
226 don = size*[0.0]
227
228 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
229 pos_size = new_intp()
230 intp_assign(pos_size,1)
231
232 total_size = self.chromo_sizes[chromo]
233
234 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
235
236 # fetch acceptor scores
237 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
238
239 # fetch donor scores
240 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
241
242 return acc, don
243
244
245 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
246 #chromo_string = 'chr%d' % chromo
247 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
248 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
249 #print filename,genomicSeq_start,genomicSeq_stop
250 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
251 genomicSeq = genomicSeq.strip().lower()
252
253 # check the obtained dna sequence
254 assert genomicSeq != '', 'load_genomic returned empty sequence!'
255
256 # all entries other than a c g t are set to n
257 no_base = re.compile('[^acgt]')
258 genomicSeq = no_base.sub('n',genomicSeq)
259
260 return genomicSeq
261
262
263 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
264 """
265 This function expects an interval, chromosome and strand information and
266 returns then the genomic sequence of this interval and the associated scores.
267 """
268 # Because splice site scores are predicted using a window of the sequence the
269 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
270 # do not have any score predictions
271 NO_SCORE_WINDOW_SIZE = 205
272
273 total_size = self.chromo_sizes[chromo]
274
275 #print genomicSeq_start,genomicSeq_stop
276
277 assert genomicSeq_start < genomicSeq_stop
278
279 if strand == '+':
280 s_start = genomicSeq_start - 1
281 s_stop = genomicSeq_stop - 1
282
283 genomicSeq_start -= 1
284 genomicSeq_stop -= 1
285
286 if strand == '-':
287 s_start = total_size - genomicSeq_stop + 1
288 s_stop = total_size - genomicSeq_start + 1
289
290 assert genomicSeq_start < genomicSeq_stop
291
292 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
293
294 if only_seq:
295 return genomicSeq
296
297 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
298 lookup_offset_begin = 10
299 lookup_offset_end = 10
300 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
301 if intervalBegin == 0:
302 lookup_offset_begin = 0
303
304 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
305 if intervalEnd == total_size-1:
306 lookup_offset_end = 0
307
308 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
309
310 if strand == '-':
311 currentAcc = currentAcc[1:]
312 #currentDon = currentDon[1:]
313 #currentDon = [-inf]+currentDon[:-1]
314 else:
315 currentAcc = currentAcc[2:]
316 currentDon = currentDon[1:]
317
318 # remove the offset positions
319 currentAcc = currentAcc[lookup_offset_begin:]
320 currentDon = currentDon[lookup_offset_begin:]
321
322 currentAcc = currentAcc[:len(genomicSeq)]
323 currentDon = currentDon[:len(genomicSeq)]
324
325 length = len(genomicSeq)
326 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
327 currentDon = currentDon+[-inf]*(length-len(currentDon))
328 currentDon[-1] = -inf
329
330 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
331 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
332
333 for pos in ag_tuple_pos:
334 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
335 currentAcc[pos] = 1e-6
336
337 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
338 currentAcc[pos] = 1e-6
339
340 for pos in gt_tuple_pos:
341 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
342 currentDon[pos] = 1e-6
343
344 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
345 currentDon[pos] = 1e-6
346
347 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
348 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
349
350 if perform_checks and not ag_tuple_pos == acc_pos:
351 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
352 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
353 print acc_pos[:10],acc_pos[-10:]
354 pdb.set_trace()
355
356 if perform_checks and not gt_tuple_pos == don_pos:
357 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
358 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
359 print don_pos[:10],don_pos[-10:]
360 pdb.set_trace()
361
362 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
363
364 return genomicSeq, currentAcc, currentDon