+ changed sequence utils to support more general genomic/splice dirs and filenames
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import os.path
13 import pdb
14 import re
15 import subprocess
16 import numpy
17
18 from numpy.matlib import inf
19
20 from Genefinding import *
21 from genome_utils import load_genomic
22
23 extended_alphabet = ['-','a','c','g','t','n','[',']']
24 alphabet = ['-','a','c','g','t','n']
25
26
27 class DataAccessionException:
28 pass
29
30
31 def get_flat_file_size(filename):
32 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
33 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
34 out,err = obj.communicate()
35
36 if err != '':
37 print 'Error occurred while trying to obtain file size'
38 raise DataAccessionException
39
40 return int(out)
41
42
43 def reverse_complement(seq):
44 """
45 This function takes a read in plain or bracket notation and returns the
46 reverse complement of it.
47 I.e.
48
49 est = cgt[ac][tg]a
50
51 leads to
52
53 rev_comp = t[ac][tg]acg
54 """
55
56 bpos = seq.find('[')
57 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
58
59 # check first whether seq contains no brackets at all
60 if bpos == -1:
61 ret_val = map(rc,seq)
62 ret_val.reverse()
63 ret_val = "".join(ret_val)
64 else:
65 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
66
67 # first_part is the part of the seq up to the first occurrence of a
68 # bracket
69 first_part = seq[:bpos]
70 first_part = map(rc,first_part)
71 first_part.reverse()
72 first_part = "".join(first_part)
73
74 # inside brackets has to be complemented but NOT reversed
75 inside_brackets = seq[bpos+1:bpos+3]
76 inside_brackets = "".join(map(rc,inside_brackets))
77
78 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
79
80 return ret_val
81
82
83 def unbracket_seq(seq):
84 """
85 This function takes a read in bracket notation and restores the read sequence from it.
86 I.e.
87
88 seq = cgt[ac][tg]aa
89
90 leads to
91
92 result = cgtcgaa
93
94 so the second entry within brackets is the base on the read whereas the first
95 entry is the base from the dna.
96 """
97
98 new_seq = ''
99 e = 0
100
101 while True:
102 if e >= len(seq):
103 break
104
105 if seq[e] == '[':
106 new_seq += seq[e+2]
107 e += 4
108 else:
109 new_seq += seq[e]
110 e += 1
111
112 return "".join(new_seq).lower()
113
114
115 def create_bracket_seq(dna_seq,read_seq):
116 """
117 This function takes a dna sequence and a read sequence and returns the
118 bracket format of the match/mismatches i.e.
119
120 dna : aaa
121 read: aac
122 is written in bracket notation: aa[ac]
123 """
124 assert len(dna_seq) == len(read_seq),pdb.set_trace()
125 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
126
127
128
129 def my_load_genomic(fname, strand, start, stop, one_based=True):
130 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
131 assert(len(start) == len(stop))
132 assert((start[1:]-stop[:-1]>0).all())
133 if strand == '+':
134 idx = xrange(len(start))
135 else:
136 idx = xrange(len(start)-1,-1,-1)
137
138 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
139 for ix in idx])
140 return seq
141
142 f=file(fname)
143 if one_based:
144 f.seek(start-1)
145 str=f.read(stop-start+1)
146 else:
147 f.seek(start)
148 str=f.read(stop-start)
149
150 if strand=='-':
151 return reverse_complement(str)
152 elif strand=='+':
153 return str
154 else:
155 print 'strand must be + or -'
156 raise KeyError
157
158
159 class DataAccessWrapper:
160 """
161 The purpose of this class is to wrap the genomic/splice site score data
162 access.
163
164
165
166 """
167
168
169 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
170 self.genomic_dir = genomic_data_dir
171 self.acc_score_dir = acc_score_dir
172 self.don_score_dir = don_score_dir
173 self.genomic_fmt = gen_fmt
174 self.sscore_fmt = sscore_fmt
175
176 assert os.path.isdir(genomic_data_dir)
177 assert os.path.isdir(acc_score_dir)
178 assert os.path.isdir(don_score_dir)
179
180
181 def get_genomic_fragment_fn(self,id,strand):
182 if strand == '+':
183 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
184 elif strand == '-':
185 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
186 else:
187 assert False
188
189
190 def get_splice_site_scores_fn(self,id,strand):
191 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
192 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
193 return acc_fn,don_fn
194
195
196
197 class SeqSpliceInfo():
198 """
199
200 """
201
202 def __init__(self,accessWrapper,id_list):
203 self.accessWrapper = accessWrapper
204
205 self.chromo_sizes = {}
206
207 print "Fetching fragment sizes..."
208 for chromo in id_list:
209 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
210 total_size = get_flat_file_size(filename)
211 self.chromo_sizes[chromo] = total_size
212
213 print "done"
214
215
216 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
217 """
218 Now we want to use interval_query to get the predicted splice scores trained
219 on the TAIR sequence and annotation.
220 """
221
222 size = intervalEnd-intervalBegin
223 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
224
225 acc = size*[0.0]
226 don = size*[0.0]
227
228 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
229 pos_size = new_intp()
230 intp_assign(pos_size,1)
231
232 total_size = self.chromo_sizes[chromo]
233
234 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
235
236 # fetch acceptor scores
237 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
238
239 # fetch donor scores
240 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
241
242 return acc, don
243
244
245 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
246 #chromo_string = 'chr%d' % chromo
247 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
248 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
249 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
250 genomicSeq = genomicSeq.strip().lower()
251
252 # check the obtained dna sequence
253 assert genomicSeq != '', 'load_genomic returned empty sequence!'
254
255 # all entries other than a c g t are set to n
256 no_base = re.compile('[^acgt]')
257 genomicSeq = no_base.sub('n',genomicSeq)
258
259 return genomicSeq
260
261 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
262 """
263 This function expects an interval, chromosome and strand information and
264 returns then the genomic sequence of this interval and the associated scores.
265 """
266 # Because splice site scores are predicted using a window of the sequence the
267 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
268 # do not have any score predictions
269 NO_SCORE_WINDOW_SIZE = 205
270
271 assert genomicSeq_start < genomicSeq_stop
272
273 genomicSeq = self.fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop)
274
275 if only_seq:
276 return genomicSeq
277
278 total_size = self.chromo_sizes[chromo]
279
280 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
281 lookup_offset_begin = 10
282 lookup_offset_end = 10
283 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
284 if intervalBegin == 0:
285 lookup_offset_begin = 0
286
287 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
288 if intervalEnd == total_size-1:
289 lookup_offset_end = 0
290
291 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
292
293 original_acc = list(currentAcc)
294
295 if strand == '-':
296 currentDon = [-inf]+currentDon[:-1]
297 else:
298 currentAcc = currentAcc[2:]
299 currentDon = currentDon[1:]
300
301 # remove the offset positions
302 currentAcc = currentAcc[lookup_offset_begin:]
303 currentDon = currentDon[lookup_offset_begin:]
304
305 currentAcc = currentAcc[:len(genomicSeq)]
306 currentDon = currentDon[:len(genomicSeq)]
307
308 length = len(genomicSeq)
309 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
310 currentDon = currentDon+[-inf]*(length-len(currentDon))
311 currentDon[-1] = -inf
312
313 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
314 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
315
316 #pdb.set_trace()
317 #return genomicSeq, currentAcc, currentDon
318
319 for pos in ag_tuple_pos:
320 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
321 currentAcc[pos] = 1e-6
322
323 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
324 currentAcc[pos] = 1e-6
325
326 for pos in gt_tuple_pos:
327 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
328 currentDon[pos] = 1e-6
329
330 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
331 currentDon[pos] = 1e-6
332
333 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
334 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
335
336 if not ag_tuple_pos == acc_pos:
337 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
338 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
339 print acc_pos[:10],acc_pos[-10:]
340 pdb.set_trace()
341
342 if not gt_tuple_pos == don_pos:
343 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
344 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
345 print don_pos[:10],don_pos[-10:]
346 pdb.set_trace()
347
348 #assert ag_tuple_pos == acc_pos, pdb.set_trace()
349 #assert gt_tuple_pos == don_pos, pdb.set_trace()
350 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
351
352 return genomicSeq, currentAcc, currentDon