+ removed load_genomic dependency
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
29 """
30 This function stems from Cheng Soon Ongs genome_utils package.
31 """
32
33 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
34 assert(len(start) == len(stop))
35 assert((start[1:]-stop[:-1]>0).all())
36 if strand == '+':
37 idx = xrange(len(start))
38 else:
39 idx = xrange(len(start)-1,-1,-1)
40
41 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
42 for ix in idx])
43 return seq
44
45 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
46 fname = genome + '/' + chromosome + '.dna.flat'
47 f=file(fname)
48 if one_based:
49 f.seek(start-1)
50 str=f.read(stop-start+1)
51 else:
52 f.seek(start)
53 str=f.read(stop-start)
54
55 if strand=='-':
56 return reverse_complement(str)
57 elif strand=='+':
58 return str
59 else:
60 print 'strand must be + or -'
61 raise KeyError
62
63
64 extended_alphabet = ['-','a','c','g','t','n','[',']']
65 alphabet = ['-','a','c','g','t','n']
66
67
68 class DataAccessionException:
69 pass
70
71
72 def get_flat_file_size(filename):
73 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
74 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
75 out,err = obj.communicate()
76
77 if err != '':
78 print 'Error occurred while trying to obtain file size'
79 raise DataAccessionException
80
81 return int(out)
82
83
84 def reverse_complement(seq):
85 """
86 This function takes a read in plain or bracket notation and returns the
87 reverse complement of it.
88 I.e.
89
90 est = cgt[ac][tg]a
91
92 leads to
93
94 rev_comp = t[ac][tg]acg
95 """
96
97 bpos = seq.find('[')
98 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
99
100 # check first whether seq contains no brackets at all
101 if bpos == -1:
102 ret_val = map(rc,seq)
103 ret_val.reverse()
104 ret_val = "".join(ret_val)
105 else:
106 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
107
108 # first_part is the part of the seq up to the first occurrence of a
109 # bracket
110 first_part = seq[:bpos]
111 first_part = map(rc,first_part)
112 first_part.reverse()
113 first_part = "".join(first_part)
114
115 # inside brackets has to be complemented but NOT reversed
116 inside_brackets = seq[bpos+1:bpos+3]
117 inside_brackets = "".join(map(rc,inside_brackets))
118
119 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
120
121 return ret_val
122
123
124 def unbracket_seq(seq):
125 """
126 This function takes a read in bracket notation and restores the read sequence from it.
127 I.e.
128
129 seq = cgt[ac][tg]aa
130
131 leads to
132
133 result = cgtcgaa
134
135 so the second entry within brackets is the base on the read whereas the first
136 entry is the base from the dna.
137 """
138
139 new_seq = ''
140 e = 0
141
142 while True:
143 if e >= len(seq):
144 break
145
146 if seq[e] == '[':
147 new_seq += seq[e+2]
148 e += 4
149 else:
150 new_seq += seq[e]
151 e += 1
152
153 return "".join(new_seq).lower()
154
155
156 def reconstruct_dna_seq(seq):
157 """
158
159 """
160
161 new_seq = ''
162 e = 0
163
164 while True:
165 if e >= len(seq):
166 break
167
168 if seq[e] == '[':
169 new_seq += seq[e+1]
170 e += 4
171 else:
172 new_seq += seq[e]
173 e += 1
174
175 return "".join(new_seq).lower()
176
177
178
179 def create_bracket_seq(dna_seq,read_seq):
180 """
181 This function takes a dna sequence and a read sequence and returns the
182 bracket format of the match/mismatches i.e.
183
184 dna : aaa
185 read: aac
186 is written in bracket notation: aa[ac]
187 """
188 assert len(dna_seq) == len(read_seq),pdb.set_trace()
189 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
190
191
192
193 def my_load_genomic(fname, strand, start, stop, one_based=True):
194
195 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
196 assert(len(start) == len(stop))
197 assert((start[1:]-stop[:-1]>0).all())
198 if strand == '+':
199 idx = xrange(len(start))
200 else:
201 idx = xrange(len(start)-1,-1,-1)
202
203 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
204 for ix in idx])
205 return seq
206
207 try:
208 f=file(fname)
209 if one_based:
210 f.seek(start-1)
211 str=f.read(stop-start+1)
212 else:
213 f.seek(start)
214 str=f.read(stop-start)
215
216 if strand=='-':
217 return reverse_complement(str)
218 elif strand=='+':
219 return str
220 else:
221 print 'strand must be + or -'
222 raise KeyError
223
224 except:
225 print fname,strand,start,stop
226
227
228 class DataAccessWrapper:
229 """
230 The purpose of this class is to wrap the genomic/splice site score data
231 access.
232 """
233
234 def __init__(self,settings):
235 self.genomic_dir = settings['genome_dir']
236 self.acc_score_dir = settings['acceptor_scores_loc']
237 self.don_score_dir = settings['donor_scores_loc']
238 self.genomic_fmt = settings['genome_file_fmt']
239 self.sscore_fmt = settings['splice_score_file_fmt']
240
241
242 def get_genomic_fragment_fn(self,id,strand):
243 if strand == '+':
244 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
245 elif strand == '-':
246 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
247 else:
248 assert False
249
250
251 def get_splice_site_scores_fn(self,id,strand):
252 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
253 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
254 return acc_fn,don_fn
255
256
257
258 class SeqSpliceInfo():
259 """
260
261 """
262
263 def __init__(self,accessWrapper,id_list):
264 self.accessWrapper = accessWrapper
265
266 self.chromo_sizes = {}
267
268 print "Fetching fragment sizes..."
269 for chromo in id_list:
270 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
271 total_size = get_flat_file_size(filename)
272 self.chromo_sizes[chromo] = total_size
273
274 print "done"
275
276
277 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
278 """
279 Now we want to use interval_query to get the predicted splice scores trained
280 on the TAIR sequence and annotation.
281 """
282
283 size = intervalEnd-intervalBegin
284 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
285
286 acc = size*[0.0]
287 don = size*[0.0]
288
289 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
290 pos_size = new_intp()
291 intp_assign(pos_size,1)
292
293 total_size = self.chromo_sizes[chromo]
294
295 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
296
297 # fetch acceptor scores
298 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
299
300 # fetch donor scores
301 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
302
303 return acc, don
304
305
306 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
307 #chromo_string = 'chr%d' % chromo
308 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
309 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
310 #print filename,genomicSeq_start,genomicSeq_stop
311 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
312 genomicSeq = genomicSeq.strip().lower()
313
314 # check the obtained dna sequence
315 assert genomicSeq != '', 'load_genomic returned empty sequence!'
316
317 # all entries other than a c g t are set to n
318 no_base = re.compile('[^acgt]')
319 genomicSeq = no_base.sub('n',genomicSeq)
320
321 return genomicSeq
322
323
324 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
325 """
326 This function expects an interval, chromosome and strand information and
327 returns then the genomic sequence of this interval and the associated scores.
328 """
329 # Because splice site scores are predicted using a window of the sequence the
330 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
331 # do not have any score predictions
332 NO_SCORE_WINDOW_SIZE = 205
333
334 total_size = self.chromo_sizes[chromo]
335
336 #print genomicSeq_start,genomicSeq_stop
337
338 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
339
340 if strand == '+':
341 s_start = genomicSeq_start - 1
342 s_stop = genomicSeq_stop - 1
343
344 genomicSeq_start -= 1
345 genomicSeq_stop -= 1
346
347 if strand == '-':
348 s_start = total_size - genomicSeq_stop + 1
349 s_stop = total_size - genomicSeq_start + 1
350
351 assert genomicSeq_start < genomicSeq_stop
352
353 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
354
355 if only_seq:
356 return genomicSeq
357
358 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
359 lookup_offset_begin = 10
360 lookup_offset_end = 10
361 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
362 if intervalBegin == 0:
363 lookup_offset_begin = 0
364
365 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
366 if intervalEnd == total_size-1:
367 lookup_offset_end = 0
368
369 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
370
371 if strand == '-':
372 currentAcc = currentAcc[1:]
373 #currentDon = currentDon[1:]
374 #currentDon = [-inf]+currentDon[:-1]
375 else:
376 currentAcc = currentAcc[2:]
377 currentDon = currentDon[1:]
378
379 # remove the offset positions
380 currentAcc = currentAcc[lookup_offset_begin:]
381 currentDon = currentDon[lookup_offset_begin:]
382
383 currentAcc = currentAcc[:len(genomicSeq)]
384 currentDon = currentDon[:len(genomicSeq)]
385
386 length = len(genomicSeq)
387 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
388 currentDon = currentDon+[-inf]*(length-len(currentDon))
389 currentDon[-1] = -inf
390
391 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
392 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
393
394 for pos in ag_tuple_pos:
395 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
396 currentAcc[pos] = 1e-6
397
398 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
399 currentAcc[pos] = 1e-6
400
401 for pos in gt_tuple_pos:
402 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
403 currentDon[pos] = 1e-6
404
405 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
406 currentDon[pos] = 1e-6
407
408 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
409 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
410
411 if perform_checks and not ag_tuple_pos == acc_pos:
412 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
413 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
414 print acc_pos[:10],acc_pos[-10:]
415 pdb.set_trace()
416
417 if perform_checks and not gt_tuple_pos == don_pos:
418 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
419 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
420 print don_pos[:10],don_pos[-10:]
421 pdb.set_trace()
422
423 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
424
425 return genomicSeq, currentAcc, currentDon