+ got rid of some legacy code
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import doIntervalQuery,new_intp,intp_assign
29
30 from QPalmaDP import createIntArrayFromList
31
32
33 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
34 """
35 This function stems from Cheng Soon Ongs genome_utils package.
36 """
37
38 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
39 assert(len(start) == len(stop))
40 assert((start[1:]-stop[:-1]>0).all())
41 if strand == '+':
42 idx = xrange(len(start))
43 else:
44 idx = xrange(len(start)-1,-1,-1)
45
46 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
47 for ix in idx])
48 return seq
49
50 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
51 fname = genome + '/' + chromosome + '.dna.flat'
52 f=file(fname)
53 if one_based:
54 f.seek(start-1)
55 str=f.read(stop-start+1)
56 else:
57 f.seek(start)
58 str=f.read(stop-start)
59
60 if strand=='-':
61 return reverse_complement(str)
62 elif strand=='+':
63 return str
64 else:
65 print 'strand must be + or -'
66 raise KeyError
67
68
69 extended_alphabet = ['-','a','c','g','t','n','[',']']
70 alphabet = ['-','a','c','g','t','n']
71
72
73 class DataAccessionException:
74 pass
75
76
77 def get_flat_file_size(filename):
78 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
79 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
80 out,err = obj.communicate()
81
82 if err != '':
83 print 'Error occurred while trying to obtain file size'
84 raise DataAccessionException
85
86 return int(out)
87
88
89 def reverse_complement(seq):
90 """
91 This function takes a read in plain or bracket notation and returns the
92 reverse complement of it.
93 I.e.
94
95 est = cgt[ac][tg]a
96
97 leads to
98
99 rev_comp = t[ac][tg]acg
100 """
101
102 bpos = seq.find('[')
103 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
104
105 # check first whether seq contains no brackets at all
106 if bpos == -1:
107 ret_val = map(rc,seq)
108 ret_val.reverse()
109 ret_val = "".join(ret_val)
110 else:
111 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
112
113 # first_part is the part of the seq up to the first occurrence of a
114 # bracket
115 first_part = seq[:bpos]
116 first_part = map(rc,first_part)
117 first_part.reverse()
118 first_part = "".join(first_part)
119
120 # inside brackets has to be complemented but NOT reversed
121 inside_brackets = seq[bpos+1:bpos+3]
122 inside_brackets = "".join(map(rc,inside_brackets))
123
124 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
125
126 return ret_val
127
128
129 def unbracket_seq(seq):
130 """
131 This function takes a read in bracket notation and restores the read sequence from it.
132 I.e.
133
134 seq = cgt[ac][tg]aa
135
136 leads to
137
138 result = cgtcgaa
139
140 so the second entry within brackets is the base on the read whereas the first
141 entry is the base from the dna.
142 """
143
144 new_seq = ''
145 e = 0
146
147 while True:
148 if e >= len(seq):
149 break
150
151 if seq[e] == '[':
152 new_seq += seq[e+2]
153 e += 4
154 else:
155 new_seq += seq[e]
156 e += 1
157
158 return "".join(new_seq).lower()
159
160
161 def reconstruct_dna_seq(seq):
162 """
163
164 """
165
166 new_seq = ''
167 e = 0
168
169 while True:
170 if e >= len(seq):
171 break
172
173 if seq[e] == '[':
174 new_seq += seq[e+1]
175 e += 4
176 else:
177 new_seq += seq[e]
178 e += 1
179
180 return "".join(new_seq).lower()
181
182
183
184 def create_bracket_seq(dna_seq,read_seq):
185 """
186 This function takes a dna sequence and a read sequence and returns the
187 bracket format of the match/mismatches i.e.
188
189 dna : aaa
190 read: aac
191 is written in bracket notation: aa[ac]
192 """
193 assert len(dna_seq) == len(read_seq),pdb.set_trace()
194 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
195
196
197
198 def my_load_genomic(fname, strand, start, stop, one_based=True):
199
200 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
201 assert(len(start) == len(stop))
202 assert((start[1:]-stop[:-1]>0).all())
203 if strand == '+':
204 idx = xrange(len(start))
205 else:
206 idx = xrange(len(start)-1,-1,-1)
207
208 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
209 for ix in idx])
210 return seq
211
212 try:
213 f=file(fname)
214 if one_based:
215 f.seek(start-1)
216 str=f.read(stop-start+1)
217 else:
218 f.seek(start)
219 str=f.read(stop-start)
220
221 if strand=='-':
222 return reverse_complement(str)
223 elif strand=='+':
224 return str
225 else:
226 print 'strand must be + or -'
227 raise KeyError
228
229 except:
230 print fname,strand,start,stop
231
232
233 class DataAccessWrapper:
234 """
235 The purpose of this class is to wrap the genomic/splice site score data
236 access.
237 """
238
239 def __init__(self,settings):
240 self.genomic_dir = settings['genome_dir']
241 self.acc_score_dir = settings['acceptor_scores_loc']
242 self.don_score_dir = settings['donor_scores_loc']
243 self.genomic_fmt = settings['genome_file_fmt']
244 self.sscore_fmt = settings['splice_score_file_fmt']
245
246
247 def get_genomic_fragment_fn(self,id,strand):
248 if strand == '+':
249 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
250 elif strand == '-':
251 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
252 else:
253 assert False
254
255
256 def get_splice_site_scores_fn(self,id,strand):
257 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
258 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
259 return acc_fn,don_fn
260
261
262
263 class SeqSpliceInfo():
264 """
265
266 """
267
268 def __init__(self,accessWrapper,id_list):
269 self.accessWrapper = accessWrapper
270
271 self.chromo_sizes = {}
272
273 print "Fetching fragment sizes..."
274 for chromo in id_list:
275 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
276 total_size = get_flat_file_size(filename)
277 self.chromo_sizes[chromo] = total_size
278
279 print "done"
280
281
282 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
283 """
284 Now we want to use interval_query to get the predicted splice scores trained
285 on the TAIR sequence and annotation.
286 """
287
288 size = intervalEnd-intervalBegin
289 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
290
291 acc = size*[0.0]
292 don = size*[0.0]
293
294 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
295 pos_size = new_intp()
296 intp_assign(pos_size,1)
297
298 total_size = self.chromo_sizes[chromo]
299
300 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
301
302 # fetch acceptor scores
303 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
304
305 # fetch donor scores
306 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
307
308 return acc, don
309
310
311 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
312 #chromo_string = 'chr%d' % chromo
313 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
314 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
315 #print filename,genomicSeq_start,genomicSeq_stop
316 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
317 genomicSeq = genomicSeq.strip().lower()
318
319 # check the obtained dna sequence
320 assert genomicSeq != '', 'load_genomic returned empty sequence!'
321
322 # all entries other than a c g t are set to n
323 no_base = re.compile('[^acgt]')
324 genomicSeq = no_base.sub('n',genomicSeq)
325
326 return genomicSeq
327
328
329 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
330 """
331 This function expects an interval, chromosome and strand information and
332 returns then the genomic sequence of this interval and the associated scores.
333 """
334 # Because splice site scores are predicted using a window of the sequence the
335 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
336 # do not have any score predictions
337 NO_SCORE_WINDOW_SIZE = 205
338
339 total_size = self.chromo_sizes[chromo]
340
341 #print genomicSeq_start,genomicSeq_stop
342
343 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
344
345 if strand == '+':
346 s_start = genomicSeq_start - 1
347 s_stop = genomicSeq_stop - 1
348
349 genomicSeq_start -= 1
350 genomicSeq_stop -= 1
351
352 if strand == '-':
353 s_start = total_size - genomicSeq_stop + 1
354 s_stop = total_size - genomicSeq_start + 1
355
356 assert genomicSeq_start < genomicSeq_stop
357
358 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
359
360 if only_seq:
361 return genomicSeq
362
363 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
364 lookup_offset_begin = 10
365 lookup_offset_end = 10
366 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
367 if intervalBegin == 0:
368 lookup_offset_begin = 0
369
370 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
371 if intervalEnd == total_size-1:
372 lookup_offset_end = 0
373
374 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
375
376 if strand == '-':
377 currentAcc = currentAcc[1:]
378 #currentDon = currentDon[1:]
379 #currentDon = [-inf]+currentDon[:-1]
380 else:
381 currentAcc = currentAcc[2:]
382 currentDon = currentDon[1:]
383
384 # remove the offset positions
385 currentAcc = currentAcc[lookup_offset_begin:]
386 currentDon = currentDon[lookup_offset_begin:]
387
388 currentAcc = currentAcc[:len(genomicSeq)]
389 currentDon = currentDon[:len(genomicSeq)]
390
391 length = len(genomicSeq)
392 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
393 currentDon = currentDon+[-inf]*(length-len(currentDon))
394 currentDon[-1] = -inf
395
396 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
397 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
398
399 for pos in ag_tuple_pos:
400 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
401 currentAcc[pos] = 1e-6
402
403 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
404 currentAcc[pos] = 1e-6
405
406 for pos in gt_tuple_pos:
407 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
408 currentDon[pos] = 1e-6
409
410 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
411 currentDon[pos] = 1e-6
412
413 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
414 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
415
416 if perform_checks and not ag_tuple_pos == acc_pos:
417 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
418 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
419 print acc_pos[:10],acc_pos[-10:]
420 pdb.set_trace()
421
422 if perform_checks and not gt_tuple_pos == don_pos:
423 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
424 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
425 print don_pos[:10],don_pos[-10:]
426 pdb.set_trace()
427
428 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
429
430 return genomicSeq, currentAcc, currentDon