+ renamed dyn_prog directory
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import doIntervalQuery
29
30
31 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
32 """
33 This function stems from Cheng Soon Ongs genome_utils package.
34 """
35
36 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
37 assert(len(start) == len(stop))
38 assert((start[1:]-stop[:-1]>0).all())
39 if strand == '+':
40 idx = xrange(len(start))
41 else:
42 idx = xrange(len(start)-1,-1,-1)
43
44 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
45 for ix in idx])
46 return seq
47
48 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
49 fname = genome + '/' + chromosome + '.dna.flat'
50 f=file(fname)
51 if one_based:
52 f.seek(start-1)
53 str=f.read(stop-start+1)
54 else:
55 f.seek(start)
56 str=f.read(stop-start)
57
58 if strand=='-':
59 return reverse_complement(str)
60 elif strand=='+':
61 return str
62 else:
63 print 'strand must be + or -'
64 raise KeyError
65
66
67 extended_alphabet = ['-','a','c','g','t','n','[',']']
68 alphabet = ['-','a','c','g','t','n']
69
70
71 class DataAccessionException:
72 pass
73
74
75 def get_flat_file_size(filename):
76 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
77 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
78 out,err = obj.communicate()
79
80 if err != '':
81 print 'Error occurred while trying to obtain file size'
82 raise DataAccessionException
83
84 return int(out)
85
86
87 def reverse_complement(seq):
88 """
89 This function takes a read in plain or bracket notation and returns the
90 reverse complement of it.
91 I.e.
92
93 est = cgt[ac][tg]a
94
95 leads to
96
97 rev_comp = t[ac][tg]acg
98 """
99
100 bpos = seq.find('[')
101 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
102
103 # check first whether seq contains no brackets at all
104 if bpos == -1:
105 ret_val = map(rc,seq)
106 ret_val.reverse()
107 ret_val = "".join(ret_val)
108 else:
109 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
110
111 # first_part is the part of the seq up to the first occurrence of a
112 # bracket
113 first_part = seq[:bpos]
114 first_part = map(rc,first_part)
115 first_part.reverse()
116 first_part = "".join(first_part)
117
118 # inside brackets has to be complemented but NOT reversed
119 inside_brackets = seq[bpos+1:bpos+3]
120 inside_brackets = "".join(map(rc,inside_brackets))
121
122 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
123
124 return ret_val
125
126
127 def unbracket_seq(seq):
128 """
129 This function takes a read in bracket notation and restores the read sequence from it.
130 I.e.
131
132 seq = cgt[ac][tg]aa
133
134 leads to
135
136 result = cgtcgaa
137
138 so the second entry within brackets is the base on the read whereas the first
139 entry is the base from the dna.
140 """
141
142 new_seq = ''
143 e = 0
144
145 while True:
146 if e >= len(seq):
147 break
148
149 if seq[e] == '[':
150 new_seq += seq[e+2]
151 e += 4
152 else:
153 new_seq += seq[e]
154 e += 1
155
156 return "".join(new_seq).lower()
157
158
159 def reconstruct_dna_seq(seq):
160 """
161
162 """
163
164 new_seq = ''
165 e = 0
166
167 while True:
168 if e >= len(seq):
169 break
170
171 if seq[e] == '[':
172 new_seq += seq[e+1]
173 e += 4
174 else:
175 new_seq += seq[e]
176 e += 1
177
178 return "".join(new_seq).lower()
179
180
181
182 def create_bracket_seq(dna_seq,read_seq):
183 """
184 This function takes a dna sequence and a read sequence and returns the
185 bracket format of the match/mismatches i.e.
186
187 dna : aaa
188 read: aac
189 is written in bracket notation: aa[ac]
190 """
191 assert len(dna_seq) == len(read_seq),pdb.set_trace()
192 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
193
194
195
196 def my_load_genomic(fname, strand, start, stop, one_based=True):
197
198 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
199 assert(len(start) == len(stop))
200 assert((start[1:]-stop[:-1]>0).all())
201 if strand == '+':
202 idx = xrange(len(start))
203 else:
204 idx = xrange(len(start)-1,-1,-1)
205
206 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
207 for ix in idx])
208 return seq
209
210 try:
211 f=file(fname)
212 if one_based:
213 f.seek(start-1)
214 str=f.read(stop-start+1)
215 else:
216 f.seek(start)
217 str=f.read(stop-start)
218
219 if strand=='-':
220 return reverse_complement(str)
221 elif strand=='+':
222 return str
223 else:
224 print 'strand must be + or -'
225 raise KeyError
226
227 except:
228 print fname,strand,start,stop
229
230
231 class DataAccessWrapper:
232 """
233 The purpose of this class is to wrap the genomic/splice site score data
234 access.
235 """
236
237 def __init__(self,settings):
238 self.genomic_dir = settings['genome_dir']
239 self.acc_score_dir = settings['acceptor_scores_loc']
240 self.don_score_dir = settings['donor_scores_loc']
241 self.genomic_fmt = settings['genome_file_fmt']
242 self.sscore_fmt = settings['splice_score_file_fmt']
243
244
245 def get_genomic_fragment_fn(self,id,strand):
246 if strand == '+':
247 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
248 elif strand == '-':
249 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
250 else:
251 assert False
252
253
254 def get_splice_site_scores_fn(self,id,strand):
255 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
256 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
257 return acc_fn,don_fn
258
259
260
261 class SeqSpliceInfo():
262 """
263
264 """
265
266 def __init__(self,accessWrapper,id_list):
267 self.accessWrapper = accessWrapper
268
269 self.chromo_sizes = {}
270
271 print "Fetching fragment sizes..."
272 for chromo in id_list:
273 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
274 total_size = get_flat_file_size(filename)
275 self.chromo_sizes[chromo] = total_size
276
277 print "done"
278
279
280 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
281 """
282 Now we want to use interval_query to get the predicted splice scores trained
283 on the TAIR sequence and annotation.
284 """
285
286 size = intervalEnd-intervalBegin
287 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
288
289 acc = size*[0.0]
290 don = size*[0.0]
291
292 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
293 pos_size = new_intp()
294 intp_assign(pos_size,1)
295
296 total_size = self.chromo_sizes[chromo]
297
298 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
299
300 # fetch acceptor scores
301 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
302
303 # fetch donor scores
304 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
305
306 return acc, don
307
308
309 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
310 #chromo_string = 'chr%d' % chromo
311 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
312 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
313 #print filename,genomicSeq_start,genomicSeq_stop
314 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
315 genomicSeq = genomicSeq.strip().lower()
316
317 # check the obtained dna sequence
318 assert genomicSeq != '', 'load_genomic returned empty sequence!'
319
320 # all entries other than a c g t are set to n
321 no_base = re.compile('[^acgt]')
322 genomicSeq = no_base.sub('n',genomicSeq)
323
324 return genomicSeq
325
326
327 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
328 """
329 This function expects an interval, chromosome and strand information and
330 returns then the genomic sequence of this interval and the associated scores.
331 """
332 # Because splice site scores are predicted using a window of the sequence the
333 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
334 # do not have any score predictions
335 NO_SCORE_WINDOW_SIZE = 205
336
337 total_size = self.chromo_sizes[chromo]
338
339 #print genomicSeq_start,genomicSeq_stop
340
341 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
342
343 if strand == '+':
344 s_start = genomicSeq_start - 1
345 s_stop = genomicSeq_stop - 1
346
347 genomicSeq_start -= 1
348 genomicSeq_stop -= 1
349
350 if strand == '-':
351 s_start = total_size - genomicSeq_stop + 1
352 s_stop = total_size - genomicSeq_start + 1
353
354 assert genomicSeq_start < genomicSeq_stop
355
356 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
357
358 if only_seq:
359 return genomicSeq
360
361 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
362 lookup_offset_begin = 10
363 lookup_offset_end = 10
364 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
365 if intervalBegin == 0:
366 lookup_offset_begin = 0
367
368 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
369 if intervalEnd == total_size-1:
370 lookup_offset_end = 0
371
372 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
373
374 if strand == '-':
375 currentAcc = currentAcc[1:]
376 #currentDon = currentDon[1:]
377 #currentDon = [-inf]+currentDon[:-1]
378 else:
379 currentAcc = currentAcc[2:]
380 currentDon = currentDon[1:]
381
382 # remove the offset positions
383 currentAcc = currentAcc[lookup_offset_begin:]
384 currentDon = currentDon[lookup_offset_begin:]
385
386 currentAcc = currentAcc[:len(genomicSeq)]
387 currentDon = currentDon[:len(genomicSeq)]
388
389 length = len(genomicSeq)
390 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
391 currentDon = currentDon+[-inf]*(length-len(currentDon))
392 currentDon[-1] = -inf
393
394 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
395 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
396
397 for pos in ag_tuple_pos:
398 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
399 currentAcc[pos] = 1e-6
400
401 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
402 currentAcc[pos] = 1e-6
403
404 for pos in gt_tuple_pos:
405 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
406 currentDon[pos] = 1e-6
407
408 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
409 currentDon[pos] = 1e-6
410
411 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
412 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
413
414 if perform_checks and not ag_tuple_pos == acc_pos:
415 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
416 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
417 print acc_pos[:10],acc_pos[-10:]
418 pdb.set_trace()
419
420 if perform_checks and not gt_tuple_pos == don_pos:
421 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
422 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
423 print don_pos[:10],don_pos[-10:]
424 pdb.set_trace()
425
426 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
427
428 return genomicSeq, currentAcc, currentDon