+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import doIntervalQuery,new_intp,intp_assign
29
30 from QPalmaDP import createIntArrayFromList
31
32
33 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
34 """
35 This function stems from Cheng Soon Ongs genome_utils package.
36 """
37
38 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
39 assert(len(start) == len(stop))
40 assert((start[1:]-stop[:-1]>0).all())
41 if strand == '+':
42 idx = xrange(len(start))
43 else:
44 idx = xrange(len(start)-1,-1,-1)
45
46 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
47 for ix in idx])
48 return seq
49
50 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
51 fname = genome + '/' + chromosome + '.dna.flat'
52 f=file(fname)
53 if one_based:
54 f.seek(start-1)
55 str=f.read(stop-start+1)
56 else:
57 f.seek(start)
58 str=f.read(stop-start)
59
60 if strand=='-':
61 return reverse_complement(str)
62 elif strand=='+':
63 return str
64 else:
65 print 'strand must be + or -'
66 raise KeyError
67
68
69 extended_alphabet = ['-','a','c','g','t','n','[',']']
70 alphabet = ['-','a','c','g','t','n']
71
72
73 class DataAccessionException:
74 pass
75
76
77 def get_flat_file_size(filename):
78 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
79 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
80 out,err = obj.communicate()
81
82 if err != '':
83 print 'Error occurred while trying to obtain file size'
84 raise DataAccessionException
85
86 # subtract one because of wc -c output
87 return int(out)-1
88
89
90 def reverse_complement(seq):
91 """
92 This function takes a read in plain or bracket notation and returns the
93 reverse complement of it.
94 I.e.
95
96 est = cgt[ac][tg]a
97
98 leads to
99
100 rev_comp = t[ac][tg]acg
101 """
102
103 bpos = seq.find('[')
104 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
105
106 # check first whether seq contains no brackets at all
107 if bpos == -1:
108 ret_val = map(rc,seq)
109 ret_val.reverse()
110 ret_val = "".join(ret_val)
111 else:
112 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
113
114 # first_part is the part of the seq up to the first occurrence of a
115 # bracket
116 first_part = seq[:bpos]
117 first_part = map(rc,first_part)
118 first_part.reverse()
119 first_part = "".join(first_part)
120
121 # inside brackets has to be complemented but NOT reversed
122 inside_brackets = seq[bpos+1:bpos+3]
123 inside_brackets = "".join(map(rc,inside_brackets))
124
125 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
126
127 return ret_val
128
129
130 def unbracket_seq(seq):
131 """
132 This function takes a read in bracket notation and restores the read sequence from it.
133 I.e.
134
135 seq = cgt[ac][tg]aa
136
137 leads to
138
139 result = cgtcgaa
140
141 so the second entry within brackets is the base on the read whereas the first
142 entry is the base from the dna.
143 """
144
145 new_seq = ''
146 e = 0
147
148 while True:
149 if e >= len(seq):
150 break
151
152 if seq[e] == '[':
153 new_seq += seq[e+2]
154 e += 4
155 else:
156 new_seq += seq[e]
157 e += 1
158
159 return "".join(new_seq).lower()
160
161
162 def reconstruct_dna_seq(seq):
163 """
164
165 """
166
167 new_seq = ''
168 e = 0
169
170 while True:
171 if e >= len(seq):
172 break
173
174 if seq[e] == '[':
175 new_seq += seq[e+1]
176 e += 4
177 else:
178 new_seq += seq[e]
179 e += 1
180
181 return "".join(new_seq).lower()
182
183
184
185 def create_bracket_seq(dna_seq,read_seq):
186 """
187 This function takes a dna sequence and a read sequence and returns the
188 bracket format of the match/mismatches i.e.
189
190 dna : aaa
191 read: aac
192 is written in bracket notation: aa[ac]
193 """
194 assert len(dna_seq) == len(read_seq),pdb.set_trace()
195 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
196
197
198
199 def my_load_genomic(fname, strand, start, stop, one_based=True):
200
201 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
202 assert(len(start) == len(stop))
203 assert((start[1:]-stop[:-1]>0).all())
204 if strand == '+':
205 idx = xrange(len(start))
206 else:
207 idx = xrange(len(start)-1,-1,-1)
208
209 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
210 for ix in idx])
211 return seq
212
213 try:
214 f=file(fname)
215 if one_based:
216 f.seek(start-1)
217 str=f.read(stop-start+1)
218 else:
219 f.seek(start)
220 str=f.read(stop-start)
221
222 if strand=='-':
223 return reverse_complement(str)
224 elif strand=='+':
225 return str
226 else:
227 print 'strand must be + or -'
228 raise KeyError
229
230 except:
231 print fname,strand,start,stop
232
233
234 class DataAccessWrapper:
235 """
236 The purpose of this class is to wrap the genomic/splice site score data
237 access.
238 """
239
240 def __init__(self,settings):
241 self.genomic_dir = settings['genome_dir']
242 self.acc_score_dir = settings['acceptor_scores_loc']
243 self.don_score_dir = settings['donor_scores_loc']
244 self.genomic_fmt = settings['genome_file_fmt']
245 self.sscore_fmt = settings['splice_score_file_fmt']
246
247
248 def get_genomic_fragment_fn(self,id):
249 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
250
251
252 def get_splice_site_scores_fn(self,id,strand):
253 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
254 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
255 return acc_fn,don_fn
256
257
258
259 class SeqSpliceInfo():
260 """
261
262 """
263
264 def __init__(self,accessWrapper,fragment_list):
265 self.accessWrapper = accessWrapper
266
267 self.chromo_sizes = {}
268
269 # take care that it also works say if we get a chromo_list [1,3,5]
270 # those are mapped then to 0,1,2
271 self.fragment_list = fragment_list
272 self.chromo_map = {}
273 for idx,elem in enumerate(fragment_list):
274 self.chromo_map[elem] = idx
275
276 print "Fetching fragment sizes..."
277 for id in fragment_list:
278 filename = self.accessWrapper.get_genomic_fragment_fn(id)
279 total_size = get_flat_file_size(filename)
280 self.chromo_sizes[self.chromo_map[id]] = total_size
281
282 print "done"
283
284
285 def getFragmentSize(self,id):
286 return self.chromo_sizes[self.chromo_map[id]]
287
288
289 def getSpliceScores(self,id,strand,intervalBegin,intervalEnd,genomicSeq=''):
290 """
291 Now we want to use interval_query to get the predicted splice scores trained
292 on the TAIR sequence and annotation.
293 """
294
295 size = intervalEnd-intervalBegin
296 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
297
298 acc = size*[0.0]
299 don = size*[0.0]
300
301 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
302 pos_size = new_intp()
303 intp_assign(pos_size,1)
304
305 total_size = self.getFragmentSize(id)
306
307 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(id,strand)
308
309 # fetch acceptor scores
310 acc = doIntervalQuery(id,strand,intervalBegin,intervalEnd,acc_fn,total_size)
311
312 # fetch donor scores
313 don = doIntervalQuery(id,strand,intervalBegin,intervalEnd,don_fn,total_size)
314
315 return acc, don
316
317
318 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
319 filename = self.accessWrapper.get_genomic_fragment_fn(chromo)
320 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
321 genomicSeq = genomicSeq.strip().lower()
322
323 # check the obtained dna sequence
324 assert genomicSeq != '', 'load_genomic returned empty sequence!'
325
326 # all entries other than a c g t are set to n
327 no_base = re.compile('[^acgt]')
328 genomicSeq = no_base.sub('n',genomicSeq)
329
330 return genomicSeq
331
332
333 def get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
334 """
335 This function expects an interval, chromosome and strand information and
336 returns then the genomic sequence of this interval and the associated scores.
337 """
338 # Because splice site scores are predicted using a window of the sequence the
339 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
340 # do not have any score predictions
341 NO_SCORE_WINDOW_SIZE = 205
342
343 chromo = self.chromo_map[id]
344
345 total_size = self.chromo_sizes[chromo]
346
347 if strand == '-':
348 s_start = total_size - genomicSeq_stop
349 s_stop = total_size - genomicSeq_start
350 genomicSeq_start = s_start
351 genomicSeq_stop = s_stop
352
353 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
354
355 genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
356
357 if strand == '-':
358 genomicSeq = reverse_complement(genomicSeq)
359
360 if only_seq:
361 return genomicSeq
362
363 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
364 lookup_offset_begin = 10
365 lookup_offset_end = 10
366 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
367 if intervalBegin == 0:
368 lookup_offset_begin = 0
369
370 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
371 if intervalEnd == total_size-1:
372 lookup_offset_end = 0
373
374 currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd,genomicSeq)
375
376 if strand == '-':
377 currentAcc = currentAcc[1:]
378 #currentDon = currentDon[1:]
379 #currentDon = [-inf]+currentDon[:-1]
380 else:
381 currentAcc = currentAcc[2:]
382 currentDon = currentDon[1:]
383
384 # remove the offset positions
385 currentAcc = currentAcc[lookup_offset_begin:]
386 currentDon = currentDon[lookup_offset_begin:]
387
388 currentAcc = currentAcc[:len(genomicSeq)]
389 currentDon = currentDon[:len(genomicSeq)]
390
391 length = len(genomicSeq)
392 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
393 currentDon = currentDon+[-inf]*(length-len(currentDon))
394 currentDon[-1] = -inf
395
396 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
397 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
398
399 for pos in ag_tuple_pos:
400 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
401 currentAcc[pos] = 1e-6
402
403 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
404 currentAcc[pos] = 1e-6
405
406 for pos in gt_tuple_pos:
407 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
408 currentDon[pos] = 1e-6
409
410 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
411 currentDon[pos] = 1e-6
412
413 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
414 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
415
416 if perform_checks and not ag_tuple_pos == acc_pos:
417 print 'ACC: Chromo/strand: %d/%s' % (id,strand)
418 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
419 print acc_pos[:10],acc_pos[-10:]
420 pdb.set_trace()
421
422 if perform_checks and not gt_tuple_pos == don_pos:
423 print 'DON: Chromo/strand: %d/%s' % (id,strand)
424 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
425 print don_pos[:10],don_pos[-10:]
426 pdb.set_trace()
427
428 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
429
430 return genomicSeq, currentAcc, currentDon