8effa988c7e409d5796877531b1f5380110927d6
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import *
29 from genome_utils import load_genomic
30
31 extended_alphabet = ['-','a','c','g','t','n','[',']']
32 alphabet = ['-','a','c','g','t','n']
33
34
35 class DataAccessionException:
36 pass
37
38
39 def get_flat_file_size(filename):
40 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
41 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
42 out,err = obj.communicate()
43
44 if err != '':
45 print 'Error occurred while trying to obtain file size'
46 raise DataAccessionException
47
48 return int(out)
49
50
51 def reverse_complement(seq):
52 """
53 This function takes a read in plain or bracket notation and returns the
54 reverse complement of it.
55 I.e.
56
57 est = cgt[ac][tg]a
58
59 leads to
60
61 rev_comp = t[ac][tg]acg
62 """
63
64 bpos = seq.find('[')
65 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
66
67 # check first whether seq contains no brackets at all
68 if bpos == -1:
69 ret_val = map(rc,seq)
70 ret_val.reverse()
71 ret_val = "".join(ret_val)
72 else:
73 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
74
75 # first_part is the part of the seq up to the first occurrence of a
76 # bracket
77 first_part = seq[:bpos]
78 first_part = map(rc,first_part)
79 first_part.reverse()
80 first_part = "".join(first_part)
81
82 # inside brackets has to be complemented but NOT reversed
83 inside_brackets = seq[bpos+1:bpos+3]
84 inside_brackets = "".join(map(rc,inside_brackets))
85
86 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
87
88 return ret_val
89
90
91 def unbracket_seq(seq):
92 """
93 This function takes a read in bracket notation and restores the read sequence from it.
94 I.e.
95
96 seq = cgt[ac][tg]aa
97
98 leads to
99
100 result = cgtcgaa
101
102 so the second entry within brackets is the base on the read whereas the first
103 entry is the base from the dna.
104 """
105
106 new_seq = ''
107 e = 0
108
109 while True:
110 if e >= len(seq):
111 break
112
113 if seq[e] == '[':
114 new_seq += seq[e+2]
115 e += 4
116 else:
117 new_seq += seq[e]
118 e += 1
119
120 return "".join(new_seq).lower()
121
122
123 def reconstruct_dna_seq(seq):
124 """
125
126 """
127
128 new_seq = ''
129 e = 0
130
131 while True:
132 if e >= len(seq):
133 break
134
135 if seq[e] == '[':
136 new_seq += seq[e+1]
137 e += 4
138 else:
139 new_seq += seq[e]
140 e += 1
141
142 return "".join(new_seq).lower()
143
144
145
146 def create_bracket_seq(dna_seq,read_seq):
147 """
148 This function takes a dna sequence and a read sequence and returns the
149 bracket format of the match/mismatches i.e.
150
151 dna : aaa
152 read: aac
153 is written in bracket notation: aa[ac]
154 """
155 assert len(dna_seq) == len(read_seq),pdb.set_trace()
156 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
157
158
159
160 def my_load_genomic(fname, strand, start, stop, one_based=True):
161
162 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
163 assert(len(start) == len(stop))
164 assert((start[1:]-stop[:-1]>0).all())
165 if strand == '+':
166 idx = xrange(len(start))
167 else:
168 idx = xrange(len(start)-1,-1,-1)
169
170 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
171 for ix in idx])
172 return seq
173
174 try:
175 f=file(fname)
176 if one_based:
177 f.seek(start-1)
178 str=f.read(stop-start+1)
179 else:
180 f.seek(start)
181 str=f.read(stop-start)
182
183 if strand=='-':
184 return reverse_complement(str)
185 elif strand=='+':
186 return str
187 else:
188 print 'strand must be + or -'
189 raise KeyError
190
191 except:
192 print fname,strand,start,stop
193
194
195 class DataAccessWrapper:
196 """
197 The purpose of this class is to wrap the genomic/splice site score data
198 access.
199 """
200
201 def __init__(self,settings):
202 self.genomic_dir = settings['genome_dir']
203 self.acc_score_dir = settings['acceptor_scores_loc']
204 self.don_score_dir = settings['donor_scores_loc']
205 self.genomic_fmt = settings['genome_file_fmt']
206 self.sscore_fmt = settings['splice_score_file_fmt']
207
208
209 def get_genomic_fragment_fn(self,id,strand):
210 if strand == '+':
211 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
212 elif strand == '-':
213 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
214 else:
215 assert False
216
217
218 def get_splice_site_scores_fn(self,id,strand):
219 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
220 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
221 return acc_fn,don_fn
222
223
224
225 class SeqSpliceInfo():
226 """
227
228 """
229
230 def __init__(self,accessWrapper,id_list):
231 self.accessWrapper = accessWrapper
232
233 self.chromo_sizes = {}
234
235 print "Fetching fragment sizes..."
236 for chromo in id_list:
237 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
238 total_size = get_flat_file_size(filename)
239 self.chromo_sizes[chromo] = total_size
240
241 print "done"
242
243
244 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
245 """
246 Now we want to use interval_query to get the predicted splice scores trained
247 on the TAIR sequence and annotation.
248 """
249
250 size = intervalEnd-intervalBegin
251 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
252
253 acc = size*[0.0]
254 don = size*[0.0]
255
256 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
257 pos_size = new_intp()
258 intp_assign(pos_size,1)
259
260 total_size = self.chromo_sizes[chromo]
261
262 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
263
264 # fetch acceptor scores
265 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
266
267 # fetch donor scores
268 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
269
270 return acc, don
271
272
273 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
274 #chromo_string = 'chr%d' % chromo
275 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
276 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
277 #print filename,genomicSeq_start,genomicSeq_stop
278 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
279 genomicSeq = genomicSeq.strip().lower()
280
281 # check the obtained dna sequence
282 assert genomicSeq != '', 'load_genomic returned empty sequence!'
283
284 # all entries other than a c g t are set to n
285 no_base = re.compile('[^acgt]')
286 genomicSeq = no_base.sub('n',genomicSeq)
287
288 return genomicSeq
289
290
291 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
292 """
293 This function expects an interval, chromosome and strand information and
294 returns then the genomic sequence of this interval and the associated scores.
295 """
296 # Because splice site scores are predicted using a window of the sequence the
297 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
298 # do not have any score predictions
299 NO_SCORE_WINDOW_SIZE = 205
300
301 total_size = self.chromo_sizes[chromo]
302
303 #print genomicSeq_start,genomicSeq_stop
304
305 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
306
307 if strand == '+':
308 s_start = genomicSeq_start - 1
309 s_stop = genomicSeq_stop - 1
310
311 genomicSeq_start -= 1
312 genomicSeq_stop -= 1
313
314 if strand == '-':
315 s_start = total_size - genomicSeq_stop + 1
316 s_stop = total_size - genomicSeq_start + 1
317
318 assert genomicSeq_start < genomicSeq_stop
319
320 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
321
322 if only_seq:
323 return genomicSeq
324
325 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
326 lookup_offset_begin = 10
327 lookup_offset_end = 10
328 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
329 if intervalBegin == 0:
330 lookup_offset_begin = 0
331
332 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
333 if intervalEnd == total_size-1:
334 lookup_offset_end = 0
335
336 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
337
338 if strand == '-':
339 currentAcc = currentAcc[1:]
340 #currentDon = currentDon[1:]
341 #currentDon = [-inf]+currentDon[:-1]
342 else:
343 currentAcc = currentAcc[2:]
344 currentDon = currentDon[1:]
345
346 # remove the offset positions
347 currentAcc = currentAcc[lookup_offset_begin:]
348 currentDon = currentDon[lookup_offset_begin:]
349
350 currentAcc = currentAcc[:len(genomicSeq)]
351 currentDon = currentDon[:len(genomicSeq)]
352
353 length = len(genomicSeq)
354 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
355 currentDon = currentDon+[-inf]*(length-len(currentDon))
356 currentDon[-1] = -inf
357
358 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
359 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
360
361 for pos in ag_tuple_pos:
362 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
363 currentAcc[pos] = 1e-6
364
365 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
366 currentAcc[pos] = 1e-6
367
368 for pos in gt_tuple_pos:
369 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
370 currentDon[pos] = 1e-6
371
372 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
373 currentDon[pos] = 1e-6
374
375 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
376 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
377
378 if perform_checks and not ag_tuple_pos == acc_pos:
379 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
380 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
381 print acc_pos[:10],acc_pos[-10:]
382 pdb.set_trace()
383
384 if perform_checks and not gt_tuple_pos == don_pos:
385 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
386 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
387 print don_pos[:10],don_pos[-10:]
388 pdb.set_trace()
389
390 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
391
392 return genomicSeq, currentAcc, currentDon