+ changed python and c code in order to perform easier parameter handling and
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import os.path
13 import pdb
14 import re
15 import subprocess
16 import numpy
17
18 from numpy.matlib import inf
19
20 from Genefinding import *
21 from genome_utils import load_genomic
22
23 extended_alphabet = ['-','a','c','g','t','n','[',']']
24 alphabet = ['-','a','c','g','t','n']
25
26
27 class DataAccessionException:
28 pass
29
30
31 def get_flat_file_size(filename):
32 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
33 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
34 out,err = obj.communicate()
35
36 if err != '':
37 print 'Error occurred while trying to obtain file size'
38 raise DataAccessionException
39
40 return int(out)
41
42
43 def reverse_complement(seq):
44 """
45 This function takes a read in plain or bracket notation and returns the
46 reverse complement of it.
47 I.e.
48
49 est = cgt[ac][tg]a
50
51 leads to
52
53 rev_comp = t[ac][tg]acg
54 """
55
56 bpos = seq.find('[')
57 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
58
59 # check first whether seq contains no brackets at all
60 if bpos == -1:
61 ret_val = map(rc,seq)
62 ret_val.reverse()
63 ret_val = "".join(ret_val)
64 else:
65 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
66
67 # first_part is the part of the seq up to the first occurrence of a
68 # bracket
69 first_part = seq[:bpos]
70 first_part = map(rc,first_part)
71 first_part.reverse()
72 first_part = "".join(first_part)
73
74 # inside brackets has to be complemented but NOT reversed
75 inside_brackets = seq[bpos+1:bpos+3]
76 inside_brackets = "".join(map(rc,inside_brackets))
77
78 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
79
80 return ret_val
81
82
83 def unbracket_seq(seq):
84 """
85 This function takes a read in bracket notation and restores the read sequence from it.
86 I.e.
87
88 seq = cgt[ac][tg]aa
89
90 leads to
91
92 result = cgtcgaa
93
94 so the second entry within brackets is the base on the read whereas the first
95 entry is the base from the dna.
96 """
97
98 new_seq = ''
99 e = 0
100
101 while True:
102 if e >= len(seq):
103 break
104
105 if seq[e] == '[':
106 new_seq += seq[e+2]
107 e += 4
108 else:
109 new_seq += seq[e]
110 e += 1
111
112 return "".join(new_seq).lower()
113
114
115 def reconstruct_dna_seq(seq):
116 """
117
118 """
119
120 new_seq = ''
121 e = 0
122
123 while True:
124 if e >= len(seq):
125 break
126
127 if seq[e] == '[':
128 new_seq += seq[e+1]
129 e += 4
130 else:
131 new_seq += seq[e]
132 e += 1
133
134 return "".join(new_seq).lower()
135
136
137
138 def create_bracket_seq(dna_seq,read_seq):
139 """
140 This function takes a dna sequence and a read sequence and returns the
141 bracket format of the match/mismatches i.e.
142
143 dna : aaa
144 read: aac
145 is written in bracket notation: aa[ac]
146 """
147 assert len(dna_seq) == len(read_seq),pdb.set_trace()
148 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
149
150
151
152 def my_load_genomic(fname, strand, start, stop, one_based=True):
153
154 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
155 assert(len(start) == len(stop))
156 assert((start[1:]-stop[:-1]>0).all())
157 if strand == '+':
158 idx = xrange(len(start))
159 else:
160 idx = xrange(len(start)-1,-1,-1)
161
162 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
163 for ix in idx])
164 return seq
165
166 try:
167 f=file(fname)
168 if one_based:
169 f.seek(start-1)
170 str=f.read(stop-start+1)
171 else:
172 f.seek(start)
173 str=f.read(stop-start)
174
175 if strand=='-':
176 return reverse_complement(str)
177 elif strand=='+':
178 return str
179 else:
180 print 'strand must be + or -'
181 raise KeyError
182
183 except:
184 print fname,strand,start,stop
185
186
187 class DataAccessWrapper:
188 """
189 The purpose of this class is to wrap the genomic/splice site score data
190 access.
191
192
193
194 """
195
196
197 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
198 self.genomic_dir = genomic_data_dir
199 self.acc_score_dir = acc_score_dir
200 self.don_score_dir = don_score_dir
201 self.genomic_fmt = gen_fmt
202 self.sscore_fmt = sscore_fmt
203
204 assert os.path.isdir(genomic_data_dir)
205 assert os.path.isdir(acc_score_dir)
206 assert os.path.isdir(don_score_dir)
207
208
209 def get_genomic_fragment_fn(self,id,strand):
210 if strand == '+':
211 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
212 elif strand == '-':
213 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
214 else:
215 assert False
216
217
218 def get_splice_site_scores_fn(self,id,strand):
219 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
220 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
221 return acc_fn,don_fn
222
223
224
225 class SeqSpliceInfo():
226 """
227
228 """
229
230 def __init__(self,accessWrapper,id_list):
231 self.accessWrapper = accessWrapper
232
233 self.chromo_sizes = {}
234
235 print "Fetching fragment sizes..."
236 for chromo in id_list:
237 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
238 total_size = get_flat_file_size(filename)
239 self.chromo_sizes[chromo] = total_size
240
241 print "done"
242
243
244 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
245 """
246 Now we want to use interval_query to get the predicted splice scores trained
247 on the TAIR sequence and annotation.
248 """
249
250 size = intervalEnd-intervalBegin
251 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
252
253 acc = size*[0.0]
254 don = size*[0.0]
255
256 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
257 pos_size = new_intp()
258 intp_assign(pos_size,1)
259
260 total_size = self.chromo_sizes[chromo]
261
262 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
263
264 # fetch acceptor scores
265 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
266
267 # fetch donor scores
268 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
269
270 return acc, don
271
272
273 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
274 #chromo_string = 'chr%d' % chromo
275 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
276 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
277 #print filename,genomicSeq_start,genomicSeq_stop
278 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
279 genomicSeq = genomicSeq.strip().lower()
280
281 # check the obtained dna sequence
282 assert genomicSeq != '', 'load_genomic returned empty sequence!'
283
284 # all entries other than a c g t are set to n
285 no_base = re.compile('[^acgt]')
286 genomicSeq = no_base.sub('n',genomicSeq)
287
288 return genomicSeq
289
290
291 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
292 """
293 This function expects an interval, chromosome and strand information and
294 returns then the genomic sequence of this interval and the associated scores.
295 """
296 # Because splice site scores are predicted using a window of the sequence the
297 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
298 # do not have any score predictions
299 NO_SCORE_WINDOW_SIZE = 205
300
301 total_size = self.chromo_sizes[chromo]
302
303 #print genomicSeq_start,genomicSeq_stop
304
305 assert genomicSeq_start < genomicSeq_stop
306
307 if strand == '+':
308 s_start = genomicSeq_start - 1
309 s_stop = genomicSeq_stop - 1
310
311 genomicSeq_start -= 1
312 genomicSeq_stop -= 1
313
314 if strand == '-':
315 s_start = total_size - genomicSeq_stop + 1
316 s_stop = total_size - genomicSeq_start + 1
317
318 assert genomicSeq_start < genomicSeq_stop
319
320 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
321
322 if only_seq:
323 return genomicSeq
324
325 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
326 lookup_offset_begin = 10
327 lookup_offset_end = 10
328 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
329 if intervalBegin == 0:
330 lookup_offset_begin = 0
331
332 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
333 if intervalEnd == total_size-1:
334 lookup_offset_end = 0
335
336 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
337
338 if strand == '-':
339 currentAcc = currentAcc[1:]
340 #currentDon = currentDon[1:]
341 #currentDon = [-inf]+currentDon[:-1]
342 else:
343 currentAcc = currentAcc[2:]
344 currentDon = currentDon[1:]
345
346 # remove the offset positions
347 currentAcc = currentAcc[lookup_offset_begin:]
348 currentDon = currentDon[lookup_offset_begin:]
349
350 currentAcc = currentAcc[:len(genomicSeq)]
351 currentDon = currentDon[:len(genomicSeq)]
352
353 length = len(genomicSeq)
354 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
355 currentDon = currentDon+[-inf]*(length-len(currentDon))
356 currentDon[-1] = -inf
357
358 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
359 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
360
361 for pos in ag_tuple_pos:
362 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
363 currentAcc[pos] = 1e-6
364
365 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
366 currentAcc[pos] = 1e-6
367
368 for pos in gt_tuple_pos:
369 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
370 currentDon[pos] = 1e-6
371
372 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
373 currentDon[pos] = 1e-6
374
375 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
376 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
377
378 if perform_checks and not ag_tuple_pos == acc_pos:
379 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
380 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
381 print acc_pos[:10],acc_pos[-10:]
382 pdb.set_trace()
383
384 if perform_checks and not gt_tuple_pos == don_pos:
385 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
386 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
387 print don_pos[:10],don_pos[-10:]
388 pdb.set_trace()
389
390 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
391
392 return genomicSeq, currentAcc, currentDon