+ changed module hierarchy a bit
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import *
29 from genome_utils import load_genomic
30
31 extended_alphabet = ['-','a','c','g','t','n','[',']']
32 alphabet = ['-','a','c','g','t','n']
33
34
35 class DataAccessionException:
36 pass
37
38
39 def get_flat_file_size(filename):
40 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
41 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
42 out,err = obj.communicate()
43
44 if err != '':
45 print 'Error occurred while trying to obtain file size'
46 raise DataAccessionException
47
48 return int(out)
49
50
51 def reverse_complement(seq):
52 """
53 This function takes a read in plain or bracket notation and returns the
54 reverse complement of it.
55 I.e.
56
57 est = cgt[ac][tg]a
58
59 leads to
60
61 rev_comp = t[ac][tg]acg
62 """
63
64 bpos = seq.find('[')
65 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
66
67 # check first whether seq contains no brackets at all
68 if bpos == -1:
69 ret_val = map(rc,seq)
70 ret_val.reverse()
71 ret_val = "".join(ret_val)
72 else:
73 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
74
75 # first_part is the part of the seq up to the first occurrence of a
76 # bracket
77 first_part = seq[:bpos]
78 first_part = map(rc,first_part)
79 first_part.reverse()
80 first_part = "".join(first_part)
81
82 # inside brackets has to be complemented but NOT reversed
83 inside_brackets = seq[bpos+1:bpos+3]
84 inside_brackets = "".join(map(rc,inside_brackets))
85
86 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
87
88 return ret_val
89
90
91 def unbracket_seq(seq):
92 """
93 This function takes a read in bracket notation and restores the read sequence from it.
94 I.e.
95
96 seq = cgt[ac][tg]aa
97
98 leads to
99
100 result = cgtcgaa
101
102 so the second entry within brackets is the base on the read whereas the first
103 entry is the base from the dna.
104 """
105
106 new_seq = ''
107 e = 0
108
109 while True:
110 if e >= len(seq):
111 break
112
113 if seq[e] == '[':
114 new_seq += seq[e+2]
115 e += 4
116 else:
117 new_seq += seq[e]
118 e += 1
119
120 return "".join(new_seq).lower()
121
122
123 def reconstruct_dna_seq(seq):
124 """
125
126 """
127
128 new_seq = ''
129 e = 0
130
131 while True:
132 if e >= len(seq):
133 break
134
135 if seq[e] == '[':
136 new_seq += seq[e+1]
137 e += 4
138 else:
139 new_seq += seq[e]
140 e += 1
141
142 return "".join(new_seq).lower()
143
144
145
146 def create_bracket_seq(dna_seq,read_seq):
147 """
148 This function takes a dna sequence and a read sequence and returns the
149 bracket format of the match/mismatches i.e.
150
151 dna : aaa
152 read: aac
153 is written in bracket notation: aa[ac]
154 """
155 assert len(dna_seq) == len(read_seq),pdb.set_trace()
156 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
157
158
159
160 def my_load_genomic(fname, strand, start, stop, one_based=True):
161
162 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
163 assert(len(start) == len(stop))
164 assert((start[1:]-stop[:-1]>0).all())
165 if strand == '+':
166 idx = xrange(len(start))
167 else:
168 idx = xrange(len(start)-1,-1,-1)
169
170 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
171 for ix in idx])
172 return seq
173
174 try:
175 f=file(fname)
176 if one_based:
177 f.seek(start-1)
178 str=f.read(stop-start+1)
179 else:
180 f.seek(start)
181 str=f.read(stop-start)
182
183 if strand=='-':
184 return reverse_complement(str)
185 elif strand=='+':
186 return str
187 else:
188 print 'strand must be + or -'
189 raise KeyError
190
191 except:
192 print fname,strand,start,stop
193
194
195 class DataAccessWrapper:
196 """
197 The purpose of this class is to wrap the genomic/splice site score data
198 access.
199
200
201
202 """
203
204
205 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
206 self.genomic_dir = genomic_data_dir
207 self.acc_score_dir = acc_score_dir
208 self.don_score_dir = don_score_dir
209 self.genomic_fmt = gen_fmt
210 self.sscore_fmt = sscore_fmt
211
212 assert os.path.isdir(genomic_data_dir)
213 assert os.path.isdir(acc_score_dir)
214 assert os.path.isdir(don_score_dir)
215
216
217 def get_genomic_fragment_fn(self,id,strand):
218 if strand == '+':
219 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
220 elif strand == '-':
221 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
222 else:
223 assert False
224
225
226 def get_splice_site_scores_fn(self,id,strand):
227 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
228 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
229 return acc_fn,don_fn
230
231
232
233 class SeqSpliceInfo():
234 """
235
236 """
237
238 def __init__(self,accessWrapper,id_list):
239 self.accessWrapper = accessWrapper
240
241 self.chromo_sizes = {}
242
243 print "Fetching fragment sizes..."
244 for chromo in id_list:
245 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
246 total_size = get_flat_file_size(filename)
247 self.chromo_sizes[chromo] = total_size
248
249 print "done"
250
251
252 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
253 """
254 Now we want to use interval_query to get the predicted splice scores trained
255 on the TAIR sequence and annotation.
256 """
257
258 size = intervalEnd-intervalBegin
259 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
260
261 acc = size*[0.0]
262 don = size*[0.0]
263
264 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
265 pos_size = new_intp()
266 intp_assign(pos_size,1)
267
268 total_size = self.chromo_sizes[chromo]
269
270 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
271
272 # fetch acceptor scores
273 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
274
275 # fetch donor scores
276 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
277
278 return acc, don
279
280
281 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
282 #chromo_string = 'chr%d' % chromo
283 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
284 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
285 #print filename,genomicSeq_start,genomicSeq_stop
286 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
287 genomicSeq = genomicSeq.strip().lower()
288
289 # check the obtained dna sequence
290 assert genomicSeq != '', 'load_genomic returned empty sequence!'
291
292 # all entries other than a c g t are set to n
293 no_base = re.compile('[^acgt]')
294 genomicSeq = no_base.sub('n',genomicSeq)
295
296 return genomicSeq
297
298
299 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
300 """
301 This function expects an interval, chromosome and strand information and
302 returns then the genomic sequence of this interval and the associated scores.
303 """
304 # Because splice site scores are predicted using a window of the sequence the
305 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
306 # do not have any score predictions
307 NO_SCORE_WINDOW_SIZE = 205
308
309 total_size = self.chromo_sizes[chromo]
310
311 #print genomicSeq_start,genomicSeq_stop
312
313 assert genomicSeq_start < genomicSeq_stop
314
315 if strand == '+':
316 s_start = genomicSeq_start - 1
317 s_stop = genomicSeq_stop - 1
318
319 genomicSeq_start -= 1
320 genomicSeq_stop -= 1
321
322 if strand == '-':
323 s_start = total_size - genomicSeq_stop + 1
324 s_stop = total_size - genomicSeq_start + 1
325
326 assert genomicSeq_start < genomicSeq_stop
327
328 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
329
330 if only_seq:
331 return genomicSeq
332
333 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
334 lookup_offset_begin = 10
335 lookup_offset_end = 10
336 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
337 if intervalBegin == 0:
338 lookup_offset_begin = 0
339
340 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
341 if intervalEnd == total_size-1:
342 lookup_offset_end = 0
343
344 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
345
346 if strand == '-':
347 currentAcc = currentAcc[1:]
348 #currentDon = currentDon[1:]
349 #currentDon = [-inf]+currentDon[:-1]
350 else:
351 currentAcc = currentAcc[2:]
352 currentDon = currentDon[1:]
353
354 # remove the offset positions
355 currentAcc = currentAcc[lookup_offset_begin:]
356 currentDon = currentDon[lookup_offset_begin:]
357
358 currentAcc = currentAcc[:len(genomicSeq)]
359 currentDon = currentDon[:len(genomicSeq)]
360
361 length = len(genomicSeq)
362 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
363 currentDon = currentDon+[-inf]*(length-len(currentDon))
364 currentDon[-1] = -inf
365
366 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
367 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
368
369 for pos in ag_tuple_pos:
370 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
371 currentAcc[pos] = 1e-6
372
373 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
374 currentAcc[pos] = 1e-6
375
376 for pos in gt_tuple_pos:
377 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
378 currentDon[pos] = 1e-6
379
380 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
381 currentDon[pos] = 1e-6
382
383 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
384 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
385
386 if perform_checks and not ag_tuple_pos == acc_pos:
387 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
388 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
389 print acc_pos[:10],acc_pos[-10:]
390 pdb.set_trace()
391
392 if perform_checks and not gt_tuple_pos == don_pos:
393 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
394 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
395 print don_pos[:10],don_pos[-10:]
396 pdb.set_trace()
397
398 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
399
400 return genomicSeq, currentAcc, currentDon