+ changed dataset compilation to support new framework
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import *
29 from genome_utils import load_genomic
30
31 extended_alphabet = ['-','a','c','g','t','n','[',']']
32 alphabet = ['-','a','c','g','t','n']
33
34
35 class DataAccessionException:
36 pass
37
38
39 def get_flat_file_size(filename):
40 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
41 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
42 out,err = obj.communicate()
43
44 if err != '':
45 print 'Error occurred while trying to obtain file size'
46 raise DataAccessionException
47
48 return int(out)
49
50
51 def reverse_complement(seq):
52 """
53 This function takes a read in plain or bracket notation and returns the
54 reverse complement of it.
55 I.e.
56
57 est = cgt[ac][tg]a
58
59 leads to
60
61 rev_comp = t[ac][tg]acg
62 """
63
64 bpos = seq.find('[')
65 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
66
67 # check first whether seq contains no brackets at all
68 if bpos == -1:
69 ret_val = map(rc,seq)
70 ret_val.reverse()
71 ret_val = "".join(ret_val)
72 else:
73 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
74
75 # first_part is the part of the seq up to the first occurrence of a
76 # bracket
77 first_part = seq[:bpos]
78 first_part = map(rc,first_part)
79 first_part.reverse()
80 first_part = "".join(first_part)
81
82 # inside brackets has to be complemented but NOT reversed
83 inside_brackets = seq[bpos+1:bpos+3]
84 inside_brackets = "".join(map(rc,inside_brackets))
85
86 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
87
88 return ret_val
89
90
91 def unbracket_seq(seq):
92 """
93 This function takes a read in bracket notation and restores the read sequence from it.
94 I.e.
95
96 seq = cgt[ac][tg]aa
97
98 leads to
99
100 result = cgtcgaa
101
102 so the second entry within brackets is the base on the read whereas the first
103 entry is the base from the dna.
104 """
105
106 new_seq = ''
107 e = 0
108
109 while True:
110 if e >= len(seq):
111 break
112
113 if seq[e] == '[':
114 new_seq += seq[e+2]
115 e += 4
116 else:
117 new_seq += seq[e]
118 e += 1
119
120 return "".join(new_seq).lower()
121
122
123 def reconstruct_dna_seq(seq):
124 """
125
126 """
127
128 new_seq = ''
129 e = 0
130
131 while True:
132 if e >= len(seq):
133 break
134
135 if seq[e] == '[':
136 new_seq += seq[e+1]
137 e += 4
138 else:
139 new_seq += seq[e]
140 e += 1
141
142 return "".join(new_seq).lower()
143
144
145
146 def create_bracket_seq(dna_seq,read_seq):
147 """
148 This function takes a dna sequence and a read sequence and returns the
149 bracket format of the match/mismatches i.e.
150
151 dna : aaa
152 read: aac
153 is written in bracket notation: aa[ac]
154 """
155 assert len(dna_seq) == len(read_seq),pdb.set_trace()
156 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
157
158
159
160 def my_load_genomic(fname, strand, start, stop, one_based=True):
161
162 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
163 assert(len(start) == len(stop))
164 assert((start[1:]-stop[:-1]>0).all())
165 if strand == '+':
166 idx = xrange(len(start))
167 else:
168 idx = xrange(len(start)-1,-1,-1)
169
170 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
171 for ix in idx])
172 return seq
173
174 try:
175 f=file(fname)
176 if one_based:
177 f.seek(start-1)
178 str=f.read(stop-start+1)
179 else:
180 f.seek(start)
181 str=f.read(stop-start)
182
183 if strand=='-':
184 return reverse_complement(str)
185 elif strand=='+':
186 return str
187 else:
188 print 'strand must be + or -'
189 raise KeyError
190
191 except:
192 print fname,strand,start,stop
193
194
195 class DataAccessWrapper:
196 """
197 The purpose of this class is to wrap the genomic/splice site score data
198 access.
199 """
200
201 def __init__(self,settings):
202 self.genomic_dir = settings['genome_dir']
203 self.acc_score_dir = settings['acceptor_scores_dir']
204 self.don_score_dir = settings['donor_scores__dir']
205 self.genomic_fmt = settings['genome_file_fmt']
206 self.sscore_fmt = settings['splice_score_file_fmt']
207
208 assert os.path.isdir(genomic_data_dir)
209 assert os.path.isdir(acc_score_dir)
210 assert os.path.isdir(don_score_dir)
211
212
213 def get_genomic_fragment_fn(self,id,strand):
214 if strand == '+':
215 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
216 elif strand == '-':
217 return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
218 else:
219 assert False
220
221
222 def get_splice_site_scores_fn(self,id,strand):
223 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
224 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
225 return acc_fn,don_fn
226
227
228
229 class SeqSpliceInfo():
230 """
231
232 """
233
234 def __init__(self,accessWrapper,id_list):
235 self.accessWrapper = accessWrapper
236
237 self.chromo_sizes = {}
238
239 print "Fetching fragment sizes..."
240 for chromo in id_list:
241 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
242 total_size = get_flat_file_size(filename)
243 self.chromo_sizes[chromo] = total_size
244
245 print "done"
246
247
248 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
249 """
250 Now we want to use interval_query to get the predicted splice scores trained
251 on the TAIR sequence and annotation.
252 """
253
254 size = intervalEnd-intervalBegin
255 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
256
257 acc = size*[0.0]
258 don = size*[0.0]
259
260 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
261 pos_size = new_intp()
262 intp_assign(pos_size,1)
263
264 total_size = self.chromo_sizes[chromo]
265
266 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
267
268 # fetch acceptor scores
269 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
270
271 # fetch donor scores
272 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
273
274 return acc, don
275
276
277 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
278 #chromo_string = 'chr%d' % chromo
279 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
280 filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
281 #print filename,genomicSeq_start,genomicSeq_stop
282 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
283 genomicSeq = genomicSeq.strip().lower()
284
285 # check the obtained dna sequence
286 assert genomicSeq != '', 'load_genomic returned empty sequence!'
287
288 # all entries other than a c g t are set to n
289 no_base = re.compile('[^acgt]')
290 genomicSeq = no_base.sub('n',genomicSeq)
291
292 return genomicSeq
293
294
295 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
296 """
297 This function expects an interval, chromosome and strand information and
298 returns then the genomic sequence of this interval and the associated scores.
299 """
300 # Because splice site scores are predicted using a window of the sequence the
301 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
302 # do not have any score predictions
303 NO_SCORE_WINDOW_SIZE = 205
304
305 total_size = self.chromo_sizes[chromo]
306
307 #print genomicSeq_start,genomicSeq_stop
308
309 assert genomicSeq_start < genomicSeq_stop
310
311 if strand == '+':
312 s_start = genomicSeq_start - 1
313 s_stop = genomicSeq_stop - 1
314
315 genomicSeq_start -= 1
316 genomicSeq_stop -= 1
317
318 if strand == '-':
319 s_start = total_size - genomicSeq_stop + 1
320 s_stop = total_size - genomicSeq_start + 1
321
322 assert genomicSeq_start < genomicSeq_stop
323
324 genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
325
326 if only_seq:
327 return genomicSeq
328
329 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
330 lookup_offset_begin = 10
331 lookup_offset_end = 10
332 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
333 if intervalBegin == 0:
334 lookup_offset_begin = 0
335
336 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
337 if intervalEnd == total_size-1:
338 lookup_offset_end = 0
339
340 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
341
342 if strand == '-':
343 currentAcc = currentAcc[1:]
344 #currentDon = currentDon[1:]
345 #currentDon = [-inf]+currentDon[:-1]
346 else:
347 currentAcc = currentAcc[2:]
348 currentDon = currentDon[1:]
349
350 # remove the offset positions
351 currentAcc = currentAcc[lookup_offset_begin:]
352 currentDon = currentDon[lookup_offset_begin:]
353
354 currentAcc = currentAcc[:len(genomicSeq)]
355 currentDon = currentDon[:len(genomicSeq)]
356
357 length = len(genomicSeq)
358 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
359 currentDon = currentDon+[-inf]*(length-len(currentDon))
360 currentDon[-1] = -inf
361
362 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
363 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
364
365 for pos in ag_tuple_pos:
366 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
367 currentAcc[pos] = 1e-6
368
369 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
370 currentAcc[pos] = 1e-6
371
372 for pos in gt_tuple_pos:
373 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
374 currentDon[pos] = 1e-6
375
376 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
377 currentDon[pos] = 1e-6
378
379 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
380 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
381
382 if perform_checks and not ag_tuple_pos == acc_pos:
383 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
384 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
385 print acc_pos[:10],acc_pos[-10:]
386 pdb.set_trace()
387
388 if perform_checks and not gt_tuple_pos == don_pos:
389 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
390 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
391 print don_pos[:10],don_pos[-10:]
392 pdb.set_trace()
393
394 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
395
396 return genomicSeq, currentAcc, currentDon