9f7ee6ccb30774ad800b5cc3cea9c42a8b40c6db
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import doIntervalQuery,new_intp,intp_assign
29
30 from QPalmaDP import createIntArrayFromList
31
32
33 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
34 """
35 This function stems from Cheng Soon Ongs genome_utils package.
36 """
37
38 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
39 assert(len(start) == len(stop))
40 assert((start[1:]-stop[:-1]>0).all())
41 if strand == '+':
42 idx = xrange(len(start))
43 else:
44 idx = xrange(len(start)-1,-1,-1)
45
46 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
47 for ix in idx])
48 return seq
49
50 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
51 fname = genome + '/' + chromosome + '.dna.flat'
52 f=file(fname)
53 if one_based:
54 f.seek(start-1)
55 str=f.read(stop-start+1)
56 else:
57 f.seek(start)
58 str=f.read(stop-start)
59
60 if strand=='-':
61 return reverse_complement(str)
62 elif strand=='+':
63 return str
64 else:
65 print 'strand must be + or -'
66 raise KeyError
67
68
69 extended_alphabet = ['-','a','c','g','t','n','[',']']
70 alphabet = ['-','a','c','g','t','n']
71
72
73 class DataAccessionException:
74 pass
75
76
77 def get_flat_file_size(filename):
78 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
79 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
80 out,err = obj.communicate()
81
82 if err != '':
83 print 'Error occurred while trying to obtain file size'
84 raise DataAccessionException
85
86 # subtract one because of wc -c output
87 return int(out)-1
88
89
90 def reverse_complement(seq):
91 """
92 This function takes a read in plain or bracket notation and returns the
93 reverse complement of it.
94 I.e.
95
96 est = cgt[ac][tg]a
97
98 leads to
99
100 rev_comp = t[ac][tg]acg
101 """
102
103 bpos = seq.find('[')
104 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
105
106 # check first whether seq contains no brackets at all
107 if bpos == -1:
108 ret_val = map(rc,seq)
109 ret_val.reverse()
110 ret_val = "".join(ret_val)
111 else:
112 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
113
114 # first_part is the part of the seq up to the first occurrence of a
115 # bracket
116 first_part = seq[:bpos]
117 first_part = map(rc,first_part)
118 first_part.reverse()
119 first_part = "".join(first_part)
120
121 # inside brackets has to be complemented but NOT reversed
122 inside_brackets = seq[bpos+1:bpos+3]
123 inside_brackets = "".join(map(rc,inside_brackets))
124
125 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
126
127 return ret_val
128
129
130 def unbracket_seq(seq):
131 """
132 This function takes a read in bracket notation and restores the read sequence from it.
133 I.e.
134
135 seq = cgt[ac][tg]aa
136
137 leads to
138
139 result = cgtcgaa
140
141 so the second entry within brackets is the base on the read whereas the first
142 entry is the base from the dna.
143 """
144
145 new_seq = ''
146 e = 0
147
148 while True:
149 if e >= len(seq):
150 break
151
152 if seq[e] == '[':
153 new_seq += seq[e+2]
154 e += 4
155 else:
156 new_seq += seq[e]
157 e += 1
158
159 return "".join(new_seq).lower()
160
161
162 def reconstruct_dna_seq(seq):
163 """
164
165 """
166
167 new_seq = ''
168 e = 0
169
170 while True:
171 if e >= len(seq):
172 break
173
174 if seq[e] == '[':
175 new_seq += seq[e+1]
176 e += 4
177 else:
178 new_seq += seq[e]
179 e += 1
180
181 return "".join(new_seq).lower()
182
183
184
185 def create_bracket_seq(dna_seq,read_seq):
186 """
187 This function takes a dna sequence and a read sequence and returns the
188 bracket format of the match/mismatches i.e.
189
190 dna : aaa
191 read: aac
192 is written in bracket notation: aa[ac]
193 """
194 assert len(dna_seq) == len(read_seq),pdb.set_trace()
195 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
196
197
198
199 def my_load_genomic(fname, strand, start, stop, one_based=True):
200
201 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
202 assert(len(start) == len(stop))
203 assert((start[1:]-stop[:-1]>0).all())
204 if strand == '+':
205 idx = xrange(len(start))
206 else:
207 idx = xrange(len(start)-1,-1,-1)
208
209 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
210 for ix in idx])
211 return seq
212
213 try:
214 f=file(fname)
215 if one_based:
216 f.seek(start-1)
217 str=f.read(stop-start+1)
218 else:
219 f.seek(start)
220 str=f.read(stop-start)
221
222 if strand=='-':
223 return reverse_complement(str)
224 elif strand=='+':
225 return str
226 else:
227 print 'strand must be + or -'
228 raise KeyError
229
230 except:
231 print fname,strand,start,stop
232
233
234 class DataAccessWrapper:
235 """
236 The purpose of this class is to wrap the genomic/splice site score data
237 access.
238 """
239
240 def __init__(self,settings):
241 self.genomic_dir = settings['genome_dir']
242 self.acc_score_dir = settings['acceptor_scores_loc']
243 self.don_score_dir = settings['donor_scores_loc']
244 self.genomic_fmt = settings['genome_file_fmt']
245 self.sscore_fmt = settings['splice_score_file_fmt']
246
247
248 def get_genomic_fragment_fn(self,id):
249 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
250
251
252 def get_splice_site_scores_fn(self,id,strand):
253 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
254 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
255 return acc_fn,don_fn
256
257
258
259 class SeqSpliceInfo():
260 """
261
262 """
263
264 def __init__(self,accessWrapper,fragment_list):
265 self.accessWrapper = accessWrapper
266
267 self.chromo_sizes = {}
268
269 # take care that it also works say if we get a chromo_list [1,3,5]
270 # those are mapped then to 0,1,2
271 self.fragment_list = fragment_list
272 self.chromo_map = {}
273 for idx,elem in enumerate(fragment_list):
274 self.chromo_map[elem] = idx
275
276 print "Fetching fragment sizes..."
277 for id in fragment_list:
278 filename = self.accessWrapper.get_genomic_fragment_fn(id)
279 total_size = get_flat_file_size(filename)
280 self.chromo_sizes[self.chromo_map[id]] = total_size
281
282 print "done"
283
284
285 def getFragmentSize(self,id):
286 return self.chromo_sizes[self.chromo_map[id]]
287
288
289 def getSpliceScores(self,id,strand,intervalBegin,intervalEnd):
290 """
291 Now we want to use interval_query to get the predicted splice scores trained
292 on the TAIR sequence and annotation.
293 """
294
295 size = intervalEnd-intervalBegin
296 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
297
298 acc = size*[0.0]
299 don = size*[0.0]
300
301 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
302 pos_size = new_intp()
303 intp_assign(pos_size,1)
304
305 total_size = self.getFragmentSize(id)
306
307 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(id,strand)
308
309 # fetch acceptor scores
310 acc = doIntervalQuery(id,strand,intervalBegin,intervalEnd,acc_fn,total_size)
311
312 # fetch donor scores
313 don = doIntervalQuery(id,strand,intervalBegin,intervalEnd,don_fn,total_size)
314
315 return acc, don
316
317
318 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
319 filename = self.accessWrapper.get_genomic_fragment_fn(chromo)
320 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
321 genomicSeq = genomicSeq.strip().lower()
322
323 # check the obtained dna sequence
324 assert genomicSeq != '', 'load_genomic returned empty sequence!'
325
326 # all entries other than a c g t are set to n
327 no_base = re.compile('[^acgt]')
328 genomicSeq = no_base.sub('n',genomicSeq)
329
330 return genomicSeq
331
332
333 def get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
334 """
335 This function expects an interval, chromosome and strand information and
336 returns then the genomic sequence of this interval and the associated scores.
337 """
338 # Because splice site scores are predicted using a window of the sequence the
339 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
340 # do not have any score predictions
341 NO_SCORE_WINDOW_SIZE = 205
342
343 chromo = self.chromo_map[id]
344
345 total_size = self.chromo_sizes[chromo]
346
347 if strand == '-':
348 s_start = total_size - genomicSeq_stop
349 s_stop = total_size - genomicSeq_start
350 genomicSeq_start = s_start
351 genomicSeq_stop = s_stop
352
353 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
354 assert genomicSeq_start >= 0
355 assert genomicSeq_stop <= total_size
356
357 genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
358
359 if strand == '-':
360 genomicSeq = reverse_complement(genomicSeq)
361
362 if only_seq:
363 return genomicSeq
364
365 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
366 lookup_offset_begin = 10
367 lookup_offset_end = 10
368
369 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
370 if intervalBegin == 0:
371 lookup_offset_begin = 0
372 intervalBegin = genomicSeq_start
373
374 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
375 if intervalEnd == total_size-1:
376 lookup_offset_end = 0
377 intervalEnd = genomicSeq_stop
378
379 currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd)
380
381 # remove the offset positions
382 if strand == '+':
383 currentAcc = currentAcc[lookup_offset_begin+2:]
384 currentDon = currentDon[lookup_offset_begin+1:]
385 elif strand == '-':
386 currentAcc = currentAcc[lookup_offset_begin:]
387 currentDon = [-inf]+currentDon
388 currentDon = currentDon[(lookup_offset_begin):]
389 else:
390 assert False
391
392 currentAcc = currentAcc[:len(genomicSeq)]
393 currentDon = currentDon[:len(genomicSeq)]
394
395 length = len(genomicSeq)
396 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
397 currentDon = currentDon+[-inf]*(length-len(currentDon))
398 currentDon[-1] = -inf
399
400 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
401
402 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
403 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
404
405 for pos in ag_tuple_pos:
406 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
407 currentAcc[pos] = 1e-6
408
409 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
410 currentAcc[pos] = 1e-6
411
412 for pos in gt_tuple_pos:
413 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
414 currentDon[pos] = 1e-6
415
416 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
417 currentDon[pos] = 1e-6
418
419 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
420 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
421
422 check_window_size = 30
423
424 if perform_checks and not ag_tuple_pos == acc_pos:
425 print 'ACC: Chromo/strand: %d/%s' % (id,strand)
426 print ag_tuple_pos[:check_window_size]
427 print acc_pos[:check_window_size]
428 print
429 print ag_tuple_pos[-check_window_size:]
430 print acc_pos[-check_window_size:]
431 pdb.set_trace()
432
433 if perform_checks and not gt_tuple_pos == don_pos:
434 print 'DON: Chromo/strand: %d/%s' % (id,strand)
435 print gt_tuple_pos[:check_window_size]
436 print don_pos[:check_window_size]
437 print
438 print gt_tuple_pos[-check_window_size:]
439 print don_pos[-check_window_size:]
440 pdb.set_trace()
441
442
443 return genomicSeq, currentAcc, currentDon
444
445
446
447
448
449
450 ##################################
451 ##################################
452 ##################################
453 ##################################
454
455
456 def _get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
457 """
458 This function expects an interval, chromosome and strand information and
459 returns then the genomic sequence of this interval and the associated scores.
460 """
461
462 chromo = self.chromo_map[id]
463
464 total_size = self.chromo_sizes[chromo]
465
466 if strand == '-':
467 s_start = total_size - genomicSeq_stop
468 s_stop = total_size - genomicSeq_start
469 genomicSeq_start = s_start
470 genomicSeq_stop = s_stop
471
472 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
473
474 genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
475
476 if strand == '-':
477 genomicSeq = reverse_complement(genomicSeq)
478
479 if only_seq:
480 return genomicSeq
481
482 currentAcc,currentDon = self.get_scores(id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks)
483
484 return genomicSeq, currentAcc, currentDon
485
486
487 def get_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks):
488 # Because splice site scores are predicted using a window of the sequence the
489 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
490 # do not have any score predictions
491 NO_SCORE_WINDOW_SIZE = 205
492
493 seq_size = len(genomicSeq)
494 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
495 lookup_offset_begin = 10
496 lookup_offset_end = 10
497
498 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
499 if intervalBegin == 0:
500 lookup_offset_begin = 0
501
502 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
503 if intervalEnd == total_size:
504 lookup_offset_end = 0
505
506 print 'interval is: %d %d' % (intervalBegin,intervalEnd)
507
508 # splice score indices are 1-based
509 currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin+1,intervalEnd+1)
510
511 print 'original sizes: %d %d' % (len(currentAcc),len(currentDon))
512
513 # remove the offset positions
514 currentAcc = currentAcc[(lookup_offset_begin+1):(lookup_offset_begin+1+seq_size)]
515 currentDon = currentDon[lookup_offset_begin:lookup_offset_begin+seq_size]
516
517 print 'all sizes'
518 print len(genomicSeq)
519 print len(currentAcc)
520 print len(currentDon)
521
522 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
523 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and e=='g' and genomicSeq[p-1]=='a' ]
524
525 for pos in ag_tuple_pos:
526 if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
527 currentAcc[pos] = 1e-6
528
529 if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
530 currentAcc[pos] = 1e-6
531
532 for pos in gt_tuple_pos:
533 if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
534 currentDon[pos] = 1e-6
535
536 if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
537 currentDon[pos] = 1e-6
538
539 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
540 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
541
542 check_window_size = 30
543
544 if perform_checks and not ag_tuple_pos == acc_pos:
545 print 'ACC: Chromo/strand: %d/%s' % (id,strand)
546 print ag_tuple_pos[:check_window_size]
547 print acc_pos[:check_window_size]
548 print
549 print ag_tuple_pos[-check_window_size:]
550 print acc_pos[-check_window_size:]
551 pdb.set_trace()
552
553 if perform_checks and not gt_tuple_pos == don_pos:
554 print 'DON: Chromo/strand: %d/%s' % (id,strand)
555 print gt_tuple_pos[:check_window_size]
556 print don_pos[:check_window_size]
557 print ''
558 print gt_tuple_pos[-check_window_size:]
559 print don_pos[-check_window_size:]
560 pdb.set_trace()
561
562 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
563
564 return currentAcc,currentDon