+ changed check for intial setting of old_w. (matrix != None does not work in cvxopt)
[qpalma.git] / qpalma / sequence_utils.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 #
10 # This file contains the main interface to the QPalma pipeline.
11 #
12 #
13 # This module holds all functions for queries on the dna flat files and the
14 # splice score files.
15 # Furthermore it contains some utility functions such as reverse complement and
16 # unbracketing est strings
17 #
18
19 import os
20 import os.path
21 import pdb
22 import re
23 import subprocess
24 import numpy
25
26 from numpy.matlib import inf
27
28 from Genefinding import doIntervalQuery,new_intp,intp_assign
29
30 from QPalmaDP import createIntArrayFromList
31
32
33 def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
34 """
35 This function stems from Cheng Soon Ongs genome_utils package.
36 """
37
38 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
39 assert(len(start) == len(stop))
40 assert((start[1:]-stop[:-1]>0).all())
41 if strand == '+':
42 idx = xrange(len(start))
43 else:
44 idx = xrange(len(start)-1,-1,-1)
45
46 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
47 for ix in idx])
48 return seq
49
50 #fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome + '.dna.flat'
51 fname = genome + '/' + chromosome + '.dna.flat'
52 f=file(fname)
53 if one_based:
54 f.seek(start-1)
55 str=f.read(stop-start+1)
56 else:
57 f.seek(start)
58 str=f.read(stop-start)
59
60 if strand=='-':
61 return reverse_complement(str)
62 elif strand=='+':
63 return str
64 else:
65 print 'strand must be + or -'
66 raise KeyError
67
68
69 extended_alphabet = ['-','a','c','g','t','n','[',']']
70 alphabet = ['-','a','c','g','t','n']
71
72
73 class DataAccessionException:
74 pass
75
76
77 def get_flat_file_size(filename):
78 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
79 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
80 out,err = obj.communicate()
81
82 if err != '':
83 print 'Error occurred while trying to obtain file size'
84 raise DataAccessionException
85
86 # subtract one because of wc -c output
87 return int(out)-1
88
89
90 def reverse_complement(seq):
91 """
92 This function takes a read in plain or bracket notation and returns the
93 reverse complement of it.
94 I.e.
95
96 est = cgt[ac][tg]a
97
98 leads to
99
100 rev_comp = t[ac][tg]acg
101 """
102
103 bpos = seq.find('[')
104 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
105
106 # check first whether seq contains no brackets at all
107 if bpos == -1:
108 ret_val = map(rc,seq)
109 ret_val.reverse()
110 ret_val = "".join(ret_val)
111 else:
112 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
113
114 # first_part is the part of the seq up to the first occurrence of a
115 # bracket
116 first_part = seq[:bpos]
117 first_part = map(rc,first_part)
118 first_part.reverse()
119 first_part = "".join(first_part)
120
121 # inside brackets has to be complemented but NOT reversed
122 inside_brackets = seq[bpos+1:bpos+3]
123 inside_brackets = "".join(map(rc,inside_brackets))
124
125 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
126
127 return ret_val
128
129
130 def unbracket_seq(seq):
131 """
132 This function takes a read in bracket notation and restores the read sequence from it.
133 I.e.
134
135 seq = cgt[ac][tg]aa
136
137 leads to
138
139 result = cgtcgaa
140
141 so the second entry within brackets is the base on the read whereas the first
142 entry is the base from the dna.
143 """
144
145 new_seq = ''
146 e = 0
147
148 while True:
149 if e >= len(seq):
150 break
151
152 if seq[e] == '[':
153 new_seq += seq[e+2]
154 e += 4
155 else:
156 new_seq += seq[e]
157 e += 1
158
159 return "".join(new_seq).lower()
160
161
162 def reconstruct_dna_seq(seq):
163 """
164
165 """
166
167 new_seq = ''
168 e = 0
169
170 while True:
171 if e >= len(seq):
172 break
173
174 if seq[e] == '[':
175 new_seq += seq[e+1]
176 e += 4
177 else:
178 new_seq += seq[e]
179 e += 1
180
181 return "".join(new_seq).lower()
182
183
184
185 def create_bracket_seq(dna_seq,read_seq):
186 """
187 This function takes a dna sequence and a read sequence and returns the
188 bracket format of the match/mismatches i.e.
189
190 dna : aaa
191 read: aac
192 is written in bracket notation: aa[ac]
193 """
194 assert len(dna_seq) == len(read_seq),pdb.set_trace()
195 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
196
197
198
199 def my_load_genomic(fname, strand, start, stop, one_based=True):
200
201 if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
202 assert(len(start) == len(stop))
203 assert((start[1:]-stop[:-1]>0).all())
204 if strand == '+':
205 idx = xrange(len(start))
206 else:
207 idx = xrange(len(start)-1,-1,-1)
208
209 seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
210 for ix in idx])
211 return seq
212
213 try:
214 f=file(fname)
215 if one_based:
216 f.seek(start-1)
217 str=f.read(stop-start+1)
218 else:
219 f.seek(start)
220 str=f.read(stop-start)
221
222 if strand=='-':
223 return reverse_complement(str)
224 elif strand=='+':
225 return str
226 else:
227 print 'strand must be + or -'
228 raise KeyError
229
230 except:
231 print fname,strand,start,stop
232
233
234 class DataAccessWrapper:
235 """
236 The purpose of this class is to wrap the genomic/splice site score data
237 access.
238 """
239
240 def __init__(self,settings):
241 self.genomic_dir = settings['genome_dir']
242 self.acc_score_dir = settings['acceptor_scores_loc']
243 self.don_score_dir = settings['donor_scores_loc']
244 self.genomic_fmt = settings['genome_file_fmt']
245 self.sscore_fmt = settings['splice_score_file_fmt']
246
247
248 def get_genomic_fragment_fn(self,id):
249 return os.path.join(self.genomic_dir,self.genomic_fmt%id)
250
251
252 def get_splice_site_scores_fn(self,id,strand):
253 acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
254 don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
255 return acc_fn,don_fn
256
257
258
259 class SeqSpliceInfo():
260 """
261
262 """
263
264 def __init__(self,accessWrapper,fragment_list):
265 self.accessWrapper = accessWrapper
266
267 self.chromo_sizes = {}
268
269 # take care that it also works say if we get a chromo_list [1,3,5]
270 # those are mapped then to 0,1,2
271 self.fragment_list = fragment_list
272 self.chromo_map = {}
273 for idx,elem in enumerate(fragment_list):
274 self.chromo_map[elem] = idx
275
276 print "Fetching fragment sizes..."
277 for id in fragment_list:
278 filename = self.accessWrapper.get_genomic_fragment_fn(id)
279 total_size = get_flat_file_size(filename)
280 self.chromo_sizes[self.chromo_map[id]] = total_size
281
282 print "done"
283
284
285 def getFragmentSize(self,id):
286 return self.chromo_sizes[self.chromo_map[id]]
287
288
289 def getSpliceScores(self,id,strand,intervalBegin,intervalEnd):
290 """
291 Now we want to use interval_query to get the predicted splice scores trained
292 on the TAIR sequence and annotation.
293 """
294
295 size = intervalEnd-intervalBegin
296 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
297
298 acc = size*[0.0]
299 don = size*[0.0]
300
301 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
302 pos_size = new_intp()
303 intp_assign(pos_size,1)
304
305 total_size = self.getFragmentSize(id)
306
307 acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(id,strand)
308
309 # fetch acceptor scores
310 acc = doIntervalQuery(id,strand,intervalBegin,intervalEnd,acc_fn,total_size)
311
312 # fetch donor scores
313 don = doIntervalQuery(id,strand,intervalBegin,intervalEnd,don_fn,total_size)
314
315 return acc, don
316
317
318 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
319 filename = self.accessWrapper.get_genomic_fragment_fn(chromo)
320 genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
321 genomicSeq = genomicSeq.strip().lower()
322
323 # check the obtained dna sequence
324 assert genomicSeq != '', 'load_genomic returned empty sequence!'
325
326 # all entries other than a c g t are set to n
327 no_base = re.compile('[^acgt]')
328 genomicSeq = no_base.sub('n',genomicSeq)
329
330 return genomicSeq
331
332
333 def get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
334 """
335 This function expects an interval, chromosome and strand information and
336 returns then the genomic sequence of this interval and the associated scores.
337 """
338 # Because splice site scores are predicted using a window of the sequence the
339 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
340 # do not have any score predictions
341 NO_SCORE_WINDOW_SIZE = 205
342
343 chromo = self.chromo_map[id]
344
345 total_size = self.chromo_sizes[chromo]
346
347 #print 'genomicSeq: ' + str(genomicSeq_start),str(genomicSeq_stop)
348
349 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
350 assert genomicSeq_start >= 0
351 assert genomicSeq_stop <= total_size
352
353 genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
354
355 if strand == '-':
356 genomicSeq = reverse_complement(genomicSeq)
357
358 if only_seq:
359 return genomicSeq
360
361 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
362 lookup_offset_begin = 10
363 lookup_offset_end = 10
364
365 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
366 if intervalBegin == 0:
367 lookup_offset_begin = 0
368 intervalBegin = genomicSeq_start
369
370 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
371 if intervalEnd == total_size-1:
372 lookup_offset_end = 0
373 intervalEnd = genomicSeq_stop
374
375 currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd)
376
377 # remove the offset positions
378 if strand == '+':
379 currentAcc = currentAcc[lookup_offset_begin+2:]
380 currentDon = currentDon[lookup_offset_begin+1:]
381 elif strand == '-':
382 currentAcc = currentAcc[lookup_offset_begin:]
383 currentDon = [-inf]+currentDon
384 currentDon = currentDon[(lookup_offset_begin):]
385 else:
386 assert False
387
388 currentAcc = currentAcc[:len(genomicSeq)]
389 currentDon = currentDon[:len(genomicSeq)]
390
391 length = len(genomicSeq)
392 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
393 currentDon = currentDon+[-inf]*(length-len(currentDon))
394 currentDon[-1] = -inf
395
396 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
397
398 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
399 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
400
401 for pos in ag_tuple_pos:
402 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
403 currentAcc[pos] = 1e-6
404
405 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
406 currentAcc[pos] = 1e-6
407
408 for pos in gt_tuple_pos:
409 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
410 currentDon[pos] = 1e-6
411
412 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
413 currentDon[pos] = 1e-6
414
415 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
416 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
417
418 check_window_size = 30
419
420 if perform_checks and not ag_tuple_pos == acc_pos:
421 print 'ACC: Chromo/strand: %d/%s' % (id,strand)
422 print ag_tuple_pos[:check_window_size]
423 print acc_pos[:check_window_size]
424 print
425 print ag_tuple_pos[-check_window_size:]
426 print acc_pos[-check_window_size:]
427 pdb.set_trace()
428
429 if perform_checks and not gt_tuple_pos == don_pos:
430 print 'DON: Chromo/strand: %d/%s' % (id,strand)
431 print gt_tuple_pos[:check_window_size]
432 print don_pos[:check_window_size]
433 print
434 print gt_tuple_pos[-check_window_size:]
435 print don_pos[-check_window_size:]
436 pdb.set_trace()
437
438
439 return genomicSeq, currentAcc, currentDon
440
441
442
443
444
445
446 ##################################
447 ##################################
448 ##################################
449 ##################################
450
451
452 def _get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
453 """
454 This function expects an interval, chromosome and strand information and
455 returns then the genomic sequence of this interval and the associated scores.
456 """
457
458 chromo = self.chromo_map[id]
459
460 total_size = self.chromo_sizes[chromo]
461
462 if strand == '-':
463 s_start = total_size - genomicSeq_stop
464 s_stop = total_size - genomicSeq_start
465 genomicSeq_start = s_start
466 genomicSeq_stop = s_stop
467
468
469 assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
470
471 genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
472
473 if strand == '-':
474 genomicSeq = reverse_complement(genomicSeq)
475
476 if only_seq:
477 return genomicSeq
478
479 currentAcc,currentDon = self.get_scores(id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks)
480
481 return genomicSeq, currentAcc, currentDon
482
483
484 def get_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks):
485 # Because splice site scores are predicted using a window of the sequence the
486 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
487 # do not have any score predictions
488 NO_SCORE_WINDOW_SIZE = 205
489
490 seq_size = len(genomicSeq)
491 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
492 lookup_offset_begin = 10
493 lookup_offset_end = 10
494
495 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
496 if intervalBegin == 0:
497 lookup_offset_begin = 0
498
499 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
500 if intervalEnd == total_size:
501 lookup_offset_end = 0
502
503 print 'interval is: %d %d' % (intervalBegin,intervalEnd)
504
505 # splice score indices are 1-based
506 currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin+1,intervalEnd+1)
507
508 print 'original sizes: %d %d' % (len(currentAcc),len(currentDon))
509
510 # remove the offset positions
511 currentAcc = currentAcc[(lookup_offset_begin+1):(lookup_offset_begin+1+seq_size)]
512 currentDon = currentDon[lookup_offset_begin:lookup_offset_begin+seq_size]
513
514 print 'all sizes'
515 print len(genomicSeq)
516 print len(currentAcc)
517 print len(currentDon)
518
519 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
520 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and e=='g' and genomicSeq[p-1]=='a' ]
521
522 for pos in ag_tuple_pos:
523 if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
524 currentAcc[pos] = 1e-6
525
526 if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
527 currentAcc[pos] = 1e-6
528
529 for pos in gt_tuple_pos:
530 if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
531 currentDon[pos] = 1e-6
532
533 if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
534 currentDon[pos] = 1e-6
535
536 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
537 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
538
539 check_window_size = 30
540
541 if perform_checks and not ag_tuple_pos == acc_pos:
542 print 'ACC: Chromo/strand: %d/%s' % (id,strand)
543 print ag_tuple_pos[:check_window_size]
544 print acc_pos[:check_window_size]
545 print
546 print ag_tuple_pos[-check_window_size:]
547 print acc_pos[-check_window_size:]
548 pdb.set_trace()
549
550 if perform_checks and not gt_tuple_pos == don_pos:
551 print 'DON: Chromo/strand: %d/%s' % (id,strand)
552 print gt_tuple_pos[:check_window_size]
553 print don_pos[:check_window_size]
554 print ''
555 print gt_tuple_pos[-check_window_size:]
556 print don_pos[-check_window_size:]
557 pdb.set_trace()
558
559 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
560
561 return currentAcc,currentDon