--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
+
+
+class LookupTable:
+ """
+ Speed up genomic and splice site score queries by prefetching the sequences
+ and the scores once for all chromosomes.
+ """
+
+ def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
+ accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
+ self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
+
+ self.strands = ['+','-']
+
+ # take care that it also works say if we get a chromo_list [1,3,5]
+ # those are mapped then to 0,1,2
+ self.chromo_list = chromo_list
+ self.chromo_map = {}
+ for idx,elem in enumerate(chromo_list):
+ self.chromo_map[elem] = idx
+
+ print 'Chromo mapping is %s' % str(self.chromo_map)
+
+ self.genomicSequences = {}
+ self.acceptorScores = {}
+ self.donorScores = {}
+
+ for strandId in self.strands:
+ self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
+ self.acceptorScores[strandId] = [0]*(len(self.chromo_list))
+ self.donorScores[strandId] = [0]*(len(self.chromo_list))
+
+ for chrId in self.chromo_list:
+ for strandId in self.strands:
+ print (chrId,strandId)
+ self.prefetch_seq_and_scores(chrId,strandId)
+
+
+ def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+ assert chromo in self.chromo_list
+ assert strand in self.strands
+
+ assert start <= stop
+
+ chromo_idx = self.chromo_map[chromo]
+
+ start -= 1
+ stop -= 1
+
+ currentSequence = self.genomicSequences[strand][chromo_idx][start:stop]
+
+ if strand == '-':
+ start += 2
+ stop += 2
+
+ currentAcceptorScores = self.acceptorScores[strand][chromo_idx][start:stop]
+ currentDonorScores = self.donorScores[strand][chromo_idx][start:stop]
+
+ if strand == '-':
+ currentSequence = currentSequence[::-1]
+ currentAcceptorScores.reverse()
+ currentDonorScores.reverse()
+
+ return currentSequence, currentAcceptorScores, currentDonorScores
+
+
+ def prefetch_seq_and_scores(self,chromo,strand):
+ """
+ This function expects an interval, chromosome and strand information and
+ returns then the genomic sequence of this interval and the associated scores.
+ """
+ assert chromo in self.chromo_list
+ assert strand in self.strands
+
+ chromo_idx = self.chromo_map[chromo]
+
+ genomicSeq_start = 1
+ genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]-1
+
+ print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
+
+ if strand == '-':
+ genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]+1
+
+ genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
+ genomicSeq = genomicSeq.lower()
+
+ if strand == '-':
+ genomicSeq = genomicSeq[::-1]
+ currentAcc.reverse()
+ currentDon.reverse()
+
+ self.genomicSequences[strand][chromo_idx] = genomicSeq
+ self.acceptorScores[strand][chromo_idx] = currentAcc
+ self.donorScores[strand][chromo_idx] = currentDon
#chromo_string = 'chr%d' % chromo
#genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
- print filename,genomicSeq_start,genomicSeq_stop
+ #print filename,genomicSeq_start,genomicSeq_stop
genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
genomicSeq = genomicSeq.strip().lower()
return genomicSeq
- def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
+ def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
"""
This function expects an interval, chromosome and strand information and
returns then the genomic sequence of this interval and the associated scores.
gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
- #pdb.set_trace()
- #return genomicSeq, currentAcc, currentDon
-
for pos in ag_tuple_pos:
if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
currentAcc[pos] = 1e-6
acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
- print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
- print ag_tuple_pos[:10],ag_tuple_pos[-10:]
- print acc_pos[:10],acc_pos[-10:]
- if not ag_tuple_pos == acc_pos:
+ if perform_checks and not ag_tuple_pos == acc_pos:
print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
print ag_tuple_pos[:10],ag_tuple_pos[-10:]
print acc_pos[:10],acc_pos[-10:]
pdb.set_trace()
- print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
- print gt_tuple_pos[:10],gt_tuple_pos[-10:]
- print don_pos[:10],don_pos[-10:]
- if not gt_tuple_pos == don_pos:
+ if perform_checks and not gt_tuple_pos == don_pos:
print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
print gt_tuple_pos[:10],gt_tuple_pos[-10:]
print don_pos[:10],don_pos[-10:]
pdb.set_trace()
- #assert ag_tuple_pos == acc_pos, pdb.set_trace()
- #assert gt_tuple_pos == don_pos, pdb.set_trace()
assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
return genomicSeq, currentAcc, currentDon
# forgot to do this
if strand == '-':
- pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
+ #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
unb_seq = reverse_complement(unb_seq)
effective_len = len(unb_seq)
genomicSeq_start = pos
- genomicSeq_stop = pos+effective_len-1
+ #genomicSeq_stop = pos+effective_len-1
+ genomicSeq_stop = pos+effective_len
start = cpu()
#currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])