import random
from numpy.matlib import inf
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
-class Lookup:
+class LookupTable:
"""
Speed up genomic and splice site score queries by prefetching the sequences
and the scores once for all chromosomes.
"""
- def __init__(self,dna_flat_files):
- self.dna_flat_files = dna_flat_files
-
- self.chromosomes = range(1,6)
- self.strands = ['+','-']
-
- flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- chromo_list = [1,2,3,4,5,8,9,10,11,12]
+ def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
+ accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
- self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
- self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
- self.donorScoresP = [0]*(len(self.chromosomes)+1)
-
- self.genomicSequencesN = [0]*(len(self.chromosomes)+1)
- self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
- self.donorScoresN = [0]*(len(self.chromosomes)+1)
+ self.strands = ['+','-']
+
+ # take care that it also works say if we get a chromo_list [1,3,5]
+ # those are mapped then to 0,1,2
+ self.chromo_list = chromo_list
+ self.chromo_map = []
+ for elem in chromo_list:
+ self.chromo_map.append(elem)
+
+ self.genomicSequencesP = [0]*(len(self.chromo_map)+1)
+ self.acceptorScoresP = [0]*(len(self.chromo_map)+1)
+ self.donorScoresP = [0]*(len(self.chromo_map)+1)
+
+ self.genomicSequencesN = [0]*(len(self.chromo_map)+1)
+ self.acceptorScoresN = [0]*(len(self.chromo_map)+1)
+ self.donorScoresN = [0]*(len(self.chromo_map)+1)
- #for chrId in self.chromosomes:
- #for chrId in range(1,6):
- for chrId in range(1,2):
+ for chrId in self.chromo_list:
for strandId in self.strands:
print (chrId,strandId)
self.prefetch_seq_and_scores(chrId,strandId)
def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
- assert chromo in self.chromosomes
+ assert chromo in self.chromo_list
assert strand in self.strands
- currentSequence = self.genomicSequencesP[chromo][start:stop]
- currentDonorScores = self.donorScoresP[chromo][start:stop]
- currentAcceptorScores = self.acceptorScoresP[chromo][start:stop]
+ chromo_idx = self.chromo_map[chromo]
+
+ if strand == '+':
+ currentSequence = self.genomicSequencesP[chromo_idx][start:stop]
+ currentDonorScores = self.donorScoresP[chromo_idx][start:stop]
+ currentAcceptorScores = self.acceptorScoresP[chromo_idx][start:stop]
+ elif strand == '-':
+ currentSequence = self.genomicSequencesN[chromo_idx][start:stop]
+ currentDonorScores = self.donorScoresN[chromo_idx][start:stop]
+ currentAcceptorScores = self.acceptorScoresN[chromo_idx][start:stop]
+ else:
+ assert False
return currentSequence, currentAcceptorScores, currentDonorScores
returns then the genomic sequence of this interval and the associated scores.
"""
- genomicSeq_start = 0
+ chromo_idx = self.chromo_map[chromo]
- if strand == '-':
- genomicSeq_stop = self.seqInfo.chromo_sizes[chromo+7]
- else:
- genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]
+ genomicSeq_start = 0
genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
genomicSeq = genomicSeq.lower()
if strand == '+':
- self.genomicSequencesP[chromo] = genomicSeq
- self.acceptorScoresP[chromo] = currentAcc
- self.donorScoresP[chromo] = currentDon
+ self.genomicSequencesP[chromo_idx] = genomicSeq
+ self.acceptorScoresP[chromo_idx] = currentAcc
+ self.donorScoresP[chromo_idx] = currentDon
elif strand == '-':
- self.genomicSequencesN[chromo] = genomicSeq
- self.acceptorScoresN[chromo] = currentAcc
- self.donorScoresN[chromo] = currentDon
+ self.genomicSequencesN[chromo_idx] = genomicSeq
+ self.acceptorScoresN[chromo_idx] = currentAcc
+ self.donorScoresN[chromo_idx] = currentDon
else:
assert False
-
-
-if __name__ == '__main__':
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- lt1 = Lookup(dna_flat_files)
-
- #self.lt1.seqInfo.get_seq_and_scores