+ minor modifications for the prefetching the whole negative strand
[qpalma.git] / qpalma / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6
7 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
8
9
10 class LookupTable:
11 """
12 Speed up genomic and splice site score queries by prefetching the sequences
13 and the scores once for all chromosomes.
14 """
15
16 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
17 accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
18 self.seqInfo = SeqSpliceInfo(accessWrapper,chromo_list)
19
20 self.strands = ['+','-']
21
22 # take care that it also works say if we get a chromo_list [1,3,5]
23 # those are mapped then to 0,1,2
24 self.chromo_list = chromo_list
25 self.chromo_map = {}
26 for idx,elem in enumerate(chromo_list):
27 self.chromo_map[elem] = idx
28
29 print 'Chromo mapping is %s' % str(self.chromo_map)
30
31 self.genomicSequences = {}
32 self.acceptorScores = {}
33 self.donorScores = {}
34
35 for strandId in self.strands:
36 self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
37 self.acceptorScores[strandId] = [0]*(len(self.chromo_list))
38 self.donorScores[strandId] = [0]*(len(self.chromo_list))
39
40 for chrId in self.chromo_list:
41 for strandId in self.strands:
42 print (chrId,strandId)
43 self.prefetch_seq_and_scores(chrId,strandId)
44
45
46 def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
47 assert chromo in self.chromo_list
48 assert strand in self.strands
49
50 assert start <= stop
51
52 chromo_idx = self.chromo_map[chromo]
53
54 start -= 1
55 stop -= 1
56
57 currentSequence = self.genomicSequences[strand][chromo_idx][start:stop]
58
59 if strand == '-':
60 start += 2
61 stop += 2
62
63 currentAcceptorScores = self.acceptorScores[strand][chromo_idx][start:stop]
64 currentDonorScores = self.donorScores[strand][chromo_idx][start:stop]
65
66 if strand == '-':
67 currentSequence = currentSequence[::-1]
68 currentAcceptorScores.reverse()
69 currentDonorScores.reverse()
70
71 return currentSequence, currentAcceptorScores, currentDonorScores
72
73
74 def prefetch_seq_and_scores(self,chromo,strand):
75 """
76 This function expects an interval, chromosome and strand information and
77 returns then the genomic sequence of this interval and the associated scores.
78 """
79 assert chromo in self.chromo_list
80 assert strand in self.strands
81
82 chromo_idx = self.chromo_map[chromo]
83
84 genomicSeq_start = 1
85 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]-1
86
87 print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
88
89 if strand == '-':
90 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]+1
91
92 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
93 genomicSeq = genomicSeq.lower()
94
95 if strand == '-':
96 genomicSeq = genomicSeq[::-1]
97 currentAcc.reverse()
98 currentDon.reverse()
99
100 self.genomicSequences[strand][chromo_idx] = genomicSeq
101 self.acceptorScores[strand][chromo_idx] = currentAcc
102 self.donorScores[strand][chromo_idx] = currentDon