f22ecd75db48a03f331b1e7f43a49d25d15a6e90
[qpalma.git] / qpalma / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6
7 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
8
9
10 class LookupTable:
11 """
12 Speed up genomic and splice site score queries by prefetching the sequences
13 and the scores once for all chromosomes.
14 """
15
16 def __init__(self,settings):
17 chromo_list = settings['allowed_fragments']
18 accessWrapper = DataAccessWrapper(settings)
19 self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
20
21 self.strands = ['+','-']
22
23 # take care that it also works say if we get a chromo_list [1,3,5]
24 # those are mapped then to 0,1,2
25 self.chromo_list = chromo_list
26 self.chromo_map = {}
27 for idx,elem in enumerate(chromo_list):
28 self.chromo_map[elem] = idx
29
30 print 'Chromo mapping is %s' % str(self.chromo_map)
31
32 self.genomicSequences = {}
33 self.acceptorScores = {}
34 self.donorScores = {}
35
36 for strandId in self.strands:
37 self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
38 self.acceptorScores[strandId] = [0]*(len(self.chromo_list))
39 self.donorScores[strandId] = [0]*(len(self.chromo_list))
40
41 for chrId in self.chromo_list:
42 for strandId in self.strands:
43 print (chrId,strandId)
44 self.prefetch_seq_and_scores(chrId,strandId)
45
46
47 def get_seq_and_scores(self,chromo,strand,start,stop):
48 assert chromo in self.chromo_list
49 assert strand in self.strands
50
51 assert start <= stop
52
53 chromo_idx = self.chromo_map[chromo]
54
55 start -= 1
56 stop -= 1
57
58 currentSequence = self.genomicSequences[strand][chromo_idx][start:stop]
59
60 if strand == '-':
61 start += 2
62 stop += 2
63
64 currentAcceptorScores = self.acceptorScores[strand][chromo_idx][start:stop]
65 currentDonorScores = self.donorScores[strand][chromo_idx][start:stop]
66
67 if strand == '-':
68 currentSequence = currentSequence[::-1]
69 currentAcceptorScores.reverse()
70 currentDonorScores.reverse()
71
72 return currentSequence, currentAcceptorScores, currentDonorScores
73
74
75 def prefetch_seq_and_scores(self,chromo,strand):
76 """
77 This function expects an interval, chromosome and strand information and
78 returns then the genomic sequence of this interval and the associated scores.
79 """
80 assert chromo in self.chromo_list
81 assert strand in self.strands
82
83 chromo_idx = self.chromo_map[chromo]
84
85 genomicSeq_start = 1
86 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]-1
87
88 print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
89
90 if strand == '-':
91 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]+1
92
93 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
94 genomicSeq = genomicSeq.lower()
95
96 if strand == '-':
97 genomicSeq = genomicSeq[::-1]
98 currentAcc.reverse()
99 currentDon.reverse()
100
101 self.genomicSequences[strand][chromo_idx] = genomicSeq
102 self.acceptorScores[strand][chromo_idx] = currentAcc
103 self.donorScores[strand][chromo_idx] = currentDon