da9365cfd2c67c47df95451c948e778f19fb3b74
[qpalma.git] / qpalma / Lookup.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import sys
10 import os
11
12 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
13
14
15 class LookupTable:
16 """
17 Speed up genomic and splice site score queries by prefetching the sequences
18 and the scores once for all chromosomes.
19 """
20
21 def __init__(self,settings):
22 chromo_list = settings['allowed_fragments']
23 accessWrapper = DataAccessWrapper(settings)
24 self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
25
26 self.strands = ['+','-']
27
28 # take care that it also works say if we get a chromo_list [1,3,5]
29 # those are mapped then to 0,1,2
30 self.chromo_list = chromo_list
31 self.chromo_map = {}
32 for idx,elem in enumerate(chromo_list):
33 self.chromo_map[elem] = idx
34
35 print 'Chromo mapping is %s' % str(self.chromo_map)
36
37 self.genomicSequences = {}
38 self.acceptorScores = {}
39 self.donorScores = {}
40
41 for strandId in self.strands:
42 self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
43 self.acceptorScores[strandId] = [0]*(len(self.chromo_list))
44 self.donorScores[strandId] = [0]*(len(self.chromo_list))
45
46 for chrId in self.chromo_list:
47 for strandId in self.strands:
48 print (chrId,strandId)
49 self.prefetch_seq_and_scores(chrId,strandId)
50
51
52 def get_seq_and_scores(self,chromo,strand,start,stop):
53 assert chromo in self.chromo_list
54 assert strand in self.strands
55
56 assert start <= stop
57
58 chromo_idx = self.chromo_map[chromo]
59
60 start -= 1
61 stop -= 1
62
63 currentSequence = self.genomicSequences[strand][chromo_idx][start:stop]
64
65 if strand == '-':
66 start += 2
67 stop += 2
68
69 currentAcceptorScores = self.acceptorScores[strand][chromo_idx][start:stop]
70 currentDonorScores = self.donorScores[strand][chromo_idx][start:stop]
71
72 if strand == '-':
73 currentSequence = currentSequence[::-1]
74 currentAcceptorScores.reverse()
75 currentDonorScores.reverse()
76
77 return currentSequence, currentAcceptorScores, currentDonorScores
78
79
80 def prefetch_seq_and_scores(self,chromo,strand):
81 """
82 This function expects an interval, chromosome and strand information and
83 returns then the genomic sequence of this interval and the associated scores.
84 """
85 assert chromo in self.chromo_list
86 assert strand in self.strands
87
88 chromo_idx = self.chromo_map[chromo]
89
90 genomicSeq_start = 1
91 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]-1
92
93 print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
94
95 if strand == '-':
96 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]+1
97
98 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
99 genomicSeq = genomicSeq.lower()
100
101 if strand == '-':
102 genomicSeq = genomicSeq[::-1]
103 currentAcc.reverse()
104 currentDon.reverse()
105
106 self.genomicSequences[strand][chromo_idx] = genomicSeq
107 self.acceptorScores[strand][chromo_idx] = currentAcc
108 self.donorScores[strand][chromo_idx] = currentDon