+ fixed issue with negative strand in lookup table
[qpalma.git] / qpalma / Lookup.py
1 # This program is free software; you can redistribute it and/or modify
2 # it under the terms of the GNU General Public License as published by
3 # the Free Software Foundation; either version 2 of the License, or
4 # (at your option) any later version.
5 #
6 # Written (W) 2008 Fabio De Bona
7 # Copyright (C) 2008 Max-Planck-Society
8
9 import sys
10 import os
11 import pdb
12 from numpy import inf
13
14 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper,reverse_complement
15
16
17 class LookupTable:
18 """
19 Speed up genomic and splice site score queries by prefetching the sequences
20 and the scores once for all chromosomes.
21 """
22
23 def __init__(self,settings):
24 chromo_list = settings['allowed_fragments']
25 accessWrapper = DataAccessWrapper(settings)
26 self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
27
28 # take care that it also works say if we get a chromo_list [1,3,5]
29 # those are mapped then to 0,1,2
30 self.chromo_list = chromo_list
31 self.chromo_map = {}
32 for idx,elem in enumerate(chromo_list):
33 self.chromo_map[elem] = idx
34
35 print 'Chromo mapping is %s' % str(self.chromo_map)
36
37 self.genomicSequences = {}
38 self.acceptorScores = {}
39 self.donorScores = {}
40
41 self.genomicSequences = [0]*(len(self.chromo_list))
42 self.acceptorScoresPos = [0]*(len(self.chromo_list))
43 self.donorScoresPos = [0]*(len(self.chromo_list))
44 self.acceptorScoresNeg = [0]*(len(self.chromo_list))
45 self.donorScoresNeg = [0]*(len(self.chromo_list))
46
47 for chrId in self.chromo_list:
48 self.prefetch_seq_and_scores(chrId)
49
50
51 def get_seq_and_scores(self,chromo,strand,start,stop):
52 assert chromo in self.chromo_list
53 assert start <= stop
54
55 chromo_idx = self.chromo_map[chromo]
56
57
58 if strand == '+':
59 currentSequence = self.genomicSequences[chromo_idx][start:stop]
60 currentAcceptorScores = self.acceptorScoresPos[chromo_idx][start:stop]
61 currentDonorScores = self.donorScoresPos[chromo_idx][start:stop]
62 elif strand == '-':
63 currentSequence = self.genomicSequences[chromo_idx][start:stop]
64 currentSequence = reverse_complement(currentSequence)
65
66 total_size = self.seqInfo.getFragmentSize(chromo)
67 _start = total_size - stop
68 _stop = total_size - start
69 currentAcceptorScores = self.acceptorScoresNeg[chromo_idx][_start:_stop]
70 currentDonorScores = self.donorScoresNeg[chromo_idx][_start:_stop]
71
72 #gt_tuple_pos = [p for p,e in enumerate(currentSequence) if (p>0 and p<len(currentSequence)-1 and e=='g' ) and (currentSequence[p+1]=='t' or currentSequence[p+1]=='c')]
73 #ag_tuple_pos = [p for p,e in enumerate(currentSequence) if p>1 and currentSequence[p-1]=='a' and currentSequence[p]=='g']
74 #acc_pos = [p for p,e in enumerate(currentAcceptorScores) if e != -inf and p > 1]
75 #don_pos = [p for p,e in enumerate(currentDonorScores) if e != -inf and p > 0]
76
77 return currentSequence, currentAcceptorScores, currentDonorScores
78
79
80 def prefetch_seq_and_scores(self,chromo):
81 """
82 This function expects an interval, chromosome and strand information and
83 returns then the genomic sequence of this interval and the associated scores.
84 """
85 assert chromo in self.chromo_list
86
87 genomicSeq_start = 0
88 genomicSeq_stop = self.seqInfo.getFragmentSize(chromo)
89
90 print 'lt total_size %d' % self.seqInfo.getFragmentSize(chromo)
91
92 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop)
93 genomicSeq = genomicSeq.lower()
94
95 chromo_idx = self.chromo_map[chromo]
96
97 self.genomicSequences[chromo_idx] = genomicSeq
98 self.acceptorScoresPos[chromo_idx] = currentAcc
99 self.donorScoresPos[chromo_idx] = currentDon
100
101 dummy,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop)
102
103 #genomicSeq = reverse_complement(self.genomicSequences[chromo_idx])
104 #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
105 #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
106 #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
107 #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
108 #pdb.set_trace()
109
110 self.acceptorScoresNeg[chromo_idx] = currentAcc
111 self.donorScoresNeg[chromo_idx] = currentDon