+ fixed bug in position calculation for negative strand
[qpalma.git] / scripts / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import random
10
11 from numpy.matlib import inf
12 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
13
14
15 class LookupTable:
16 """
17 Speed up genomic and splice site score queries by prefetching the sequences
18 and the scores once for all chromosomes.
19 """
20
21 def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
22 accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
23 self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
24
25 self.strands = ['+','-']
26
27 # take care that it also works say if we get a chromo_list [1,3,5]
28 # those are mapped then to 0,1,2
29 self.chromo_list = chromo_list
30 self.chromo_map = []
31 for elem in chromo_list:
32 self.chromo_map.append(elem)
33
34 self.genomicSequencesP = [0]*(len(self.chromo_map)+1)
35 self.acceptorScoresP = [0]*(len(self.chromo_map)+1)
36 self.donorScoresP = [0]*(len(self.chromo_map)+1)
37
38 self.genomicSequencesN = [0]*(len(self.chromo_map)+1)
39 self.acceptorScoresN = [0]*(len(self.chromo_map)+1)
40 self.donorScoresN = [0]*(len(self.chromo_map)+1)
41
42 for chrId in self.chromo_list:
43 for strandId in self.strands:
44 print (chrId,strandId)
45 self.prefetch_seq_and_scores(chrId,strandId)
46
47
48 def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
49 assert chromo in self.chromo_list
50 assert strand in self.strands
51
52 chromo_idx = self.chromo_map[chromo]
53
54 if strand == '+':
55 currentSequence = self.genomicSequencesP[chromo_idx][start:stop]
56 currentDonorScores = self.donorScoresP[chromo_idx][start:stop]
57 currentAcceptorScores = self.acceptorScoresP[chromo_idx][start:stop]
58 elif strand == '-':
59 currentSequence = self.genomicSequencesN[chromo_idx][start:stop]
60 currentDonorScores = self.donorScoresN[chromo_idx][start:stop]
61 currentAcceptorScores = self.acceptorScoresN[chromo_idx][start:stop]
62 else:
63 assert False
64
65 return currentSequence, currentAcceptorScores, currentDonorScores
66
67
68 def prefetch_seq_and_scores(self,chromo,strand):
69 """
70 This function expects an interval, chromosome and strand information and
71 returns then the genomic sequence of this interval and the associated scores.
72 """
73
74 chromo_idx = self.chromo_map[chromo]
75
76 genomicSeq_start = 0
77
78 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
79 genomicSeq = genomicSeq.lower()
80
81 if strand == '+':
82 self.genomicSequencesP[chromo_idx] = genomicSeq
83 self.acceptorScoresP[chromo_idx] = currentAcc
84 self.donorScoresP[chromo_idx] = currentDon
85 elif strand == '-':
86 self.genomicSequencesN[chromo_idx] = genomicSeq
87 self.acceptorScoresN[chromo_idx] = currentAcc
88 self.donorScoresN[chromo_idx] = currentDon
89 else:
90 assert False