f1bc39af3cb83ade15fe1697d1631ca943f16981
[qpalma.git] / scripts / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import random
10
11 from numpy.matlib import inf
12 from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
13
14
15 class Lookup:
16 """
17 Speed up genomic and splice site score queries by prefetching the sequences
18 and the scores once for all chromosomes.
19 """
20
21 def __init__(self,dna_flat_files):
22 self.dna_flat_files = dna_flat_files
23
24 self.chromosomes = range(1,6)
25 self.strands = ['+','-']
26
27 flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
28 chromo_list = [1,2,3,4,5,8,9,10,11,12]
29 self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
30
31 self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
32 self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
33 self.donorScoresP = [0]*(len(self.chromosomes)+1)
34
35 self.genomicSequencesN = [0]*(len(self.chromosomes)+1)
36 self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
37 self.donorScoresN = [0]*(len(self.chromosomes)+1)
38
39 #for chrId in self.chromosomes:
40 #for chrId in range(1,6):
41 for chrId in range(1,2):
42 for strandId in self.strands:
43 print (chrId,strandId)
44 self.prefetch_seq_and_scores(chrId,strandId)
45
46
47 def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
48 assert chromo in self.chromosomes
49 assert strand in self.strands
50
51 currentSequence = self.genomicSequencesP[chromo][start:stop]
52 currentDonorScores = self.donorScoresP[chromo][start:stop]
53 currentAcceptorScores = self.acceptorScoresP[chromo][start:stop]
54
55 return currentSequence, currentAcceptorScores, currentDonorScores
56
57
58 def prefetch_seq_and_scores(self,chromo,strand):
59 """
60 This function expects an interval, chromosome and strand information and
61 returns then the genomic sequence of this interval and the associated scores.
62 """
63
64 genomicSeq_start = 0
65
66 if strand == '-':
67 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo+7]
68 else:
69 genomicSeq_stop = self.seqInfo.chromo_sizes[chromo]
70
71 genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
72 genomicSeq = genomicSeq.lower()
73
74 if strand == '+':
75 self.genomicSequencesP[chromo] = genomicSeq
76 self.acceptorScoresP[chromo] = currentAcc
77 self.donorScoresP[chromo] = currentDon
78 elif strand == '-':
79 self.genomicSequencesN[chromo] = genomicSeq
80 self.acceptorScoresN[chromo] = currentAcc
81 self.donorScoresN[chromo] = currentDon
82 else:
83 assert False
84
85
86 if __name__ == '__main__':
87 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
88 lt1 = Lookup(dna_flat_files)
89
90 #self.lt1.seqInfo.get_seq_and_scores