+ made original reads file location a parameter for the command line
[qpalma.git] / scripts / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import random
10
11 from numpy.matlib import inf
12
13 from Genefinding import *
14 from genome_utils import load_genomic
15
16 # only for checking purposes
17 #from compile_dataset import get_seq_and_scores
18
19
20 class Lookup:
21 """
22 Speed up genomic and splice site score queries by prefetching the sequences
23 and the scores once for all chromosomes.
24 """
25
26
27 def __init__(self,dna_flat_files):
28 self.dna_flat_files = dna_flat_files
29
30 self.max_size = 40000000
31
32 self.chromosomes = range(1,6)
33 self.strands = ['+']
34
35 self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
36 self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
37 self.donorScoresP = [0]*(len(self.chromosomes)+1)
38
39 self.genomicSequencesN = [0]*(len(self.chromosomes)+1)
40 self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
41 self.donorScoresN = [0]*(len(self.chromosomes)+1)
42
43 for chrId in self.chromosomes:
44 for strandId in self.strands:
45 print (chrId,strandId)
46 self.prefetch_seq_and_scores(chrId,strandId)
47
48
49 def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
50 assert chr in self.chromosomes
51 assert strand in self.strands
52
53 if strand == '+':
54 currentSequence = self.genomicSequencesP[chr][start-1:stop]
55 length = len(currentSequence)
56 currentDonorScores = self.donorScoresP[chr][start:stop]
57 currentDonorScores += [-inf]*(length-len(currentDonorScores))
58 currentAcceptorScores = self.acceptorScoresP[chr][start:stop+2]
59 currentAcceptorScores = currentAcceptorScores[1:]
60 elif strand == '-':
61 currentSequence = self.genomicSequencesN[chr][start-1:stop]
62 length = len(currentSequence)
63 currentDonorScores = self.donorScoresN[chr][start:stop]
64 currentDonorScores += [-inf]*(length-len(currentDonorScores))
65 currentAcceptorScores = self.acceptorScoresN[chr][start:stop+2]
66 currentAcceptorScores = currentAcceptorScores[1:]
67 else:
68 pdb.set_trace()
69
70
71 return currentSequence.lower(), currentAcceptorScores, currentDonorScores
72
73
74 def prefetch_seq_and_scores(self,chr,strand):
75 """
76 This function expects an interval, chromosome and strand information and
77 returns then the genomic sequence of this interval and the associated scores.
78 """
79
80 chrom = 'chr%d' % chr
81
82 genomicSeq_start = 0
83 genomicSeq_stop = self.max_size
84 genomicSeq = load_genomic(chrom,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files,one_based=False)
85 genomicSeq = genomicSeq.lower()
86
87 no_base = re.compile('[^acgt]')
88 genomicSeq = no_base.sub('n',genomicSeq)
89 # check the obtained dna sequence
90 assert genomicSeq != '', 'load_genomic returned empty sequence!'
91
92 intervalBegin = 0
93 intervalEnd = self.max_size
94
95 currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
96
97 if strand == '+':
98 self.genomicSequencesP[chr] = genomicSeq
99 self.acceptorScoresP[chr] = currentAcc
100 self.donorScoresP[chr] = currentDon
101
102 elif strand == '-':
103 self.genomicSequencesN[chr] = genomicSeq
104 self.acceptorScoresN[chr] = currentAcc
105 self.donorScoresN[chr] = currentDon
106
107 else:
108 assert False
109
110
111 def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
112 """
113 Now we want to use interval_query to get the predicted splice scores trained
114 on the TAIR sequence and annotation.
115 """
116
117 assert intervalEnd > intervalBegin
118
119 size = intervalEnd-intervalBegin
120
121 acc = size*[0.0]
122 don = size*[0.0]
123
124 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
125 pos_size = new_intp()
126 intp_assign(pos_size,1)
127
128 # fetch acceptor scores
129 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
130 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
131
132 # fetch donor scores
133 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
134 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
135
136 return acc, don
137
138
139 if __name__ == '__main__':
140 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
141 lt1 = Lookup(dna_flat_files)
142
143 #seq,acc,don = lt1.get_seq_and_scores(1,'+',1001,2002,'')
144 #_seq,_acc,_don = get_seq_and_scores(1,'+',1001,2002,dna_flat_files)
145
146 for chro in range(1,6):
147 start = random.randint(40,10000)
148 stop = start + random.randint(10,100)
149
150 seq,acc,don = lt1.get_seq_and_scores(chro,'+',start,stop,'')
151 _seq,_acc,_don = get_seq_and_scores(chro,'+',start,stop,dna_flat_files)
152
153 assert _seq == seq
154 assert _acc == acc
155 assert _don == don
156
157 #seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
158 #_seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
159
160 #assert _seq == seq
161 #assert _acc == acc
162 #assert _don == don
163
164
165 pdb.set_trace()