a3ccdd680f590e16c4599c43b9b6453bea9fb361
[qpalma.git] / scripts / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import random
10
11 from numpy.matlib import inf
12
13 from Genefinding import *
14 from genome_utils import load_genomic
15
16 # only for checking purposes
17 from qpalma.sequence_utils import get_seq_and_scores
18
19
20 class Lookup:
21 """
22 Speed up genomic and splice site score queries by prefetching the sequences
23 and the scores once for all chromosomes.
24 """
25
26
27 def __init__(self,dna_flat_files):
28 self.dna_flat_files = dna_flat_files
29
30 self.max_size = 40000000
31
32 self.chromosomes = range(1,6)
33 self.strands = ['-']
34
35 self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
36 self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
37 self.donorScoresP = [0]*(len(self.chromosomes)+1)
38
39 self.genomicSequencesN = [0]*(len(self.chromosomes)+1)
40 self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
41 self.donorScoresN = [0]*(len(self.chromosomes)+1)
42
43 #for chrId in self.chromosomes:
44 for chrId in range(4,6):
45 for strandId in self.strands:
46 print (chrId,strandId)
47 self.prefetch_seq_and_scores(chrId,strandId)
48
49
50 def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
51 assert chr in self.chromosomes
52 assert strand in self.strands
53
54 if strand == '+':
55 currentSequence = self.genomicSequencesP[chr][start-1:stop]
56 length = len(currentSequence)
57 currentDonorScores = self.donorScoresP[chr][start:stop]
58 currentDonorScores += [-inf]*(length-len(currentDonorScores))
59 currentAcceptorScores = self.acceptorScoresP[chr][start:stop+2]
60 currentAcceptorScores = currentAcceptorScores[1:]
61 elif strand == '-':
62 currentSequence = self.genomicSequencesN[chr][start-1:stop]
63 length = len(currentSequence)
64 currentDonorScores = self.donorScoresN[chr][start:stop]
65 currentDonorScores += [-inf]*(length-len(currentDonorScores))
66 currentAcceptorScores = self.acceptorScoresN[chr][start:stop+2]
67 currentAcceptorScores = currentAcceptorScores[1:]
68 else:
69 pdb.set_trace()
70
71 return currentSequence.lower(), currentAcceptorScores, currentDonorScores
72
73
74 def prefetch_seq_and_scores(self,chromo,strand):
75 """
76 This function expects an interval, chromosome and strand information and
77 returns then the genomic sequence of this interval and the associated scores.
78 """
79
80 fn = 'chr%d.dna.flat' % chromo
81 filename = os.path.join(self.dna_flat_files,fn)
82 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
83 import subprocess
84 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
85 out,err = obj.communicate()
86 if err != '':
87 print 'Error occurred while trying to obtain file size'
88 size = int(out)
89
90 genomicSeq_start = 205
91 genomicSeq_stop = size-205
92 genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
93 genomicSeq = genomicSeq.lower()
94
95 U = 'u'*205
96 INF_F = [-inf]*205
97
98 if strand == '+':
99 self.genomicSequencesP[chromo] = U + genomicSeq + U
100 self.acceptorScoresP[chromo] = INF_F + currentAcc + INF_F
101 self.donorScoresP[chromo] = INF_F + currentDon + INF_F
102
103 elif strand == '-':
104 self.genomicSequencesN[chromo] = U + genomicSeq + U
105 self.acceptorScoresN[chromo] = INF_F + currentAcc + INF_F
106 self.donorScoresN[chromo] = INF_F + currentDon + INF_F
107
108 else:
109 assert False
110
111
112 def _prefetch_seq_and_scores(self,chr,strand):
113 """
114 This function expects an interval, chromosome and strand information and
115 returns then the genomic sequence of this interval and the associated scores.
116 """
117
118 chrom = 'chr%d' % chr
119
120 genomicSeq_start = 0
121 genomicSeq_stop = self.max_size
122 genomicSeq = load_genomic(chrom,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files,one_based=False)
123 genomicSeq = genomicSeq.lower()
124
125 no_base = re.compile('[^acgt]')
126 genomicSeq = no_base.sub('n',genomicSeq)
127 # check the obtained dna sequence
128 assert genomicSeq != '', 'load_genomic returned empty sequence!'
129
130 intervalBegin = 0
131 intervalEnd = self.max_size
132
133 currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
134
135 if strand == '+':
136 self.genomicSequencesP[chr] = genomicSeq
137 self.acceptorScoresP[chr] = currentAcc
138 self.donorScoresP[chr] = currentDon
139
140 elif strand == '-':
141 self.genomicSequencesN[chr] = genomicSeq
142 self.acceptorScoresN[chr] = currentAcc
143 self.donorScoresN[chr] = currentDon
144
145 else:
146 assert False
147
148
149 def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
150 """
151 Now we want to use interval_query to get the predicted splice scores trained
152 on the TAIR sequence and annotation.
153 """
154
155 assert intervalEnd > intervalBegin
156
157 size = intervalEnd-intervalBegin
158
159 acc = size*[0.0]
160 don = size*[0.0]
161
162 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
163 pos_size = new_intp()
164 intp_assign(pos_size,1)
165
166 # fetch acceptor scores
167 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
168 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
169
170 # fetch donor scores
171 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
172 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
173
174 return acc, don
175
176
177 if __name__ == '__main__':
178 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
179 lt1 = Lookup(dna_flat_files)
180
181 #seq,acc,don = lt1.get_seq_and_scores(1,'+',1001,2002,'')
182 #_seq,_acc,_don = get_seq_and_scores(1,'+',1001,2002,dna_flat_files)
183
184 for chro in range(1,6):
185 start = random.randint(40,10000)
186 stop = start + random.randint(10,100)
187
188 seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
189 _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
190
191 assert _seq == seq, pdb.set_trace()
192 assert _acc == acc
193 assert _don == don
194
195 #seq,acc,don = lt1.get_seq_and_scores(chro,'-',start,stop,'')
196 #_seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
197
198 #assert _seq == seq
199 #assert _acc == acc
200 #assert _don == don
201
202 pdb.set_trace()
203