+ wrapped sequence and score functions in order to reduce function calls
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import os.path
13 import pdb
14 import re
15 import subprocess
16
17 from numpy.matlib import inf
18
19 from Genefinding import *
20 from genome_utils import load_genomic
21
22 extended_alphabet = ['-','a','c','g','t','n','[',']']
23 alphabet = ['-','a','c','g','t','n']
24
25
26 class DataAccessionException:
27 pass
28
29
30 def get_flat_file_size(filename):
31 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
32 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
33 out,err = obj.communicate()
34
35 if err != '':
36 print 'Error occurred while trying to obtain file size'
37 raise DataAccessionException
38
39 return int(out)
40
41
42 def reverse_complement(seq):
43 """
44 This function takes a read in plain or bracket notation and returns the
45 reverse complement of it.
46 I.e.
47
48 est = cgt[ac][tg]a
49
50 leads to
51
52 rev_comp = t[ac][tg]acg
53 """
54
55 bpos = seq.find('[')
56 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
57
58 # check first whether seq contains no brackets at all
59 if bpos == -1:
60 ret_val = map(rc,seq)
61 ret_val.reverse()
62 ret_val = "".join(ret_val)
63 else:
64 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
65
66 # first_part is the part of the seq up to the first occurrence of a
67 # bracket
68 first_part = seq[:bpos]
69 first_part = map(rc,first_part)
70 first_part.reverse()
71 first_part = "".join(first_part)
72
73 # inside brackets has to be complemented but NOT reversed
74 inside_brackets = seq[bpos+1:bpos+3]
75 inside_brackets = "".join(map(rc,inside_brackets))
76
77 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
78
79 return ret_val
80
81
82 def unbracket_seq(seq):
83 """
84 This function takes a read in bracket notation and restores the read sequence from it.
85 I.e.
86
87 seq = cgt[ac][tg]aa
88
89 leads to
90
91 result = cgtcgaa
92
93 so the second entry within brackets is the base on the read whereas the first
94 entry is the base from the dna.
95 """
96
97 new_seq = ''
98 e = 0
99
100 while True:
101 if e >= len(seq):
102 break
103
104 if seq[e] == '[':
105 new_seq += seq[e+2]
106 e += 4
107 else:
108 new_seq += seq[e]
109 e += 1
110
111 return "".join(new_seq).lower()
112
113
114 def create_bracket_seq(dna_seq,read_seq):
115 """
116 This function takes a dna sequence and a read sequence and returns the
117 bracket format of the match/mismatches i.e.
118
119 dna : aaa
120 read: aac
121 is written in bracket notation: aa[ac]
122 """
123 assert len(dna_seq) == len(read_seq),pdb.set_trace()
124 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
125
126
127
128 class SeqSpliceInfo():
129
130 def __init__(self,dna_flat_files,chromo_list):
131 assert os.path.isdir(dna_flat_files)
132 self.flat_files = dna_flat_files
133
134 self.chromo_sizes = {}
135
136 print "Fetching chromosome sizes..."
137 for chromo in chromo_list:
138 filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
139 total_size = get_flat_file_size(filename)
140 self.chromo_sizes[chromo] = total_size
141
142 print "done"
143
144
145 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
146 """
147 Now we want to use interval_query to get the predicted splice scores trained
148 on the TAIR sequence and annotation.
149 """
150
151 size = intervalEnd-intervalBegin
152 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
153
154 acc = size*[0.0]
155 don = size*[0.0]
156
157 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
158 pos_size = new_intp()
159 intp_assign(pos_size,1)
160
161 total_size = self.chromo_sizes[chromo]
162
163 # fetch acceptor scores
164 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
165 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
166
167 #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
168 #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
169
170 # fetch donor scores
171 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
172 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
173
174 #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
175 #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
176 #pdb.set_trace()
177
178 return acc, don
179
180
181 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
182 chromo_string = 'chr%d' % chromo
183 genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
184 genomicSeq = genomicSeq.strip().lower()
185
186 # check the obtained dna sequence
187 assert genomicSeq != '', 'load_genomic returned empty sequence!'
188
189 # all entries other than a c g t are set to n
190 no_base = re.compile('[^acgt]')
191 genomicSeq = no_base.sub('n',genomicSeq)
192
193 return genomicSeq
194
195 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
196 """
197 This function expects an interval, chromosome and strand information and
198 returns then the genomic sequence of this interval and the associated scores.
199 """
200 # Because splice site scores are predicted using a window of the sequence the
201 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
202 # do not have any score predictions
203 NO_SCORE_WINDOW_SIZE = 205
204
205 assert genomicSeq_start < genomicSeq_stop
206
207 genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
208
209 if only_seq:
210 return genomicSeq
211
212 total_size = self.chromo_sizes[chromo]
213
214 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
215 lookup_offset = 0
216 intervalBegin = max(genomicSeq_start-lookup_offset,0)
217 intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1)
218
219 if chromo > 7:
220 print intervalBegin,intervalEnd,total_size
221 currentAcc, currentDon = self.getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
222 currentDon = [-inf]+currentDon[:-1]
223 else:
224 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
225 currentAcc = currentAcc[2:]
226 currentDon = currentDon[1:]
227
228 length = len(genomicSeq)
229 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
230 currentDon = currentDon+[-inf]*(length-len(currentDon))
231
232 #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
233 #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
234
235 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
236 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
237
238 #pdb.set_trace()
239 #return genomicSeq, currentAcc, currentDon
240
241 for pos in ag_tuple_pos:
242 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
243 currentAcc[pos] = 1e-6
244
245 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
246 currentAcc[pos] = 1e-6
247
248 for pos in gt_tuple_pos:
249 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
250 currentDon[pos] = 1e-6
251
252 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
253 currentDon[pos] = 1e-6
254
255 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
256 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
257 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
258
259 return genomicSeq, currentAcc, currentDon