+ Fixed issue with g being last element of a sequence (one donor score too much)
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import os.path
13 import pdb
14 import re
15 import subprocess
16
17 from numpy.matlib import inf
18
19 from Genefinding import *
20 from genome_utils import load_genomic
21
22 extended_alphabet = ['-','a','c','g','t','n','[',']']
23 alphabet = ['-','a','c','g','t','n']
24
25
26 class DataAccessionException:
27 pass
28
29
30 def get_flat_file_size(filename):
31 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
32 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
33 out,err = obj.communicate()
34
35 if err != '':
36 print 'Error occurred while trying to obtain file size'
37 raise DataAccessionException
38
39 return int(out)
40
41
42 def reverse_complement(seq):
43 """
44 This function takes a read in plain or bracket notation and returns the
45 reverse complement of it.
46 I.e.
47
48 est = cgt[ac][tg]a
49
50 leads to
51
52 rev_comp = t[ac][tg]acg
53 """
54
55 bpos = seq.find('[')
56 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
57
58 # check first whether seq contains no brackets at all
59 if bpos == -1:
60 ret_val = map(rc,seq)
61 ret_val.reverse()
62 ret_val = "".join(ret_val)
63 else:
64 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
65
66 # first_part is the part of the seq up to the first occurrence of a
67 # bracket
68 first_part = seq[:bpos]
69 first_part = map(rc,first_part)
70 first_part.reverse()
71 first_part = "".join(first_part)
72
73 # inside brackets has to be complemented but NOT reversed
74 inside_brackets = seq[bpos+1:bpos+3]
75 inside_brackets = "".join(map(rc,inside_brackets))
76
77 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
78
79 return ret_val
80
81
82 def unbracket_seq(seq):
83 """
84 This function takes a read in bracket notation and restores the read sequence from it.
85 I.e.
86
87 seq = cgt[ac][tg]aa
88
89 leads to
90
91 result = cgtcgaa
92
93 so the second entry within brackets is the base on the read whereas the first
94 entry is the base from the dna.
95 """
96
97 new_seq = ''
98 e = 0
99
100 while True:
101 if e >= len(seq):
102 break
103
104 if seq[e] == '[':
105 new_seq += seq[e+2]
106 e += 4
107 else:
108 new_seq += seq[e]
109 e += 1
110
111 return "".join(new_seq).lower()
112
113
114 def create_bracket_seq(dna_seq,read_seq):
115 """
116 This function takes a dna sequence and a read sequence and returns the
117 bracket format of the match/mismatches i.e.
118
119 dna : aaa
120 read: aac
121 is written in bracket notation: aa[ac]
122 """
123 assert len(dna_seq) == len(read_seq),pdb.set_trace()
124 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
125
126
127
128 class SeqSpliceInfo():
129
130 def __init__(self,dna_flat_files,chromo_list):
131 assert os.path.isdir(dna_flat_files)
132 self.flat_files = dna_flat_files
133
134 self.chromo_sizes = {}
135
136 print "Fetching chromosome sizes..."
137 for chromo in chromo_list:
138 filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
139 total_size = get_flat_file_size(filename)
140 self.chromo_sizes[chromo] = total_size
141
142 print "done"
143
144
145 def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
146 """
147 Now we want to use interval_query to get the predicted splice scores trained
148 on the TAIR sequence and annotation.
149 """
150
151 size = intervalEnd-intervalBegin
152 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
153
154 acc = size*[0.0]
155 don = size*[0.0]
156
157 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
158 pos_size = new_intp()
159 intp_assign(pos_size,1)
160
161 total_size = self.chromo_sizes[chromo]
162
163 # fetch acceptor scores
164 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
165 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
166
167 # fetch donor scores
168 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
169 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
170
171 return acc, don
172
173
174 def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
175 chromo_string = 'chr%d' % chromo
176 genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
177 genomicSeq = genomicSeq.strip().lower()
178
179 # check the obtained dna sequence
180 assert genomicSeq != '', 'load_genomic returned empty sequence!'
181
182 # all entries other than a c g t are set to n
183 no_base = re.compile('[^acgt]')
184 genomicSeq = no_base.sub('n',genomicSeq)
185
186 return genomicSeq
187
188 def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
189 """
190 This function expects an interval, chromosome and strand information and
191 returns then the genomic sequence of this interval and the associated scores.
192 """
193 # Because splice site scores are predicted using a window of the sequence the
194 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
195 # do not have any score predictions
196 NO_SCORE_WINDOW_SIZE = 205
197
198 assert genomicSeq_start < genomicSeq_stop
199
200 if strand == '-':
201 genomicSeq = self.fetch_sequence(chromo+7,'+',genomicSeq_start,genomicSeq_stop)
202 else:
203 genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
204
205 if only_seq:
206 return genomicSeq
207
208 total_size = self.chromo_sizes[chromo]
209
210 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
211 lookup_offset_begin = 10
212 lookup_offset_end = 10
213 intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
214 if intervalBegin == 0:
215 lookup_offset_begin = 0
216
217 intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
218 if intervalEnd == total_size-1:
219 lookup_offset_end = 0
220
221 currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
222
223 original_acc = list(currentAcc)
224
225 if strand == '-':
226 currentDon = [-inf]+currentDon[:-1]
227 else:
228 currentAcc = currentAcc[2:]
229 currentDon = currentDon[1:]
230
231 # remove the offset positions
232 currentAcc = currentAcc[lookup_offset_begin:]
233 currentDon = currentDon[lookup_offset_begin:]
234
235 currentAcc = currentAcc[:len(genomicSeq)]
236 currentDon = currentDon[:len(genomicSeq)]
237
238 #length = len(genomicSeq)
239 #currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
240 #currentDon = currentDon+[-inf]*(length-len(currentDon))
241 currentDon[-1] = -inf
242
243 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
244 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
245
246 #pdb.set_trace()
247 #return genomicSeq, currentAcc, currentDon
248
249 for pos in ag_tuple_pos:
250 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
251 currentAcc[pos] = 1e-6
252
253 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
254 currentAcc[pos] = 1e-6
255
256 for pos in gt_tuple_pos:
257 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
258 currentDon[pos] = 1e-6
259
260 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
261 currentDon[pos] = 1e-6
262
263 acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
264 don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
265
266 if not ag_tuple_pos == acc_pos:
267 print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
268 print ag_tuple_pos[:10],ag_tuple_pos[-10:]
269 print acc_pos[:10],acc_pos[-10:]
270 #pdb.set_trace()
271
272 if not gt_tuple_pos == don_pos:
273 print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
274 print gt_tuple_pos[:10],gt_tuple_pos[-10:]
275 print don_pos[:10],don_pos[-10:]
276 #pdb.set_trace()
277
278 #assert ag_tuple_pos == acc_pos, pdb.set_trace()
279 #assert gt_tuple_pos == don_pos, pdb.set_trace()
280 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
281
282 return genomicSeq, currentAcc, currentDon