+ added small program for intron-position comparison
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import pdb
13 import re
14 import subprocess
15
16 from numpy.matlib import inf
17
18 from Genefinding import *
19 from genome_utils import load_genomic
20
21
22 def get_flatfile_size(filename):
23 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
24 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
25 out,err = obj.communicate()
26
27 if err != '':
28 print 'Error occurred while trying to obtain file size'
29 return int(out)
30
31
32 def reverse_complement(seq):
33 """
34 This function takes a read in plain or bracket notation and returns the
35 reverse complement of it.
36 I.e.
37
38 est = cgt[ac][tg]a
39
40 leads to
41
42 rev_comp = t[ac][tg]acg
43 """
44
45 bpos = seq.find('[')
46 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
47
48 # check first whether seq contains no brackets at all
49 if bpos == -1:
50 ret_val = map(rc,seq)
51 ret_val.reverse()
52 ret_val = "".join(ret_val)
53 else:
54 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
55
56 # first_part is the part of the seq up to the first occurrence of a
57 # bracket
58 first_part = seq[:bpos]
59 first_part = map(rc,first_part)
60 first_part.reverse()
61 first_part = "".join(first_part)
62
63 # inside brackets has to be complemented but NOT reversed
64 inside_brackets = seq[bpos+1:bpos+3]
65 inside_brackets = "".join(map(rc,inside_brackets))
66
67 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
68
69 return ret_val
70
71
72 def unbracket_seq(seq):
73 """
74 This function takes a read in bracket notation and restores the read sequence from it.
75 I.e.
76
77 seq = cgt[ac][tg]aa
78
79 leads to
80
81 result = cgtcgaa
82
83 so the second entry within brackets is the base on the read whereas the first
84 entry is the base from the dna.
85 """
86
87 new_seq = ''
88 e = 0
89
90 while True:
91 if e >= len(seq):
92 break
93
94 if seq[e] == '[':
95 new_seq += seq[e+2]
96 e += 4
97 else:
98 new_seq += seq[e]
99 e += 1
100
101 return "".join(new_seq).lower()
102
103
104 def create_bracket_seq(dna_seq,read_seq):
105 """
106 This function takes a dna sequence and a read sequence and returns the
107 bracket format of the match/mismatches i.e.
108
109 dna : aaa
110 read: aac
111 is written in bracket notation: aa[ac]
112 """
113 assert len(dna_seq) == len(read_seq),pdb.set_trace()
114 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
115
116
117
118 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
119 """
120 Now we want to use interval_query to get the predicted splice scores trained
121 on the TAIR sequence and annotation.
122 """
123
124 size = intervalEnd-intervalBegin
125 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
126
127 acc = size*[0.0]
128 don = size*[0.0]
129
130 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
131 pos_size = new_intp()
132 intp_assign(pos_size,1)
133
134 # fetch acceptor scores
135 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
136 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
137
138 # fetch donor scores
139 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
140 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
141
142 return acc, don
143
144
145 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
146 """
147 This function expects an interval, chromosome and strand information and
148 returns then the genomic sequence of this interval and the associated scores.
149 """
150
151 # because splice site scores are predicted using a window of the sequence the
152 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
153 # do not have any score predictions
154 NO_SCORE_WINDOW_SIZE = 250
155
156 assert genomicSeq_start < genomicSeq_stop
157
158 chrom = 'chr%d' % chr
159 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
160 genomicSeq = genomicSeq.lower()
161
162 # check the obtained dna sequence
163 assert genomicSeq != '', 'load_genomic returned empty sequence!'
164
165 # all entries other than a c g t are set to n
166 no_base = re.compile('[^acgt]')
167 genomicSeq = no_base.sub('n',genomicSeq)
168
169 if only_seq:
170 return genomicSeq
171
172 fn = 'chr%d.dna.flat' % chr
173 filename = os.path.join(dna_flat_files,fn)
174 total_size = get_flatfile_size(filename)
175
176 intervalBegin = genomicSeq_start-100
177 intervalEnd = genomicSeq_stop+100
178
179 # shorten the intervalEnd to the maximal size of the genomic sequence if it
180 # would exceed this
181 if intervalEnd > total_size:
182 intervalEnd = total_size-1
183
184 # we have assertions below checking whether every gt or ag has a splice score
185 # not equal to -inf if the gt or ag are within the first NO_SCORE_WINDOW_SIZE
186 # nucleotide then they have no scores at all
187 check_assertions = True
188 if intervalEnd > total_size - NO_SCORE_WINDOW_SIZE:
189 check_assertions = False
190
191 if intervalBegin < NO_SCORE_WINDOW_SIZE:
192 check_assertions = False
193
194 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
195
196 currentAcc = currentAcc[100:-98]
197 currentAcc = currentAcc[1:]
198 currentDon = currentDon[100:-100]
199
200 length = len(genomicSeq)
201
202 # indices for positive strand
203 if strand == '+':
204 currentAcc = currentAcc[:length]
205 currentDon = currentDon+[-inf]*(length-len(currentDon))
206
207 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
208 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
209
210 # take care of different indexation if on negative strand
211 if strand == '-':
212 currentAcc = currentAcc[:length]
213 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
214 currentDon = currentDon+[-inf]*(length-len(currentDon))
215
216 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
217 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
218
219 if check_assertions:
220 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
221 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
222 assert len(currentAcc) == len(currentDon), pdb.set_trace()
223 else
224 for pos in ag_tuple_pos:
225 if currentAcc[pos] == -inf:
226 currentAcc[pos] = 0.01
227
228 for pos in gt_tuple_pos:
229 if currentDon[pos] == -inf:
230 currentDon[pos] = 0.01
231
232 return genomicSeq, currentAcc, currentDon
233
234
235 def checks():
236 start = 1001
237 stop = start+1234
238 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
239
240 for chromo in range(1,6):
241 p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
242 n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
243
244 print p_seq == reverse_complement(n_seq)
245
246
247 if __name__ == '__main__':
248 checks()