+ cleaning up code base
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import pdb
13 import re
14 import subprocess
15
16 from numpy.matlib import inf
17
18 from Genefinding import *
19 from genome_utils import load_genomic
20
21 extended_alphabet = ['-','a','c','g','t','n','[',']']
22 alphabet = ['-','a','c','g','t','n']
23
24 def get_flatfile_size(filename):
25 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
26 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
27 out,err = obj.communicate()
28
29 if err != '':
30 print 'Error occurred while trying to obtain file size'
31 return int(out)
32
33
34 def reverse_complement(seq):
35 """
36 This function takes a read in plain or bracket notation and returns the
37 reverse complement of it.
38 I.e.
39
40 est = cgt[ac][tg]a
41
42 leads to
43
44 rev_comp = t[ac][tg]acg
45 """
46
47 bpos = seq.find('[')
48 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
49
50 # check first whether seq contains no brackets at all
51 if bpos == -1:
52 ret_val = map(rc,seq)
53 ret_val.reverse()
54 ret_val = "".join(ret_val)
55 else:
56 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
57
58 # first_part is the part of the seq up to the first occurrence of a
59 # bracket
60 first_part = seq[:bpos]
61 first_part = map(rc,first_part)
62 first_part.reverse()
63 first_part = "".join(first_part)
64
65 # inside brackets has to be complemented but NOT reversed
66 inside_brackets = seq[bpos+1:bpos+3]
67 inside_brackets = "".join(map(rc,inside_brackets))
68
69 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
70
71 return ret_val
72
73
74 def unbracket_seq(seq):
75 """
76 This function takes a read in bracket notation and restores the read sequence from it.
77 I.e.
78
79 seq = cgt[ac][tg]aa
80
81 leads to
82
83 result = cgtcgaa
84
85 so the second entry within brackets is the base on the read whereas the first
86 entry is the base from the dna.
87 """
88
89 new_seq = ''
90 e = 0
91
92 while True:
93 if e >= len(seq):
94 break
95
96 if seq[e] == '[':
97 new_seq += seq[e+2]
98 e += 4
99 else:
100 new_seq += seq[e]
101 e += 1
102
103 return "".join(new_seq).lower()
104
105
106 def create_bracket_seq(dna_seq,read_seq):
107 """
108 This function takes a dna sequence and a read sequence and returns the
109 bracket format of the match/mismatches i.e.
110
111 dna : aaa
112 read: aac
113 is written in bracket notation: aa[ac]
114 """
115 assert len(dna_seq) == len(read_seq),pdb.set_trace()
116 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
117
118
119
120 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
121 """
122 Now we want to use interval_query to get the predicted splice scores trained
123 on the TAIR sequence and annotation.
124 """
125
126 size = intervalEnd-intervalBegin
127 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
128
129 acc = size*[0.0]
130 don = size*[0.0]
131
132 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
133 pos_size = new_intp()
134 intp_assign(pos_size,1)
135
136 # fetch acceptor scores
137 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
138 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
139
140 # fetch donor scores
141 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
142 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
143
144 return acc, don
145
146
147 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
148 """
149 This function expects an interval, chromosome and strand information and
150 returns then the genomic sequence of this interval and the associated scores.
151 """
152
153 # because splice site scores are predicted using a window of the sequence the
154 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
155 # do not have any score predictions
156 NO_SCORE_WINDOW_SIZE = 250
157
158 assert genomicSeq_start < genomicSeq_stop
159
160 chrom = 'chr%d' % chr
161 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
162 genomicSeq = genomicSeq.lower()
163
164 # check the obtained dna sequence
165 assert genomicSeq != '', 'load_genomic returned empty sequence!'
166
167 # all entries other than a c g t are set to n
168 no_base = re.compile('[^acgt]')
169 genomicSeq = no_base.sub('n',genomicSeq)
170
171 if only_seq:
172 return genomicSeq
173
174 fn = 'chr%d.dna.flat' % chr
175 filename = os.path.join(dna_flat_files,fn)
176 total_size = get_flatfile_size(filename)
177
178 intervalBegin = genomicSeq_start-100
179 intervalEnd = genomicSeq_stop+100
180
181 # shorten the intervalEnd to the maximal size of the genomic sequence if it
182 # would exceed this
183 if intervalEnd > total_size:
184 intervalEnd = total_size-1
185
186 # we have assertions below checking whether every gt or ag has a splice score
187 # not equal to -inf if the gt or ag are within the first NO_SCORE_WINDOW_SIZE
188 # nucleotide then they have no scores at all
189 check_assertions = True
190 if intervalEnd > total_size - NO_SCORE_WINDOW_SIZE:
191 check_assertions = False
192
193 if intervalBegin < NO_SCORE_WINDOW_SIZE:
194 check_assertions = False
195
196 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
197
198 currentAcc = currentAcc[100:-98]
199 currentAcc = currentAcc[1:]
200 currentDon = currentDon[100:-100]
201
202 length = len(genomicSeq)
203
204 # indices for positive strand
205 if strand == '+':
206 currentAcc = currentAcc[:length]
207 currentDon = currentDon+[-inf]*(length-len(currentDon))
208
209 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
210 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
211
212 # take care of different indexation if on negative strand
213 if strand == '-':
214 currentAcc = currentAcc[:length]
215 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
216 currentDon = currentDon+[-inf]*(length-len(currentDon))
217
218 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
219 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
220
221 if check_assertions:
222 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
223 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
224 assert len(currentAcc) == len(currentDon), pdb.set_trace()
225 else:
226 for pos in ag_tuple_pos:
227 if currentAcc[pos] == -inf:
228 currentAcc[pos] = 0.01
229
230 for pos in gt_tuple_pos:
231 if currentDon[pos] == -inf:
232 currentDon[pos] = 0.01
233
234 return genomicSeq, currentAcc, currentDon
235
236
237 def checks():
238 start = 1001
239 stop = start+1234
240 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
241
242 for chromo in range(1,6):
243 p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
244 n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
245
246 print p_seq == reverse_complement(n_seq)
247
248
249 if __name__ == '__main__':
250 checks()