+ fixed some bugs in the negative strand lookup table
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import os
5 import pdb
6 import random
7 import re
8 import sys
9 import subprocess
10
11 from numpy.matlib import inf
12
13 from Genefinding import *
14 from genome_utils import load_genomic
15
16 def get_flatfile_size(filename):
17 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
18 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
19 out,err = obj.communicate()
20
21 if err != '':
22 print 'Error occurred while trying to obtain file size'
23 return int(out)
24
25
26 def reverse_complement(seq):
27 """
28 This function takes a read in plain or bracket notation and returns the
29 reverse complement of it.
30 I.e.
31
32 est = cgt[ac][tg]a
33
34 leads to
35
36 rev_comp = t[ac][tg]acg
37 """
38
39 bpos = seq.find('[')
40 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
41
42 # check first whether seq contains no brackets at all
43 if bpos == -1:
44 ret_val = map(rc,seq)
45 ret_val.reverse()
46 ret_val = "".join(ret_val)
47 else:
48 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
49
50 # first_part is the part of the seq up to the first occurrence of a
51 # bracket
52 first_part = seq[:bpos]
53 first_part = map(rc,first_part)
54 first_part.reverse()
55 first_part = "".join(first_part)
56
57 # inside brackets has to be complemented but NOT reversed
58 inside_brackets = seq[bpos+1:bpos+3]
59 inside_brackets = "".join(map(rc,inside_brackets))
60
61 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
62
63 return ret_val
64
65
66 def unbracket_est(est):
67 """
68 This function takes a read in bracket notation and restores the read sequence from it.
69 I.e.
70
71 est = cgt[ac][tg]aa
72
73 leads to
74
75 result = cgtcgaa
76
77 so the second entry within brackets is the base on the read whereas the first
78 entry is the base from the dna.
79 """
80
81 new_est = ''
82 e = 0
83
84 while True:
85 if e >= len(est):
86 break
87
88 if est[e] == '[':
89 new_est += est[e+2]
90 e += 4
91 else:
92 new_est += est[e]
93 e += 1
94
95 return "".join(new_est).lower()
96
97
98 def create_bracket_seq(dna_seq,read_seq):
99 """
100 This function takes a dna sequence and a read sequence and returns the
101 bracket format of the match/mismatches i.e.
102
103 dna : aaa
104 read: aac
105 is written in bracket notation: aa[ac]
106 """
107 assert len(dna_seq) == len(read_seq)
108 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
109
110
111 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
112 """
113 Now we want to use interval_query to get the predicted splice scores trained
114 on the TAIR sequence and annotation.
115 """
116
117 size = intervalEnd-intervalBegin
118 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
119
120 acc = size*[0.0]
121 don = size*[0.0]
122
123 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
124 pos_size = new_intp()
125 intp_assign(pos_size,1)
126
127 # fetch acceptor scores
128 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
129 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
130
131 # fetch donor scores
132 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
133 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
134
135 return acc, don
136
137
138 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
139 """
140 This function expects an interval, chromosome and strand information and
141 returns then the genomic sequence of this interval and the associated scores.
142 """
143
144 assert genomicSeq_start < genomicSeq_stop
145
146 chrom = 'chr%d' % chr
147 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
148 genomicSeq = genomicSeq.lower()
149
150 # check the obtained dna sequence
151 assert genomicSeq != '', 'load_genomic returned empty sequence!'
152
153 # all entries other than a c g t are set to n
154 no_base = re.compile('[^acgt]')
155 genomicSeq = no_base.sub('n',genomicSeq)
156
157 if strand == '+':
158 intervalBegin = genomicSeq_start-100
159 intervalEnd = genomicSeq_stop+100
160
161 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
162
163 currentAcc = currentAcc[100:-98]
164 currentAcc = currentAcc[1:]
165 currentDon = currentDon[100:-100]
166
167 length = len(genomicSeq)
168 currentAcc = currentAcc[:length]
169
170 currentDon = currentDon+[-inf]*(length-len(currentDon))
171
172 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
173 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
174
175 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
176 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
177 assert len(currentAcc) == len(currentDon)
178
179 return genomicSeq, currentAcc, currentDon
180
181 # build reverse complement if on negative strand
182 if strand == '-':
183 fn = 'chr%d.dna.flat' % chr
184 filename = os.path.join(dna_flat_files,fn)
185 end = get_flatfile_size(filename)
186
187 intervalBegin = genomicSeq_start-100
188 intervalEnd = genomicSeq_stop+100
189
190 total_size = end
191 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
192
193 currentAcc = currentAcc[100:-98]
194 currentAcc = currentAcc[1:]
195 currentDon = currentDon[100:-100]
196
197 length = len(genomicSeq)
198 currentAcc = currentAcc[:length]
199 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
200
201 currentDon = currentDon+[-inf]*(length-len(currentDon))
202
203 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
204 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
205
206 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
207 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
208 assert len(currentAcc) == len(currentDon), pdb.set_trace()
209
210 return genomicSeq, currentAcc, currentDon
211
212
213 def checks():
214 start = 1001
215 stop = start+1234
216 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
217
218 for chromo in range(1,6):
219 p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
220 n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
221
222 print p_seq == reverse_complement(n_seq)
223
224
225
226 if __name__ == '__main__':
227 checks()