5e00cee7ae9c1ffec3b18df9b2c1e4c06f816e60
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import os
5 import pdb
6 import random
7 import re
8 import sys
9
10 from numpy.matlib import inf
11
12 from Genefinding import *
13 from genome_utils import load_genomic
14
15
16 def reverse_complement(seq):
17 """
18 This function takes a read in plain or bracket notation and returns the
19 reverse complement of it.
20 I.e.
21
22 est = cgt[ac][tg]a
23
24 leads to
25
26 rev_comp = t[ac][tg]acg
27 """
28
29 bpos = seq.find('[')
30 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
31
32 # check first whether seq contains no brackets at all
33 if bpos == -1:
34 ret_val = map(rc,seq)
35 ret_val.reverse()
36 ret_val = "".join(ret_val)
37 else:
38 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
39
40 # first_part is the part of the seq up to the first occurrence of a
41 # bracket
42 first_part = seq[:bpos]
43 first_part = map(rc,first_part)
44 first_part.reverse()
45 first_part = "".join(first_part)
46
47 # inside brackets has to be complemented but NOT reversed
48 inside_brackets = seq[bpos+1:bpos+3]
49 inside_brackets = "".join(map(rc,inside_brackets))
50
51 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
52
53 return ret_val
54
55
56 def unbracket_est(est):
57 """
58 This function takes a read in bracket notation and restores the read sequence from it.
59 I.e.
60
61 est = cgt[ac][tg]aa
62
63 leads to
64
65 result = cgtcgaa
66
67 so the second entry within brackets is the base on the read whereas the first
68 entry is the base from the dna.
69 """
70
71 new_est = ''
72 e = 0
73
74 while True:
75 if e >= len(est):
76 break
77
78 if est[e] == '[':
79 new_est += est[e+2]
80 e += 4
81 else:
82 new_est += est[e]
83 e += 1
84
85 return "".join(new_est).lower()
86
87
88 def create_bracket_seq(dna_seq,read_seq):
89 """
90 This function takes a dna sequence and a read sequence and returns the
91 bracket format of the match/mismatches i.e.
92
93 dna : aaa
94 read: aac
95 is written in bracket notation: aa[ac]
96 """
97 assert len(dna_seq) == len(read_seq)
98 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
99
100
101 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
102 """
103 Now we want to use interval_query to get the predicted splice scores trained
104 on the TAIR sequence and annotation.
105 """
106
107 size = intervalEnd-intervalBegin
108 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
109
110 acc = size*[0.0]
111 don = size*[0.0]
112
113 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
114 pos_size = new_intp()
115 intp_assign(pos_size,1)
116
117 # fetch acceptor scores
118 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
119 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
120
121 # fetch donor scores
122 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
123 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
124
125 return acc, don
126
127
128 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
129 """
130 This function expects an interval, chromosome and strand information and
131 returns then the genomic sequence of this interval and the associated scores.
132 """
133
134 assert genomicSeq_start < genomicSeq_stop
135
136 chrom = 'chr%d' % chr
137 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
138 genomicSeq = genomicSeq.lower()
139
140 # check the obtained dna sequence
141 assert genomicSeq != '', 'load_genomic returned empty sequence!'
142
143 # all entries other than a c g t are set to n
144 no_base = re.compile('[^acgt]')
145 genomicSeq = no_base.sub('n',genomicSeq)
146
147 if strand == '+':
148 intervalBegin = genomicSeq_start-100
149 intervalEnd = genomicSeq_stop+100
150
151 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
152
153 currentAcc = currentAcc[100:-98]
154 currentAcc = currentAcc[1:]
155 currentDon = currentDon[100:-100]
156
157 length = len(genomicSeq)
158 currentAcc = currentAcc[:length]
159
160 currentDon = currentDon+[-inf]*(length-len(currentDon))
161
162 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
163 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
164
165 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
166 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
167 assert len(currentAcc) == len(currentDon)
168
169 return genomicSeq, currentAcc, currentDon
170
171 # build reverse complement if on negative strand
172 if strand == '-':
173 fn = 'chr%d.dna.flat' % chr
174 filename = os.path.join(dna_flat_files,fn)
175 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
176 import subprocess
177 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
178 out,err = obj.communicate()
179
180 if err != '':
181 print 'Error occurred while trying to obtain file size'
182 end = int(out)
183
184 intervalBegin = genomicSeq_start-100
185 intervalEnd = genomicSeq_stop+100
186
187 total_size = end
188 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
189
190 currentAcc = currentAcc[100:-98]
191 currentAcc = currentAcc[1:]
192 currentDon = currentDon[100:-100]
193
194 length = len(genomicSeq)
195 currentAcc = currentAcc[:length]
196 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
197
198 currentDon = currentDon+[-inf]*(length-len(currentDon))
199
200 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
201 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
202
203 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
204 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
205 assert len(currentAcc) == len(currentDon), pdb.set_trace()
206
207 return genomicSeq, currentAcc, currentDon
208
209
210 def checks():
211 start = 1001
212 stop = start+1234
213 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
214
215 for chromo in range(1,6):
216 p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
217 n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
218
219 print p_seq == reverse_complement(n_seq)
220
221
222
223 if __name__ == '__main__':
224 checks()