46435e1d7e9d7fa48bc08e95a5526875767cf9b6
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 import os
5 import pdb
6 import random
7 import re
8 import sys
9 import subprocess
11 from numpy.matlib import inf
13 from Genefinding import *
16 def get_flatfile_size(filename):
17 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
18 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
19 out,err = obj.communicate()
21 if err != '':
22 print 'Error occurred while trying to obtain file size'
23 return int(out)
26 def reverse_complement(seq):
27 """
28 This function takes a read in plain or bracket notation and returns the
29 reverse complement of it.
30 I.e.
32 est = cgt[ac][tg]a
36 rev_comp = t[ac][tg]acg
37 """
39 bpos = seq.find('[')
40 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
42 # check first whether seq contains no brackets at all
43 if bpos == -1:
44 ret_val = map(rc,seq)
45 ret_val.reverse()
46 ret_val = "".join(ret_val)
47 else:
48 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
50 # first_part is the part of the seq up to the first occurrence of a
51 # bracket
52 first_part = seq[:bpos]
53 first_part = map(rc,first_part)
54 first_part.reverse()
55 first_part = "".join(first_part)
57 # inside brackets has to be complemented but NOT reversed
58 inside_brackets = seq[bpos+1:bpos+3]
59 inside_brackets = "".join(map(rc,inside_brackets))
61 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
63 return ret_val
66 def unbracket_est(est):
67 """
68 This function takes a read in bracket notation and restores the read sequence from it.
69 I.e.
71 est = cgt[ac][tg]aa
75 result = cgtcgaa
77 so the second entry within brackets is the base on the read whereas the first
78 entry is the base from the dna.
79 """
81 new_est = ''
82 e = 0
84 while True:
85 if e >= len(est):
86 break
88 if est[e] == '[':
89 new_est += est[e+2]
90 e += 4
91 else:
92 new_est += est[e]
93 e += 1
95 return "".join(new_est).lower()
99 """
100 This function takes a dna sequence and a read sequence and returns the
101 bracket format of the match/mismatches i.e.
103 dna : aaa
105 is written in bracket notation: aa[ac]
106 """
111 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
112 """
113 Now we want to use interval_query to get the predicted splice scores trained
114 on the TAIR sequence and annotation.
115 """
117 size = intervalEnd-intervalBegin
118 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
120 acc = size*[0.0]
121 don = size*[0.0]
123 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
124 pos_size = new_intp()
125 intp_assign(pos_size,1)
127 # fetch acceptor scores
128 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
129 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
131 # fetch donor scores
132 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
133 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
135 return acc, don
138 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
139 """
140 This function expects an interval, chromosome and strand information and
141 returns then the genomic sequence of this interval and the associated scores.
142 """
144 assert genomicSeq_start < genomicSeq_stop
146 chrom = 'chr%d' % chr
148 genomicSeq = genomicSeq.lower()
150 # check the obtained dna sequence
151 assert genomicSeq != '', 'load_genomic returned empty sequence!'
153 # all entries other than a c g t are set to n
154 no_base = re.compile('[^acgt]')
155 genomicSeq = no_base.sub('n',genomicSeq)
157 if strand == '+':
158 intervalBegin = genomicSeq_start-100
159 intervalEnd = genomicSeq_stop+100
161 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
163 currentAcc = currentAcc[100:-98]
164 currentAcc = currentAcc[1:]
165 currentDon = currentDon[100:-100]
167 length = len(genomicSeq)
168 currentAcc = currentAcc[:length]
170 currentDon = currentDon+[-inf]*(length-len(currentDon))
172 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
173 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
175 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
176 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
177 assert len(currentAcc) == len(currentDon)
179 return genomicSeq, currentAcc, currentDon
181 # build reverse complement if on negative strand
182 if strand == '-':
183 fn = 'chr%d.dna.flat' % chr
184 filename = os.path.join(dna_flat_files,fn)
185 end = get_flatfile_size(filename)
187 intervalBegin = genomicSeq_start-100
188 intervalEnd = genomicSeq_stop+100
190 total_size = end
191 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
193 currentAcc = currentAcc[100:-98]
194 currentAcc = currentAcc[1:]
195 currentDon = currentDon[100:-100]
197 length = len(genomicSeq)
198 currentAcc = currentAcc[:length]
199 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
201 currentDon = currentDon+[-inf]*(length-len(currentDon))
203 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
204 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
206 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
207 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
208 assert len(currentAcc) == len(currentDon), pdb.set_trace()
210 return genomicSeq, currentAcc, currentDon
213 def checks():
214 start = 1001
215 stop = start+1234
216 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
218 for chromo in range(1,6):
219 p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
220 n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
222 print p_seq == reverse_complement(n_seq)
226 if __name__ == '__main__':
227 checks()