2 # -*- coding: utf-8 -*-
10 from numpy
.matlib
import inf
12 from Genefinding
import *
13 from genome_utils
import load_genomic
16 def reverse_complement(seq
):
18 This function takes a read in plain or bracket notation and returns the
19 reverse complement of it.
26 rev_comp = t[ac][tg]acg
30 rc
= lambda x
: {'a':'t','c':'g','g':'c','t':'a'}[x
]
32 # check first whether seq contains no brackets at all
36 ret_val
= "".join(ret_val
)
38 brc
= lambda x
: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x
]
40 # first_part is the part of the seq up to the first occurrence of a
42 first_part
= seq
[:bpos
]
43 first_part
= map(rc
,first_part
)
45 first_part
= "".join(first_part
)
47 # inside brackets has to be complemented but NOT reversed
48 inside_brackets
= seq
[bpos
+1:bpos
+3]
49 inside_brackets
= "".join(map(rc
,inside_brackets
))
51 ret_val
= '%s[%s]%s'%(reverse_complement(seq
[bpos
+4:]),inside_brackets
,first_part
)
56 def unbracket_est(est
):
58 This function takes a read in bracket notation and restores the read sequence from it.
67 so the second entry within brackets is the base on the read whereas the first
68 entry is the base from the dna.
85 return "".join(new_est
).lower()
88 def create_bracket_seq(dna_seq
,read_seq
):
90 This function takes a dna sequence and a read sequence and returns the
91 bracket format of the match/mismatches i.e.
95 is written in bracket notation: aa[ac]
97 assert len(dna_seq
) == len(read_seq
)
98 return "".join(map(lambda x
,y
: ['[%s%s]'%(x
,y
),x
][x
==y
],dna_seq
,read_seq
))
101 def getSpliceScores(chr,strand
,intervalBegin
,intervalEnd
,total_size
=0):
103 Now we want to use interval_query to get the predicted splice scores trained
104 on the TAIR sequence and annotation.
107 size
= intervalEnd
-intervalBegin
108 assert size
> 1, 'Error (getSpliceScores): interval size is less than 2!'
113 interval_matrix
= createIntArrayFromList([intervalBegin
,intervalEnd
])
114 pos_size
= new_intp()
115 intp_assign(pos_size
,1)
117 # fetch acceptor scores
118 sscore_filename
= '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
119 acc
= doIntervalQuery(chr,strand
,intervalBegin
,intervalEnd
,sscore_filename
,total_size
)
122 sscore_filename
= '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
123 don
= doIntervalQuery(chr,strand
,intervalBegin
,intervalEnd
,sscore_filename
,total_size
)
128 def get_seq_and_scores(chr,strand
,genomicSeq_start
,genomicSeq_stop
,dna_flat_files
):
130 This function expects an interval, chromosome and strand information and
131 returns then the genomic sequence of this interval and the associated scores.
134 assert genomicSeq_start
< genomicSeq_stop
136 chrom
= 'chr%d' % chr
137 genomicSeq
= load_genomic(chrom
,strand
,genomicSeq_start
-1,genomicSeq_stop
,dna_flat_files
,one_based
=False)
138 genomicSeq
= genomicSeq
.lower()
140 # check the obtained dna sequence
141 assert genomicSeq
!= '', 'load_genomic returned empty sequence!'
143 # all entries other than a c g t are set to n
144 no_base
= re
.compile('[^acgt]')
145 genomicSeq
= no_base
.sub('n',genomicSeq
)
148 intervalBegin
= genomicSeq_start
-100
149 intervalEnd
= genomicSeq_stop
+100
151 currentAcc
, currentDon
= getSpliceScores(chr,strand
,intervalBegin
,intervalEnd
)
153 currentAcc
= currentAcc
[100:-98]
154 currentAcc
= currentAcc
[1:]
155 currentDon
= currentDon
[100:-100]
157 length
= len(genomicSeq
)
158 currentAcc
= currentAcc
[:length
]
160 currentDon
= currentDon
+[-inf
]*(length
-len(currentDon
))
162 ag_tuple_pos
= [p
for p
,e
in enumerate(genomicSeq
) if p
>1 and genomicSeq
[p
-1]=='a' and genomicSeq
[p
]=='g' ]
163 gt_tuple_pos
= [p
for p
,e
in enumerate(genomicSeq
) if p
>0 and p
<len(genomicSeq
)-1 and e
=='g' and (genomicSeq
[p
+1]=='t' or genomicSeq
[p
+1]=='c')]
165 assert ag_tuple_pos
== [p
for p
,e
in enumerate(currentAcc
) if e
!= -inf
and p
> 1], pdb
.set_trace()
166 assert gt_tuple_pos
== [p
for p
,e
in enumerate(currentDon
) if e
!= -inf
and p
> 0], pdb
.set_trace()
167 assert len(currentAcc
) == len(currentDon
)
169 return genomicSeq
, currentAcc
, currentDon
171 # build reverse complement if on negative strand
173 fn
= 'chr%d.dna.flat' % chr
174 filename
= os
.path
.join(dna_flat_files
,fn
)
175 cmd
= 'wc -c %s | cut -f1 -d \' \'' % filename
177 obj
= subprocess
.Popen(cmd
,shell
=True,stdout
=subprocess
.PIPE
,stderr
=subprocess
.PIPE
)
178 out
,err
= obj
.communicate()
181 print 'Error occurred while trying to obtain file size'
184 intervalBegin
= genomicSeq_start
-100
185 intervalEnd
= genomicSeq_stop
+100
188 currentAcc
, currentDon
= getSpliceScores(chr,strand
,intervalBegin
,intervalEnd
,total_size
)
190 currentAcc
= currentAcc
[100:-98]
191 currentAcc
= currentAcc
[1:]
192 currentDon
= currentDon
[100:-100]
194 length
= len(genomicSeq
)
195 currentAcc
= currentAcc
[:length
]
196 currentAcc
= currentAcc
+[-inf
]*(length
-len(currentAcc
))
198 currentDon
= currentDon
+[-inf
]*(length
-len(currentDon
))
200 ag_tuple_pos
= [p
for p
,e
in enumerate(genomicSeq
) if p
>1 and p
<len(genomicSeq
) and genomicSeq
[p
-1]=='a' and genomicSeq
[p
]=='g']
201 gt_tuple_pos
= [p
for p
,e
in enumerate(genomicSeq
) if p
>0 and p
<len(genomicSeq
)-1 and e
=='g' and (genomicSeq
[p
+1]=='t' or genomicSeq
[p
+1]=='c')]
203 assert ag_tuple_pos
== [p
for p
,e
in enumerate(currentAcc
) if e
!= -inf
and p
> 1], pdb
.set_trace()
204 assert gt_tuple_pos
== [p
for p
,e
in enumerate(currentDon
) if e
!= -inf
and p
> 0], pdb
.set_trace()
205 assert len(currentAcc
) == len(currentDon
), pdb
.set_trace()
207 return genomicSeq
, currentAcc
, currentDon
213 dna_flat_files
= '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
215 for chromo
in range(1,6):
216 p_seq
,_p_acc
,_p_don
= get_seq_and_scores(chromo
,'+',start
,stop
,dna_flat_files
)
217 n_seq
,n_acc
,n_don
= get_seq_and_scores(chromo
,'-',start
,stop
,dna_flat_files
)
219 print p_seq
== reverse_complement(n_seq
)
223 if __name__
== '__main__':