+ some bugfixes
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # This module holds all functions for queries on the dna flat files and the
6 # splice score files.
7 # Furthermore it contains some utility functions such as reverse complement and
8 # unbracketing est strings
9 #
10
11 import os
12 import pdb
13 import re
14 import subprocess
15
16 from numpy.matlib import inf
17
18 from Genefinding import *
19 from genome_utils import load_genomic
20
21 extended_alphabet = ['-','a','c','g','t','n','[',']']
22 alphabet = ['-','a','c','g','t','n']
23
24
25 class DataAccessionException:
26 pass
27
28
29 def get_flatfile_size(filename):
30 cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
31 obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
32 out,err = obj.communicate()
33
34 if err != '':
35 print 'Error occurred while trying to obtain file size'
36 raise DataAccessionException
37
38 return int(out)
39
40
41 def reverse_complement(seq):
42 """
43 This function takes a read in plain or bracket notation and returns the
44 reverse complement of it.
45 I.e.
46
47 est = cgt[ac][tg]a
48
49 leads to
50
51 rev_comp = t[ac][tg]acg
52 """
53
54 bpos = seq.find('[')
55 rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
56
57 # check first whether seq contains no brackets at all
58 if bpos == -1:
59 ret_val = map(rc,seq)
60 ret_val.reverse()
61 ret_val = "".join(ret_val)
62 else:
63 brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
64
65 # first_part is the part of the seq up to the first occurrence of a
66 # bracket
67 first_part = seq[:bpos]
68 first_part = map(rc,first_part)
69 first_part.reverse()
70 first_part = "".join(first_part)
71
72 # inside brackets has to be complemented but NOT reversed
73 inside_brackets = seq[bpos+1:bpos+3]
74 inside_brackets = "".join(map(rc,inside_brackets))
75
76 ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
77
78 return ret_val
79
80
81 def unbracket_seq(seq):
82 """
83 This function takes a read in bracket notation and restores the read sequence from it.
84 I.e.
85
86 seq = cgt[ac][tg]aa
87
88 leads to
89
90 result = cgtcgaa
91
92 so the second entry within brackets is the base on the read whereas the first
93 entry is the base from the dna.
94 """
95
96 new_seq = ''
97 e = 0
98
99 while True:
100 if e >= len(seq):
101 break
102
103 if seq[e] == '[':
104 new_seq += seq[e+2]
105 e += 4
106 else:
107 new_seq += seq[e]
108 e += 1
109
110 return "".join(new_seq).lower()
111
112
113 def create_bracket_seq(dna_seq,read_seq):
114 """
115 This function takes a dna sequence and a read sequence and returns the
116 bracket format of the match/mismatches i.e.
117
118 dna : aaa
119 read: aac
120 is written in bracket notation: aa[ac]
121 """
122 assert len(dna_seq) == len(read_seq),pdb.set_trace()
123 return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
124
125
126 def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''):
127 """
128 Now we want to use interval_query to get the predicted splice scores trained
129 on the TAIR sequence and annotation.
130 """
131
132 size = intervalEnd-intervalBegin
133 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
134
135 acc = size*[0.0]
136 don = size*[0.0]
137
138 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
139 pos_size = new_intp()
140 intp_assign(pos_size,1)
141
142 # fetch acceptor scores
143 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
144 acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
145
146 #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
147 #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
148
149 # fetch donor scores
150 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
151 don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
152
153 #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
154 #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
155 #pdb.set_trace()
156
157 return acc, don
158
159
160 def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
161 chromo_string = 'chr%d' % chromo
162 #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
163 #print 'loading %s...' % chromo_string
164 genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False)
165 genomicSeq = genomicSeq.strip().lower()
166
167 # check the obtained dna sequence
168 assert genomicSeq != '', 'load_genomic returned empty sequence!'
169
170 # all entries other than a c g t are set to n
171 no_base = re.compile('[^acgt]')
172 genomicSeq = no_base.sub('n',genomicSeq)
173
174 return genomicSeq
175
176
177 def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
178 """
179 This function expects an interval, chromosome and strand information and
180 returns then the genomic sequence of this interval and the associated scores.
181 """
182 # Because splice site scores are predicted using a window of the sequence the
183 # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
184 # do not have any score predictions
185 NO_SCORE_WINDOW_SIZE = 205
186
187 assert genomicSeq_start < genomicSeq_stop
188
189 genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files)
190
191 if only_seq:
192 return genomicSeq
193
194 filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo)
195 total_size = get_flatfile_size(filename)
196
197 # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
198 lookup_offset = 0
199 intervalBegin = max(genomicSeq_start-lookup_offset,0)
200 intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1)
201
202 if chromo > 7:
203 print intervalBegin,intervalEnd,total_size
204 currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
205 currentDon = [-inf]+currentDon[:-1]
206 else:
207 currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
208 currentAcc = currentAcc[2:]
209 currentDon = currentDon[1:]
210
211 length = len(genomicSeq)
212 currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
213 currentDon = currentDon+[-inf]*(length-len(currentDon))
214
215 #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
216 #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
217
218 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
219 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
220
221 #pdb.set_trace()
222 #return genomicSeq, currentAcc, currentDon
223
224 for pos in ag_tuple_pos:
225 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
226 currentAcc[pos] = 1e-6
227
228 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
229 currentAcc[pos] = 1e-6
230
231 for pos in gt_tuple_pos:
232 if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
233 currentDon[pos] = 1e-6
234
235 if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
236 currentDon[pos] = 1e-6
237
238 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
239 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
240 assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
241
242 return genomicSeq, currentAcc, currentDon