SHAPE files plus simulation script
[synmut.git] / scripts / modules / parser_SHAPE.py
1 # vim: fdm=indent
2 '''
3 author: Fabio Zanini
4 date: 03/07/12
5 content: Parser for the SHAPE reactivity data, from doi:10.1038/nature08237.
6 '''
7 # Modules
8 import numpy as np
9 import re
10
11
12 # Globals
13 helixfile = '../data/SHAPE/nature08237-s2.txt'
14 bpseqfile = '../data/SHAPE/nature08237-s2_bpseq.txt'
15 ppfile = '../data/SHAPE/nature08237-s3.txt'
16
17
18
19 # Classes
20 class secondary(object):
21 '''Store the SHAPE reactivities and pairing probs.'''
22
23 Shankarappa_indices = [6558, 7173]
24
25
26 def __init__(self):
27 '''Store sequence, reactivities, pairing probs, and map.'''
28 self.r, self.p = np.genfromtxt(ppfile,
29 usecols=(2,3),
30 delimiter='\t',
31 unpack=True,
32 usemask=True)
33
34 self.sequence = np.genfromtxt(ppfile,
35 dtype='S1',
36 usecols=[1],
37 delimiter='\t',
38 unpack=True,
39 usemask=False)
40
41 # Convert to DNA notation
42 self.sequence[self.sequence == 'U'] = 'T'
43
44 # Save map in a masked array
45 bp1, bp2 = np.loadtxt(bpseqfile, unpack=True, usecols=(0,2), dtype=int)
46 self.map = np.ma.masked_all(len(self.sequence), int)
47 self.map[bp1 - 1] = bp2 - 1
48
49
50 def __repr__(self):
51 return 'secondary structure class of HIV-1 NL4-3'
52
53
54
55 # Functions
56 def assign_prob(seq, include_reactivity=False):
57 '''Create a masked array with pairing probability and reactivity.
58
59 Gap positions in the input sequence will stay masked.
60 Sites where the reactivity has not been measured will stay masked as well.
61 '''
62 # Create secondary structure object
63 sec = secondary()
64 s = ''.join(sec.sequence)
65
66 # Initialize
67 assp = np.ma.masked_all(len(seq))
68 assr = np.ma.masked_all(len(seq))
69
70 # Short names
71 p = sec.p
72 r = sec.r
73
74 # Check input string
75 if not isinstance(seq, basestring):
76 seq = ''.join(seq)
77
78 # Take the ungapped sequence and see whether the input sequence is
79 # compatible with that one (sanity check)
80 start = s.find(re.sub('-','', seq))
81 if start == -1:
82 raise ValueError('Sequence not found!')
83
84 # Fill results
85 ii = start
86 for i in xrange(len(assp)):
87 if seq[i] != '-':
88 assp[i] = p[ii]
89 assr[i] = r[ii]
90 ii += 1
91 if include_reactivity:
92 return (assp, assr)
93 else:
94 return assp
95
96
97 def assign_map(seq):
98 '''Create a masked array with pairing map.
99
100 Gap positions in the input sequence will stay masked. Non-paired positions
101 will also be masked. Please look at the gaps in the sequence to distinguish
102 between the two.
103
104 Note: this algorithm is more involved than the one for pairing
105 probabilities, because the second pairing partner is a index and must take
106 into account gaps.
107
108 Note: this function sets the pairing partner to -1 if it is outside the
109 sequenced region.
110 '''
111 # Create secondary structure object
112 sec = secondary()
113 s = ''.join(sec.sequence)
114
115 # Initialize
116 ass = np.ma.masked_all(len(seq), int)
117 attr = sec.map
118
119 # Check input string
120 if not isinstance(seq, basestring):
121 seq = ''.join(seq)
122
123 # Take the ungapped sequence and see whether the input sequence is
124 # compatible with that one (sanity check)
125 seqng = re.sub('-','',seq)
126 start = s.find(seqng)
127 if start == -1:
128 raise ValueError('Sequence not found!')
129
130 # Create index map, so that
131 # ind[nongapped] = gapped
132 # and of course gapped >= nongapped
133 ind = np.zeros(len(seqng), int)
134 ii = start
135 for i in xrange(len(seq)):
136 if seq[i] != '-':
137 ind[ii - start] = i
138 ii += 1
139
140 # Fill results
141 ii = 0
142 for i in xrange(len(seq)):
143 if seq[i] != '-':
144 bp2 = attr[start + ii]
145 # Check whether there is any pairing
146 if (bp2 is not np.ma.masked):
147 # Check whether the pairing is WITHIN the sequenced region
148 if (bp2 - start > 0) and (bp2 - start < len(seqng)):
149 ass[i] = ind[bp2 - start]
150 # Otherwise set a special flag (-1)
151 else:
152 ass[i] = -1
153 ii += 1
154 return ass
155
156
157
158 def helix_to_bpseq():
159 '''Convert the helix file into bpseq format'''
160 # Create secondary structure object
161 sec = secondary()
162
163 # Load helix file
164 h1, h2, hls = np.loadtxt(helixfile, unpack=True, dtype=int)
165
166 # Total number of base pairs
167 L = np.sum(hls)
168
169 # Store bpseq
170 bp1 = np.zeros(2 * L, int)
171 bpa = np.zeros(2 * L, 'S1')
172 bp2 = np.zeros(2 * L, int)
173 ibp = 0
174 for i in xrange(len(hls)):
175 hl = hls[i]
176 bp1[ibp: ibp + hl] = h1[i] + np.arange(hl)
177 bp2[ibp: ibp + hl] = h2[i] - np.arange(hl)
178 bpa[ibp: ibp + hl] = sec.sequence[bp1[ibp: ibp + hl] - 1]
179 ibp += hl
180
181 # Mirror image
182 bp1[ibp: ibp + hl] = h2[i] - hl + 1 + np.arange(hl)
183 bp2[ibp: ibp + hl] = h1[i] + hl - 1 - np.arange(hl)
184 bpa[ibp: ibp + hl] = sec.sequence[bp1[ibp: ibp + hl] - 1]
185 ibp += hl
186
187 # Sort them by the first column
188 ind_sort = np.argsort(bp1)
189 bp1 = bp1[ind_sort]
190 bp2 = bp2[ind_sort]
191 bpa = bpa[ind_sort]
192
193 # Write to output file
194 with open(bpseqfile, 'w') as f:
195 f.write('# RNA secondary structure pairing map from Watts et al. (2009), bpseq format')
196 f.write('\n')
197 f.write('# Reference sequence NL4-3')
198 f.write('\n')
199 for i in xrange(2 * L):
200 f.write(str(bp1[i]))
201 f.write('\t')
202 f.write(bpa[i])
203 f.write('\t')
204 f.write(str(bp2[i]))
205 f.write('\n')