scripts for supplementary
[synmut.git] / scripts / conservation_syn_nonsyn_subtypeB.py
1 # vim: fdm=indent
2 '''
3 author: Fabio Zanini
4 date: 07/12/12
5 content: Measure the level of conservation of synonymous and nonsynonymous
6 mutations in subtype B. Nonsynonymous mutations are expected to be
7 more broadly conserved, because they impair protein function.
8 '''
9 # Modules
10 import sys
11 from collections import Counter
12 import numpy as np
13 import matplotlib.pyplot as plt
14 from matplotlib.patches import Rectangle
15
16 # Custom modules
17 import modules.alignment as alignment
18 from modules.alphabet import alpha, alphal_ng
19 from modules.codon_usage import all_codons
20 from modules.codon_usage import translate as transstrict
21 from modules.helper import translate
22 from modules.parser_reference import seq_patterns, find_with_gaps
23
24
25
26 # Globals
27 gene = 'gag'
28 reference = 'SHAPE'
29 seq_patterns = seq_patterns[reference]
30
31
32
33 # Functions
34 def smooth(x, f):
35 l = len(x)
36 x = x[:l - l % f]
37 x = x.reshape((len(x) / f, f)).mean(axis=1)
38 return x
39
40
41 def codon_single_mutants_synnonsyn(codon):
42 codon = list(codon)
43 aa = transstrict(''.join(codon))
44 syn = []
45 nonsyn = []
46 for pos in xrange(3):
47 for mut in alphal_ng:
48 if mut != codon[pos]:
49 tmp = list(codon)
50 tmp[pos] = mut
51 tmp = ''.join(tmp)
52 if transstrict(tmp) == aa:
53 syn.append(tmp)
54 else:
55 nonsyn.append(tmp)
56 return {'syn': syn, 'nonsyn': nonsyn}
57
58
59
60
61 # Script
62 if __name__ == '__main__':
63
64 # Load data for a certain gene
65 ali = alignment.MSA(gene=gene, reference=reference)
66 msaraw = np.array(ali.MSA)
67 afs = ali.allele_frequencies()
68
69 # Filter only part of pol
70 if gene == 'pol':
71 msaraw = msaraw[:, :2718]
72 afs = afs[:, :2718]
73
74 # Find consensus sequence
75 consind = afs.argmax(axis=0)
76 consensus = alpha[consind]
77
78 # Some seqs are not viable and include frameshifts that mess up the
79 # translation, hence we restrict to positions for which gaps are a minority
80 is_gap = consensus == '-'
81 # Exclude full codons
82 tmp = np.unique(is_gap.nonzero()[0] / 3)
83 is_gap[np.concatenate([tmp * 3, tmp * 3 +1, tmp * 3 + 2])] = True
84
85 # Exclude stop codons
86 is_stop = np.zeros_like(is_gap)
87 tmp = (translate(consensus) == '*').nonzero()[0]
88 is_stop[np.concatenate([tmp * 3, tmp * 3 +1, tmp * 3 + 2])] = True
89
90 ## Plot base prevalence
91 #for i in xrange(4):
92 # plt.plot(np.arange(len(consensus)), afs[i],
93 # lw=1.5, alpha=0.5)
94 #plt.xlim(0, 2600)
95 #plt.ylim(-0.05, 1.25)
96 #plt.xlabel('position in '+gene)
97 #plt.ylabel('allele frequency')
98 #plt.legend(alpha, loc=9)
99
100 # Good are codons with no gaps and no stops
101 is_good = (-is_gap) & (-is_stop)
102
103 # Get all syn and nonsyn single mutants of all codons
104 sms = {cod: codon_single_mutants_synnonsyn(cod) for cod in all_codons()}
105
106 # For each codon, calculate the number of syn and nonsyn changes from the
107 # consensus
108 msa = msaraw[:, is_good]
109 conscods = consensus[is_good].reshape((is_good.sum() / 3, 3))
110 # Count the number of sequences that enter the statistics for each codon, to
111 # counterbalance for ambiguous sequences
112 nseqs = msa.shape[0] * np.ones(len(conscods))
113 # Prepare the structure to save the counts (columns: not changed, syn, nonsyn)
114 counts3 = np.zeros((len(conscods), 3), int)
115
116 # Run over consensus codons and count
117 for i, conscod in enumerate(conscods):
118 conscod = ''.join(conscod)
119 smscod = sms[conscod]
120 codsall = map(''.join, msa[:, i * 3: (i + 1) * 3])
121 counts = Counter(codsall)
122 for (cod, abundance) in counts.iteritems():
123 if cod == conscod:
124 counts3[i, 0] += abundance
125 elif cod in smscod['syn']:
126 counts3[i, 1] += abundance
127 elif cod in smscod['nonsyn']:
128 counts3[i, 2] += abundance
129
130 # Calculate the number of syn and nonsyn for each codon along the consensus
131 nsmscons = {'syn': [],
132 'nonsyn': []}
133 for cod in conscods:
134 tmp = sms[''.join(cod)]
135 for key in ['syn', 'nonsyn']:
136 nsmscons[key].append(len(tmp[key]))
137 for key in ['syn', 'nonsyn']:
138 nsmscons[key] = np.array(nsmscons[key])
139
140 # Take e.g. four-fold degenerate codons (as SMs only) and calculate frequencies
141 is_deg = nsmscons['syn'] == 3
142 nsmscons2 = np.vstack([nsmscons['syn'], nsmscons['nonsyn']]).T
143 freqs2 = 1.0 * counts3[:, 1:][is_deg] / nsmscons2[is_deg] / msa.shape[0]
144
145 ## Plot a histogram of conservation levels
146 #plt.figure()
147 #plt.hist(freqs2,color=('r', 'b'), bins=np.linspace(0, 0.04, 20),
148 # normed=True, label=('syn', 'nonsyn'))
149 #plt.xlabel('diversity')
150 #plt.ylabel('histogram')
151 #plt.title('diversity at 4-fold degen codons, '+gene)
152
153 # Plot cumulative
154 freqsyn = np.sort(freqs2[:,0])
155 freqnonsyn = np.sort(freqs2[:,1])
156 fig = plt.figure(figsize=[6,4.6])
157 plt.plot(freqsyn, np.linspace(0, 1, len(freqsyn)), c='r', lw=2, label='syn')
158 plt.plot(freqnonsyn, np.linspace(0, 1, len(freqnonsyn)), c='b', lw=2, label='nonsyn')
159 plt.xlabel('diversity')
160 plt.ylabel('cumulative')
161 plt.xlim(0, 0.04)
162 plt.legend(loc=4)
163 plt.title('diversity at 4-fold degen codons, '+gene)
164 line = plt.plot([0.003] * 2, [0.08, 0.60], c='k', lw=1.5, ls='--')
165 txt = plt.text(0.0035, 0.5, r'$\Delta \approx 0.52$', fontsize=14)
166
167 plt.ion()
168 plt.show()