scripts for supplementary
[synmut.git] / scripts / modules / codon_usage.py
1 # vim: fdm=indent
2 '''
3 author: Fabio Zanini
4 date: 16/10/12
5 content: Read codon usage tables
6 '''
7 # Modules
8 import re
9 from longitudinal.classes.parser_generic import alpha
10 from Bio.Seq import translate
11
12
13
14 # Globals
15 datadir = '/home/fabio/university/phd/general/data/codon_usage/'
16 alpha = list(alpha)
17
18
19
20 # Classes
21 class codon_table(object):
22
23 def __init__(self, organism='homo sapiens'):
24
25 self.organism = organism
26 self.datafile = datadir + re.sub(' ', '_', organism) +'.dat'
27
28 table_within = {}
29 table_abs = {}
30 with open(self.datafile, 'r') as f:
31 for line in f:
32 if line[0] != '#':
33 fields = line.rstrip('\n').split('\t')
34 key = fields[0]
35 aa = fields[1]
36 freqwithin = float(fields[2])
37 freqabs = 1e-3 * float(fields[3])
38
39 table_within[key] = (aa, freqwithin)
40 table_abs[key] = freqabs
41
42 self.within = table_within
43 self.absolute = table_abs
44
45
46
47 # Functions
48 def all_codons():
49 '''Generate a list with all codons'''
50 return ('AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC',
51 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT',
52 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC',
53 'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT',
54 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC',
55 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT',
56 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC',
57 'GGG', 'GGT', 'GTA', 'GTC', 'GTG', 'GTT',
58 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC',
59 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT',
60 'TTA', 'TTC', 'TTG', 'TTT')
61
62
63 def all_amino_acids():
64 '''All amino acids'''
65 return ('A', 'C', 'E', 'D', 'G', 'F',
66 'I', 'H', 'K', '*', 'M', 'L',
67 'N', 'Q', 'P', 'S', 'R', 'T',
68 'W', 'V', 'Y')
69
70
71 def number_of_codons(amino_acid):
72 '''Get the number of codons per amino acid'''
73 d = {'L': 6, 'S': 6, 'R': 6, 'A': 4,
74 'G': 4, 'P': 4, 'T': 4, 'V': 4,
75 'I': 3, '*': 3, 'C': 2, 'E': 2,
76 'D': 2, 'F': 2, 'H': 2, 'K': 2,
77 'N': 2, 'Q': 2, 'Y': 2, 'M': 1,
78 'W': 1}
79 return d[amino_acid]
80
81
82 def volatility(codon):
83 '''Number of nonsilent single mutants'''
84 codon = ''.join(codon)
85 if codon not in all_codons():
86 raise ValueError('Codon not recognized. Remember we are in a DNA world.')
87
88 aa = translate(codon)
89 v = 0
90 for pos in xrange(3):
91 for m in alpha:
92 if m not in ('-', codon[pos]):
93 tmp = list(codon)
94 tmp[pos] = m
95 tmp = ''.join(tmp)
96 v += translate(tmp) != aa
97 return v
98
99
100 def effective_number_of_codons(codonlist):
101 '''Calculate the ENC from a list of codons'''
102 def F(n, codonlist):
103 '''The probaibility that two codons are the same in the list'''
104 import numpy as np
105 from collections import Counter
106
107 Ps = []
108 for aa in all_amino_acids():
109 if number_of_codons(aa) == n:
110 cods = filter(lambda x: translate(x) == aa, codonlist)
111 if cods:
112 P = 1.0 * np.array(Counter(cods).values()) / len(cods)
113 Ps.append(np.dot(P, P))
114 if Ps:
115 return np.mean(Ps)
116 else:
117 return None
118
119 codonlist = map(''.join, codonlist)
120 F2 = F(2, codonlist)
121 F3 = F(3, codonlist)
122 F4 = F(4, codonlist)
123 F6 = F(6, codonlist)
124 ENC = 2
125 if F2 is not None:
126 ENC += 9 / F2
127 if F3 is not None:
128 ENC += 1 / F3
129 if F4 is not None:
130 ENC += 5 / F4
131 if F6 is not None:
132 ENC += 3 / F6
133
134 return ENC
135
136
137
138
139
140 # Script
141 if __name__ == '__main__':
142
143 table = codon_table()