scripts for supplementary
[synmut.git] / scripts / modules / parser_reference.py
1 # vim: fdm=indent
2 '''
3 author: Fabio Zanini
4 date: 04/09/12
5 content: Implement some globals used by many parsers that interact with a
6 reference.
7 '''
8 # Modules
9 import numpy as np
10 import Bio.AlignIO as AlignIO
11 from itertools import permutations
12
13 from longitudinal.classes.helper import find_with_gaps
14 from longitudinal.classes.parser_generic import alpha
15
16
17
18 # Globals
19 # these are derived from each other using MSA.corresponding_pattern
20 # (and ultimately by hand from the horrible XLS file for HXB2)
21 seq_patterns = {'SHAPE':{'V1': ['TGCACTGATTTGAA', 'TGCTCTTTCAATATCAGC'],
22 'V2': ['TGCTCTTTCAATATCAGC', 'TGTAACACCTCAGTCAT'],
23 'V3': ['TGTACAAGACCCAACAACA', 'TGTAACATTAGTAGA'],
24 'V4': ['TGTAATTCAACACAACTGTTTAA', 'TGCAGAATAAAACAATT'],
25 'V5': ['TAATAACAACAATGGGTCCGAGAT', 'CCTGGAGGA'],
26 'env_signal': ['ATGAGAGTGAAGGAGAA', 'ACAGAAAAATTGTGGGTC']},
27 'NL4-3':{'V1': ['TGCACTGATTTGAA', 'TGCTCTTTCAATATCAGC'],
28 'V2': ['TGCTCTTTCAATATCAGC', 'TGTAACACCTCAGTCAT'],
29 'V3': ['TGTACAAGACCCAACAACA', 'TGTAACATTAGTAGA'],
30 'V4': ['TGTAATTCAACACAACTGTTTAA', 'TGCAGAATAAAACAATT'],
31 'V5': ['TAATAACAACAATGGGTCCGAGAT', 'CCTGGAGGA']},
32 'HXB2': {'V1': ['TGCACTGATTTGAA', 'TGCTCTTTCAATATCAGC'],
33 'V2': ['TGCTCTTTCAATATCAGC', 'TGTAACACCTCAGTCAT'],
34 'V3': ['TGTACAAGACCCAACAACA', 'TGTAACATTAGTAGA'],
35 'V4': ['TGTAATTCAACACAACTGTTTAA', 'TGCAGAATAAAACAAAT'],
36 'V5': ['TAATAGCAACAATGAGTCCGAGAT', 'CCTGGAGGA'],
37 'env_signal': ['ATGAGAGTGAAGGAGAA', 'ACAGAAAAATTGTGGGTC']}
38 }
39
40 genes = {'SHAPE': {'gag': ['ATGGGTGCGAGAGCGTCGGTA', 'AGATAGGGGGGCAATTAA'],
41 'pol': ['CCTCAGATCACTCTTTGGCAGCGACCCC', 'CACATGGAAAAGATTAGTA'],
42 'env': ['ATGAGAGTGAAGGAG', 'GATGGGTGGCAAGTG'],
43 'nef': ['ATGGGTGGCAAGTG', 'CATCGAGCTTGCTACAAGGG']},
44 'HXB2': {'gag': ['ATGGGTGCGAGAGCGTCAGTA', 'AGATAGGGGGGCAACTAA'],
45 'pol': ['CCTCAGGTCACTCTTTGGCAACGACCCC', 'AACATGGAAAAGTTTAGTA'],
46 'env': ['ATGAGAGTGAAGGAG', 'GATGGGTGGCAAGTG'],
47 'nef': ['ATGGGTGGCAAGTG', 'CATCGAGCTTGCTACAAG']},
48 }
49
50 datadir = '/home/fabio/university/phd/longitudinal/data/reference'
51
52
53
54 class MSA(object):
55 '''An MSA of the given references.
56
57 Note: the MSA must already exist, we are not calling MUSCLE here for reasons
58 of clarity (although we could).
59 '''
60
61 def __init__(self, references):
62 '''Initialize'''
63 for tmplist in permutations(references):
64
65 datafile = datadir+'/'+'_'.join(tmplist)
66 if len(references) > 1:
67 datafile = datafile+'_aligned'
68 datafile = datafile+'.fasta'
69
70 try:
71 self.MSA = AlignIO.read(datafile, format='fasta')
72 break
73 except IOError:
74 pass
75
76 if not hasattr(self, 'MSA'):
77 raise IOError('Reference alignment not found')
78
79
80 def __repr__(self):
81 return self.MSA.__repr__()
82
83
84 def __str__(self):
85 return self.MSA.__str__()
86
87
88 def corresponding_pattern(self, pattern, origin, target):
89 L = len(pattern)
90 fo = lambda x: origin in self.MSA[x].name
91 ft = lambda x: target in self.MSA[x].name
92 iseqo = filter(fo, xrange(len(self.MSA)))[0]
93 iseqt = filter(ft, xrange(len(self.MSA)))[0]
94
95 index = find_with_gaps(str(self.MSA[iseqo].seq), pattern)
96 if index == (-1):
97 raise ValueError('Pattern not found in origin')
98
99 tmplist = []
100 while (len(tmplist) < L) and (index < self.MSA.get_alignment_length()):
101 base = self.MSA[iseqt, index]
102 if base != '-':
103 tmplist.append(base)
104 index += 1
105
106 return ''.join(tmplist)
107
108
109