0f9bf2986dea9913880a1d1174bc7f6051ed91a2
[qpalma.git] / tools / io_qpalma.py
1
2 """
3 Functions for parsing palma parameter files,
4 to extract splice site predictors for qpalma.
5 """
6
7 import bz2
8 import sys
9 import numpy
10 from numpy import mat,array,chararray,inf
11
12 class palma_model():
13 """A container for palma parameters"""
14
15 #acceptor
16 acceptor_bins=None
17 acceptor_limits=None
18 acceptor_penalties=None
19
20 acceptor_splice_b=None
21 acceptor_splice_order=None
22 acceptor_splice_window_left=None
23 acceptor_splice_window_right=None
24 acceptor_splice_alphas=None
25 acceptor_splice_svs=None
26
27 #donor
28 donor_bins=None
29 donor_limits=None
30 donor_penalties=None
31
32 donor_splice_b=None
33 donor_splice_order=None
34 #donor_splice_use_gc=None
35 donor_splice_window_left=None
36 donor_splice_window_right=None
37 donor_splice_alphas=None
38 donor_splice_svs=None
39
40 # intron length
41 intron_len_bins=None
42 intron_len_limits=None
43 intron_len_penalties=None
44 intron_len_min=None
45 intron_len_max=None
46 intron_len_transform=None
47
48 gene_len_max = None
49
50 # substitution matrix
51 substitution_matrix=None
52
53
54
55 def parse_file(filename):
56 if filename.endswith('.param.bz2'):
57 fileptr=bz2.BZ2File(filename);
58 else:
59 sys.stderr.write('Expected palma parameter file with ending param.bz2\n')
60 m=palma_model()
61
62 sys.stdout.write("loading model file"); sys.stdout.flush()
63
64 l=fileptr.readline();
65
66 if l != '%palma definition file version: 1.0\n':
67 sys.stderr.write("\nfile not a palma definition file\n")
68 print l
69 return None
70
71 while l:
72 if not ( l.startswith('%') or l.startswith('\n') ): # comment
73
74 #acceptor
75 if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
76 if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, fileptr, 'acceptor_limits')
77 if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, fileptr, 'acceptor_penalties') #DEAD
78
79 if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
80 if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
81 if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
82 if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
83 if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, fileptr, 'acceptor_splice_alphas')
84 if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, fileptr, 'acceptor_splice_svs')
85
86 #donor
87 if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
88 if m.donor_limits is None: m.donor_limits=parse_vector(l, fileptr, 'donor_limits')
89 if m.donor_penalties is None: m.donor_penalties=parse_vector(l, fileptr, 'donor_penalties') #DEAD
90
91 if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
92 if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
93 #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
94 if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
95 if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
96 if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, fileptr, 'donor_splice_alphas')
97 if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, fileptr, 'donor_splice_svs')
98
99
100 # intron length
101 if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
102 if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, fileptr, 'intron_len_limits')
103 if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, fileptr, 'intron_len_penalties')
104 if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
105 if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
106 if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
107
108 # gene length
109 if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
110
111 if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, fileptr, 'substitution_matrix')
112
113 l=fileptr.readline()
114
115 sys.stderr.write('done\n')
116 return m
117
118 def parse_value(line, name):
119 if (line.startswith(name)):
120 #print 'found ' + name
121 sys.stdout.write('.'); sys.stdout.flush()
122 str = line[line.find('=')+1:-1] ;
123 if str[0] == "'":
124 return str[1:-1]
125 else:
126 #print float(str)
127 return float(str)
128 else:
129 return None
130
131 def parse_vector(line, fileptr, name):
132 mat = parse_matrix(line, fileptr, name)
133 if mat is None:
134 return mat
135 else:
136 mat = numpy.array(mat).flatten()
137 return mat
138
139 def parse_matrix(line, fileptr, name):
140 if (line.startswith(name)):
141 sys.stdout.write('.'); sys.stdout.flush()
142 if line.find(']') < 0:
143 l=''
144 while l is not None and l.find(']') < 0:
145 line+=l
146 l=fileptr.readline()
147 if l is not None and l.find(']') >= 0:
148 line+=l
149
150 if line.find(']') < 0:
151 sys.stderr.write("matrix `" + name + "' ended without ']'\n")
152 return None
153 else:
154 return mat(line[line.find('['):line.find(']')+1])
155 else:
156 return None
157
158 def parse_string(line, fileptr, name):
159 if (line.startswith(name)):
160 sys.stdout.write('.'); sys.stdout.flush()
161 l=''
162 lines=[]
163 while l is not None and l.find(']') < 0:
164 if l:
165 lines+=[list(l[:-1])]
166 l=fileptr.readline()
167
168 if l.find(']') < 0:
169 sys.stderr.write("string ended without ']'\n")
170 return None
171 else:
172 #seqlen=len(lines[0])
173 num=len(lines)
174 #trdat = chararray((seqlen,num),1,order='FORTRAN')
175 #for i in xrange(num):
176 # trdat[:,i]=lines[i]
177 trdat = num*[[]]
178 for idx,example in enumerate(lines):
179 trdat[idx] = ''.join(example)
180 return trdat
181 else:
182 return None
183
184 if __name__ == '__main__':
185 #import hotshot, hotshot.stats
186
187 def load():
188 f='/fml/ag-raetsch/share/projects/palma/param_files/arabidopsis_ath1_ss=1_il=1.param.bz2'
189 m=parse_file(f);
190
191 print m.acceptor_bins is None
192 print m.acceptor_limits is None
193 print m.acceptor_penalties is None
194
195 print m.acceptor_splice_b is None
196 print m.acceptor_splice_order is None
197 print m.acceptor_splice_window_left is None
198 print m.acceptor_splice_window_right is None
199 print m.acceptor_splice_alphas is None
200 print m.acceptor_splice_svs is None
201
202 print m.donor_bins is None
203 print m.donor_limits is None
204 print m.donor_penalties is None
205
206 print m.donor_splice_b is None
207 print m.donor_splice_order is None
208 #print m.donor_splice_use_gc is None
209 print m.donor_splice_window_left is None
210 print m.donor_splice_window_right is None
211 print m.donor_splice_alphas is None
212 print m.donor_splice_svs is None
213 print m.intron_len_bins is None
214 print m.intron_len_limits is None
215 print m.intron_len_penalties is None
216 print m.intron_len_min is None
217 print m.intron_len_max is None
218 print m.intron_len_transform is None
219
220 print m.substitution_matrix is None
221
222 print 'intron_len_transform'
223 print m.intron_len_transform
224 print 'substitution_matrix'
225 print m.substitution_matrix
226
227 load()
228
229 #prof = hotshot.Profile("model.prof")
230 #benchtime = prof.runcall(load)
231 #prof.close()
232 #stats = hotshot.stats.load("model.prof")
233 #stats.strip_dirs()
234 #stats.sort_stats('time', 'calls')
235 #stats.print_stats(20)