+ minor mods
[qpalma.git] / tools / splicesites.py
1
2 import sys
3 import os
4 import bz2
5 import numpy
6 import string
7 import io_qpalma
8
9 from shogun.Classifier import SVM
10 from shogun.Features import StringCharFeatures,DNA
11 from shogun.Kernel import WeightedDegreeStringKernel
12
13 import cPickle
14
15 from qpalma.DataProc import *
16 from qpalma.TrainingParam import Param
17
18
19 class splicesites():
20 """splice site scores (both donor and acceptor)"""
21
22 def __init__(self):
23 self.model = None
24 self.signal = None
25 self.donor_splice_use_gc = None
26
27 def load_model(self, filenames, parameters):
28 """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
29 try:
30 param_file = file(filenames['data_dir']+filenames['param'])
31 except IOError:
32 sys.stderr.write("Can't open PARAM FILE (%s) to read\n" %(filenames['data_dir']+filenames['param']))
33 sys.exit(2)
34
35 #<------------------------------------------------------------------------------>
36 filename = filenames['data_dir']+filenames['param']
37 self.model = io_qpalma.parse_file(filename)
38
39 self.donor_splice_use_gc = parameters['use_gc'] ;
40 #self.plif=plif(self.model)
41 if parameters['use_splicesites']:
42 self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
43
44
45 def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
46 splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2' ;
47 #if os.path.isfile(splicescores_picklefile):
48 if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
49 print 'loading splice site predictions from ' + splicescores_picklefile
50 splicescores = self.load_splicescores_index(splicescores_picklefile)
51 splicescores_precomputed = True
52 else:
53 splicescores = {} ;
54 splicescores_precomputed = False
55
56 #get donor/acceptor signal predictions for all sequences
57 if parameters['use_splicesites']:
58 seqs= dict2seqlist(dna_fasta_dict)
59 self.signal.predict_acceptor_sites_from_seqdict(seqs)
60 self.signal.predict_donor_sites_from_seqdict(seqs)
61
62
63 for counter in xrange(len(dna_fasta_dict.keys())):
64 # DNA and mRNA sequence
65 if parameters['use_splicesites'] and not splicescores_precomputed:
66 seq = seqs[counter]
67 tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]] # dna_sequences is list of strings
68 #assert(seq.sequence == tSeq)
69
70 tName = dna_fasta_dict.keys()[counter]
71 tSize = len(tSeq)
72
73 # Splice site scores
74 if parameters['use_splicesites']:
75 if splicescores_precomputed:
76 assert(splicescores.has_key(tName + '_acceptor_splicescores'))
77 assert(splicescores.has_key(tName + '_donor_splicescores'))
78 else:
79 acc_support_values = [-numpy.inf]*tSize
80 for i in xrange(len(seq.preds['acceptor'].positions)):
81 acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i] ;
82
83 don_support_values = [-numpy.inf]*tSize
84 for i in xrange(len(seq.preds['donor'].positions)):
85 don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i] ;
86 else:
87 don_support_values = [-numpy.inf]*tSize
88 for cnt in range(tSize-1):
89 if (tSeq[cnt] in 'Gg') \
90 and ((tSeq[cnt+1] in 'Tt') \
91 or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
92 don_support_values[cnt] = 1
93 assert(len(don_support_values)==tSize)
94
95 acc_support_values = [-numpy.inf]*tSize
96 for cnt in range(tSize-1):
97 if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
98 acc_support_values[cnt+1] = 1
99 assert(len(acc_support_values)==tSize)
100
101 if not splicescores.has_key(tName + '_acceptor_splicescores'):
102 splicescores[tName + '_acceptor_splicescores'] = acc_support_values
103
104 if not splicescores.has_key(tName + '_donor_splicescores'):
105 splicescores[tName + '_donor_splicescores'] = don_support_values
106
107
108 # write the splice site predictions into a pickle file
109 if (not splicescores_precomputed) and (parameters['use_splicesites']) and parameters['save_splicescores']:
110 print 'saving splice site scores to ' + splicescores_picklefile
111 self.save_splicescores(splicescores_picklefile, splicescores)
112
113
114 return splicescores
115
116
117
118 def get_splicescores(self, index, key, start, end):
119 if not index.has_key('XXX_palma_filename'):
120 seq = index[key] ;
121 if end==start==0:
122 return seq
123 else:
124 return seq[start:end]
125 else:
126 (num, fstart, fend) = index[key] ;
127 filename = index['XXX_palma_filename']
128 if filename.endswith('.bz2'):
129 f = bz2.BZ2File(filename, 'rb');
130 else:
131 if filename.endswith('.gz'):
132 f = gzip.GzipFile(filename, 'rb');
133 else:
134 f = file(filename, 'rb')
135
136 f.seek(fstart + start*4, 0) # sizeof float = 4!?
137 ss = numpy.array('f') ;
138 if end==start==0:
139 ss.read(f, num)
140 else:
141 ss.read(f, end-start)
142
143 return ss.tolist()
144
145 def compute_donacc(self,splicescores,tName):
146 """Return two lists of doubles corresponding
147 to the donor and acceptor splice site scores respectively
148 """
149 acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
150 don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)
151
152 return (don_support_values,acc_support_values)
153
154
155
156
157
158 class signal_detectors():
159 def __init__(self, model, donor_splice_use_gc):
160 if donor_splice_use_gc:
161 donor_consensus=['GC','GT']
162 else:
163 donor_consensus=['GT']
164
165 self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
166 numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b,
167 (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
168 self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs,
169 numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b,
170 (model.donor_splice_window_left, 0, model.donor_splice_window_right),
171 donor_consensus)
172
173 def predict_acceptor_sites_from_seqdict(self, seqs):
174 self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
175 sys.stderr.write("computing svm output for acceptor positions\n")
176 self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
177
178 def predict_donor_sites_from_seqdict(self, seqs):
179 self.donor.get_positions_from_seqdict(seqs, 'donor')
180 sys.stderr.write("computing svm output for donor positions\n")
181 self.donor.get_predictions_from_seqdict(seqs, 'donor')
182
183
184
185 class svm_splice_model():
186 """The SVM for predicting splice site scores"""
187
188 def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
189
190 f=StringCharFeatures(DNA)
191 f.set_string_features(traindat)
192 wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
193 wd_kernel.io.set_target_to_stderr()
194
195 self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
196 self.svm.io.set_target_to_stderr()
197 self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
198 #self.svm.set_linadd_enabled(False)
199 #self.svm.set_batch_computation_enabled(true)
200
201 self.window_left=int(window_left)
202 self.window_right=int(window_right)
203
204 self.consensus=consensus
205 self.wd_kernel=wd_kernel
206 self.traindat=f
207 self.offset = offset ;
208
209
210 def get_predictions_from_seqdict(self, seqdic, site):
211 """ we need to generate a huge test features object
212 containing all locations found in each seqdict-sequence
213 and each location (this is necessary to efficiently
214 (==fast,low memory) compute the splice outputs
215 """
216
217 #seqlen=self.window_right+self.window_left+2
218
219 num=0
220 for s in seqdic:
221 num+= len(s.preds[site].positions)
222
223 #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
224 testdat = num*[[]]
225
226 k=0
227 si = 0 ;
228 for s in seqdic:
229 sequence=s.seq
230 positions=s.preds[site].positions
231 si += 1
232 for j in xrange(len(positions)):
233 if len(positions)>50000 and j%10000==0:
234 sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
235 i=positions[j] - self.offset
236 start = i-self.window_left
237 if start<0:
238 s_start='A'*(-start)
239 start = 0;
240 else:
241 s_start = ''
242 stop = i+self.window_right+2
243 if stop>len(sequence):
244 s_stop = 'A'*(stop-len(sequence) +1)
245 stop=len(sequence) - 1 ;
246 else:
247 s_stop = '' ;
248 s= s_start + sequence[start:stop] + s_stop
249 #print len(s)
250 #s=sequence[i-self.window_left:i+self.window_right+2]
251 #testdat[:,k]=list(s)
252 testdat[k]=s
253 k+=1
254
255 if len(positions)>50000:
256 sys.stderr.write('\n')
257
258 t=StringCharFeatures(DNA)
259 t.set_string_features(testdat)
260
261 self.wd_kernel.init(self.traindat, t)
262 l=self.svm.classify().get_labels()
263 sys.stderr.write("\n...done...\n")
264
265 k=0
266 for s in seqdic:
267 num=len(s.preds[site].positions)
268 scores= num * [0]
269 for j in xrange(num):
270 scores[j]=l[k]
271 k+=1
272 s.preds[site].set_scores(scores)
273
274
275
276 def get_positions_from_seqdict(self, seqdic, site):
277 for d in seqdic:
278 positions=list()
279 sequence=d.seq
280 for cons in self.consensus:
281 l=sequence.find(cons)
282 while l>-1:
283 #if l<len(sequence)-self.window_right-2 and l>self.window_left:
284 positions.append(l+self.offset)
285 l=sequence.find(cons, l+1)
286 positions.sort()
287 d.preds[site].set_positions(positions)
288
289 class predictions(object):
290 def __init__(self, positions=None, scores=None):
291 self.positions=positions
292 self.scores=scores
293
294 def set_positions(self, positions):
295 self.positions=positions;
296 def get_positions(self):
297 return self.positions
298
299 def set_scores(self, scores):
300 self.scores=scores
301 def get_scores(self):
302 return self.scores
303
304 def __str__(self):
305 return 'positions: ' + `self.positions` + 'scores: ' + `self.scores`
306 def __repr__(self):
307 return self.__str__()
308
309 class sequence(object):
310 def __init__(self, name, seq, (start,end)):
311 assert(start<end<=len(seq))
312 self.start=start
313 self.end=end
314 self.name=name
315 self.seq=seq
316 self.preds=dict()
317 self.preds['acceptor']=predictions()
318 self.preds['donor']=predictions()
319
320 def __str__(self):
321 s="start:" + `self.start`
322 s+=" end:" + `self.end`
323 s+=" name:" + `self.name`
324 s+=" sequence:" + `self.seq[0:10]`
325 s+="... preds:" + `self.preds`
326 return s
327 def __repr__(self):
328 return self.__str__()
329
330 def dict2seqlist(dic):
331 """ takes a fasta dict as input and
332 generates a list of sequence objects from it """
333 sequences=list()
334
335 #translate string to ACGT / all non ACGT letters are mapped to A
336 tab=''
337 for i in xrange(256):
338 if chr(i).upper() in 'ACGT':
339 tab+=chr(i).upper()
340 else:
341 tab+='A'
342
343 for seqname in dic:
344 seq=string.translate(dic[seqname], tab)
345 seq=seq.upper()
346 #if end<0:
347 # stop=len(seq)+end
348 #else:
349 # stop=end
350
351 sequences.append(sequence(seqname, seq, (0,len(seq))))
352 #sequences.append(seq)
353
354 return sequences
355
356
357
358 def example():
359 """a test example"""
360
361
362 filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
363 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
364 'dna':'arabi',}
365 parameters = {'use_gc':True,\
366 'use_splicesites':True,
367 'save_splicescores':False,}
368 ss = splicesites()
369 ss.load_model(filenames, parameters)
370
371 ARGS = Param()
372 ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
373 genome_info = cPickle.load(open(ginfo_filename))
374
375 (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
376 = paths_load_data_solexa('training',genome_info,ARGS)
377
378 dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
379 splicescores = ss.predict_splicescores(filenames, parameters, dna)
380
381 for exampleIdx in range(len(Sequences)):
382 print str(exampleIdx)
383 (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
384 print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
385 print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
386
387
388
389 if __name__=='__main__':
390 example()