import sys
import os
import bz2
import gzip
import string
import cPickle

import numpy

from shogun.Classifier import SVM
from shogun.Features import StringCharFeatures, DNA
from shogun.Kernel import WeightedDegreeStringKernel

from qpalma.DataProc import *
from qpalma.TrainingParam import Param
20 """splice site scores (both donor and acceptor)"""
25 self
.donor_splice_use_gc
= None
    def load_model(self, filenames, parameters):
        """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
        try:
            param_file = file(filenames['data_dir'] + filenames['param'])
        except IOError:
            sys.stderr.write("Can't open PARAM FILE (%s) to read\n" % (filenames['data_dir'] + filenames['param']))
            sys.exit(2)

        #<------------------------------------------------------------------------------>
        filename = filenames['data_dir'] + filenames['param']
        self.model = io_qpalma.parse_file(filename)

        self.donor_splice_use_gc = parameters['use_gc']
        #self.plif=plif(self.model)
        if parameters['use_splicesites']:
            self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
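
    # The dictionaries handed to load_model()/predict_splicescores() are plain
    # key/value configs; the keys actually read in this module are
    # filenames['data_dir'], filenames['param'], filenames['dna'] and
    # parameters['use_gc'], parameters['use_splicesites'],
    # parameters['save_splicescores'] (see the example at the bottom of this file).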
    def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
        splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2'
        #if os.path.isfile(splicescores_picklefile):
        if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
            print 'loading splice site predictions from ' + splicescores_picklefile
            splicescores = self.load_splicescores_index(splicescores_picklefile)
            splicescores_precomputed = True
        else:
            splicescores = dict()  # filled below; needed by the has_key checks further down
            splicescores_precomputed = False
        # get donor/acceptor signal predictions for all sequences
        if parameters['use_splicesites']:
            seqs = dict2seqlist(dna_fasta_dict)
            self.signal.predict_acceptor_sites_from_seqdict(seqs)
            self.signal.predict_donor_sites_from_seqdict(seqs)
        for counter in xrange(len(dna_fasta_dict.keys())):
            # DNA and mRNA sequence
            if parameters['use_splicesites'] and not splicescores_precomputed:
                seq = seqs[counter]  # assumed: the sequence object matching this entry

            tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]]  # dna_fasta_dict maps names to DNA strings
            #assert(seq.sequence == tSeq)

            tName = dna_fasta_dict.keys()[counter]
            tSize = len(tSeq)
            if parameters['use_splicesites']:
                if splicescores_precomputed:
                    assert(splicescores.has_key(tName + '_acceptor_splicescores'))
                    assert(splicescores.has_key(tName + '_donor_splicescores'))
                else:
                    acc_support_values = [-numpy.inf]*tSize
                    for i in xrange(len(seq.preds['acceptor'].positions)):
                        acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i]

                    don_support_values = [-numpy.inf]*tSize
                    for i in xrange(len(seq.preds['donor'].positions)):
                        don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i]
            else:
                # no splice site SVM: mark every consensus dimer with score 1
                don_support_values = [-numpy.inf]*tSize
                for cnt in range(tSize-1):
                    if (tSeq[cnt] in 'Gg') \
                       and ((tSeq[cnt+1] in 'Tt') \
                       or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
                        don_support_values[cnt] = 1
                assert(len(don_support_values) == tSize)

                acc_support_values = [-numpy.inf]*tSize
                for cnt in range(tSize-1):
                    if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
                        acc_support_values[cnt+1] = 1
                assert(len(acc_support_values) == tSize)
            if not splicescores.has_key(tName + '_acceptor_splicescores'):
                splicescores[tName + '_acceptor_splicescores'] = acc_support_values

            if not splicescores.has_key(tName + '_donor_splicescores'):
                splicescores[tName + '_donor_splicescores'] = don_support_values
        # write the splice site predictions into a pickle file
        if (not splicescores_precomputed) and parameters['use_splicesites'] and parameters['save_splicescores']:
            print 'saving splice site scores to ' + splicescores_picklefile
            self.save_splicescores(splicescores_picklefile, splicescores)

        return splicescores
    def get_splicescores(self, index, key, start, end):
        if not index.has_key('XXX_palma_filename'):
            # scores kept in memory: the index maps keys directly to score lists
            seq = index[key]
            if start == 0 and end == 0:  # assumed convention: (0, 0) means "all scores"
                return seq
            return seq[start:end]
        (num, fstart, fend) = index[key]
        filename = index['XXX_palma_filename']
        if filename.endswith('.bz2'):
            f = bz2.BZ2File(filename, 'rb')
        elif filename.endswith('.gz'):
            f = gzip.GzipFile(filename, 'rb')
        else:
            f = file(filename, 'rb')

        f.seek(fstart + start*4, 0)  # scores are stored as 4-byte floats
        ss = numpy.fromfile(f, dtype=numpy.float32, count=end-start)
        f.close()

        return ss
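
    # Worked example of the seek arithmetic above (illustrative numbers only):
    # with 4-byte floats, the score at offset `start` within a block whose data
    # begins at byte fstart lies at byte fstart + start*4; e.g. fstart=1200 and
    # start=50 give 1200 + 50*4 = 1400.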
    def compute_donacc(self, splicescores, tName):
        """Return two lists of doubles corresponding to the donor and
        acceptor splice site scores, respectively."""
        acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
        don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)

        return (don_support_values, acc_support_values)
class signal_detectors(object):
    def __init__(self, model, donor_splice_use_gc):
        if donor_splice_use_gc:
            donor_consensus = ['GC', 'GT']
        else:
            donor_consensus = ['GT']
        self.acceptor = svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
                                         numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b,
                                         (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
        self.donor = svm_splice_model(model.donor_splice_order, model.donor_splice_svs,
                                      numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b,
                                      (model.donor_splice_window_left, 0, model.donor_splice_window_right),
                                      donor_consensus)
    def predict_acceptor_sites_from_seqdict(self, seqs):
        self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
        sys.stderr.write("computing svm output for acceptor positions\n")
        self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
    def predict_donor_sites_from_seqdict(self, seqs):
        self.donor.get_positions_from_seqdict(seqs, 'donor')
        sys.stderr.write("computing svm output for donor positions\n")
        self.donor.get_predictions_from_seqdict(seqs, 'donor')
class svm_splice_model(object):
    """The SVM for predicting splice site scores"""

    def __init__(self, order, traindat, alphas, b, (window_left, offset, window_right), consensus):
        f = StringCharFeatures(DNA)
        f.set_string_features(traindat)
        wd_kernel = WeightedDegreeStringKernel(f, f, int(order))
        wd_kernel.io.set_target_to_stderr()

        self.svm = SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
        self.svm.io.set_target_to_stderr()
        self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
        #self.svm.set_linadd_enabled(False)
        #self.svm.set_batch_computation_enabled(true)

        self.window_left = int(window_left)
        self.window_right = int(window_right)

        self.consensus = consensus
        self.wd_kernel = wd_kernel
        self.traindat = f  # keep the training features; wd_kernel.init() below needs them
        self.offset = offset
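
    # Note on the construction above: the trained splice site SVM is rebuilt from
    # its parts, the support-vector strings (traindat), their weights (alphas,
    # indexed 0..len(alphas)-1 via numpy.arange) and the bias b, on top of a
    # WeightedDegreeStringKernel of the given order.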
    def get_predictions_from_seqdict(self, seqdic, site):
        """Build one large test feature object containing the window around
        every candidate location of every sequence in seqdic; this is necessary
        to compute the splice site outputs efficiently (fast, low memory)."""

        #seqlen=self.window_right+self.window_left+2
        num = 0
        for s in seqdic:
            num += len(s.preds[site].positions)
        #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
        testdat = []

        for si in xrange(len(seqdic)):
            s = seqdic[si]
            sequence = s.seq
            positions = s.preds[site].positions

            for j in xrange(len(positions)):
                if len(positions) > 50000 and j % 10000 == 0:
                    sys.stderr.write('sequence %i: progress %1.2f%%\r' % (si, (100.0*j/len(positions))))
                i = positions[j] - self.offset
                start = i - self.window_left
                s_start = ''
                if start < 0:
                    s_start = 'A'*(-start)  # pad the left flank if the window runs past the start
                    start = 0

                s_stop = ''
                stop = i + self.window_right + 2
                if stop > len(sequence):
                    s_stop = 'A'*(stop - len(sequence) + 1)
                    stop = len(sequence) - 1

                s = s_start + sequence[start:stop] + s_stop
                #s=sequence[i-self.window_left:i+self.window_right+2]
                #testdat[:,k]=list(s)
                testdat.append(s)

            if len(positions) > 50000:
                sys.stderr.write('\n')
        t = StringCharFeatures(DNA)
        t.set_string_features(testdat)

        self.wd_kernel.init(self.traindat, t)
        l = self.svm.classify().get_labels()
        sys.stderr.write("\n...done...\n")

        # hand each sequence its slice of the flat SVM output vector l
        k = 0
        for s in seqdic:
            num = len(s.preds[site].positions)
            scores = []
            for j in xrange(num):
                scores.append(l[k])
                k += 1
            s.preds[site].set_scores(scores)
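
    # The method above works in three passes: collect a fixed-width window around
    # every candidate position (padding with 'A' where a window runs past the
    # sequence ends), classify all windows in a single batch SVM call, then hand
    # each sequence its portion of the resulting score vector.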
    def get_positions_from_seqdict(self, seqdic, site):
        for d in seqdic:
            positions = []
            sequence = d.seq
            for cons in self.consensus:
                l = sequence.find(cons)
                while l > -1:
                    #if l<len(sequence)-self.window_right-2 and l>self.window_left:
                    positions.append(l + self.offset)
                    l = sequence.find(cons, l+1)
            d.preds[site].set_positions(positions)
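

def _example_consensus_scan():
    # Hedged sketch (toy data, helper name is hypothetical): the consensus scan in
    # get_positions_from_seqdict() enumerates every occurrence of a consensus dimer
    # with repeated str.find() calls, exactly as done here for 'GT'.
    toy = 'AAGTAAGTAA'
    hits = []
    l = toy.find('GT')
    while l > -1:
        hits.append(l)
        l = toy.find('GT', l + 1)
    print hits  # -> [2, 6]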
class predictions(object):
    def __init__(self, positions=None, scores=None):
        self.positions = positions
        self.scores = scores

    def set_positions(self, positions):
        self.positions = positions

    def get_positions(self):
        return self.positions

    def set_scores(self, scores):
        self.scores = scores

    def get_scores(self):
        return self.scores

    def __str__(self):
        return 'positions: ' + `self.positions` + ' scores: ' + `self.scores`

    def __repr__(self):
        return self.__str__()
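

def _example_predictions():
    # Hedged usage sketch (helper name is hypothetical, values are toy data):
    # `predictions` is a plain container for two parallel lists, the candidate
    # positions and their SVM scores.
    p = predictions()
    p.set_positions([12, 40])
    p.set_scores([0.7, -1.3])
    print p.get_positions(), p.get_scores()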
class sequence(object):
    def __init__(self, name, seq, (start, end)):
        assert(start < end <= len(seq))
        self.name = name
        self.seq = seq
        self.start = start
        self.end = end

        self.preds = dict()
        self.preds['acceptor'] = predictions()
        self.preds['donor'] = predictions()

    def __str__(self):
        s = "start:" + `self.start`
        s += " end:" + `self.end`
        s += " name:" + `self.name`
        s += " sequence:" + `self.seq[0:10]`
        s += "... preds:" + `self.preds`
        return s

    def __repr__(self):
        return self.__str__()
def dict2seqlist(dic):
    """ takes a fasta dict as input and
    generates a list of sequence objects from it """

    # translate string to ACGT / all non-ACGT letters are mapped to A
    tab = ''
    for i in xrange(256):
        if chr(i).upper() in 'ACGT':
            tab += chr(i).upper()
        else:
            tab += 'A'

    sequences = list()
    for seqname in dic:
        seq = string.translate(dic[seqname], tab)
        sequences.append(sequence(seqname, seq, (0, len(seq))))
        #sequences.append(seq)

    return sequences
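

def _example_dict2seqlist():
    # Hedged usage sketch (helper name and toy values are hypothetical): a fasta
    # dict maps sequence names to raw strings; dict2seqlist wraps each entry in a
    # `sequence` object with empty 'acceptor'/'donor' prediction slots.
    toy_fasta = {'toy_contig': 'acgtGTaagtacAGgtacgt'}
    for s in dict2seqlist(toy_fasta):
        print s.name, len(s.seq), sorted(s.preds.keys())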
# predict_splicescores() also expects a 'dna' entry (path of the FASTA file) in filenames
filenames = {'data_dir': '/fml/ag-raetsch/share/projects/palma/param_files/',
             'param': 'arabidopsis_ath1_ss=1_il=1.param.bz2'}

parameters = {'use_gc': True,
              'use_splicesites': True,
              'save_splicescores': False}
ss = splice_site_scores()  # instantiate the scorer (class name assumed above)
ss.load_model(filenames, parameters)
ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
genome_info = cPickle.load(open(ginfo_filename))
(Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
    = paths_load_data_solexa('training', genome_info, ARGS)

dna = dict((str(ix), Sequences[ix]) for ix in xrange(len(Sequences)))
splicescores = ss.predict_splicescores(filenames, parameters, dna)
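
# Sanity check: for every example print the largest score and the smallest
# non -inf score of the donor and acceptor tracks (-inf entries are masked to
# 100 by numpy.where before taking the minimum).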
for exampleIdx in range(len(Sequences)):
    print str(exampleIdx)
    (don_score, acc_score) = ss.compute_donacc(splicescores, str(exampleIdx))
    print str((max(don_score), min(numpy.where(numpy.isinf(don_score), 100, don_score))))
    print str((max(acc_score), min(numpy.where(numpy.isinf(acc_score), 100, acc_score))))
if __name__ == '__main__':