return currentWeight
-def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
+def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp, dec=False):
####################################################################################
# 1. Donor: don_supp holds the SVM scores, SpliceAlign holds the 1s that mark donor sites
####################################################################################
# Pick out the positions at which there is a donor site
- DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
- assert not ( -inf in DonorScores )
+    try:
+        if dec:
+            # decoded paths: SpliceAlign marks the donor at the scored position
+            DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos] == 1]
+        else:
+            # true alignments: the donor marker sits one position before the score
+            DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos-1] == 1]
+        assert not ( -inf in DonorScores )
+    except Exception:
+        # a -inf score means a marked donor site lacks an SVM prediction
+        pdb.set_trace()
#print 'donor'
weightDon = calculateWeights(d,DonorScores)
####################################################################################
# Walk through the vector of acceptor positions and accumulate the weight vectors:
- AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
- assert not ( -inf in AcceptorScores )
+    try:
+        if dec:
+            AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
+        else:
+            AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos] == 2]
+        assert not ( -inf in AcceptorScores )
+    except Exception:
+        pdb.set_trace()
#print 'acceptor'
weightAcc = calculateWeights(a,AcceptorScores)
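+    # Illustrative sketch (toy values, not part of the patch) of how the dec
+    # flag shifts the donor indexing between decoded and true alignments:
+    #   >>> SpliceAlign = [0, 1, 0, 2, 0]
+    #   >>> don_supp = [0.0, 1.5, 2.5, 3.5, 4.5]
+    #   >>> [e for p, e in enumerate(don_supp) if p > 0 and SpliceAlign[p] == 1]
+    #   [1.5]    # dec=True: score taken at the marked position itself
+    #   >>> [e for p, e in enumerate(don_supp) if p > 0 and SpliceAlign[p-1] == 1]
+    #   [2.5]    # dec=False: score taken one position after the marker
+    # The acceptor case (marker value 2) is mirrored.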
--- /dev/null
+
+"""
+Functions for parsing palma parameter files,
+to extract splice site predictors for qpalma.
+"""
+
+import bz2
+import sys
+import numpy
+from numpy import mat,array,chararray,inf
+
+class palma_model():
+ """A container for palma parameters"""
+
+ #acceptor
+ acceptor_bins=None
+ acceptor_limits=None
+ acceptor_penalties=None
+
+ acceptor_splice_b=None
+ acceptor_splice_order=None
+ acceptor_splice_window_left=None
+ acceptor_splice_window_right=None
+ acceptor_splice_alphas=None
+ acceptor_splice_svs=None
+
+ #donor
+ donor_bins=None
+ donor_limits=None
+ donor_penalties=None
+
+ donor_splice_b=None
+ donor_splice_order=None
+ #donor_splice_use_gc=None
+ donor_splice_window_left=None
+ donor_splice_window_right=None
+ donor_splice_alphas=None
+ donor_splice_svs=None
+
+ # intron length
+ intron_len_bins=None
+ intron_len_limits=None
+ intron_len_penalties=None
+ intron_len_min=None
+ intron_len_max=None
+ intron_len_transform=None
+
+ gene_len_max = None
+
+ # substitution matrix
+ substitution_matrix=None
+
+
+
+def parse_file(filename):
+    if filename.endswith('.param.bz2'):
+        fileptr = bz2.BZ2File(filename)
+    else:
+        sys.stderr.write('Expected palma parameter file with ending .param.bz2\n')
+        return None
+ m=palma_model()
+
+ sys.stdout.write("loading model file"); sys.stdout.flush()
+
+ l=fileptr.readline();
+
+ if l != '%palma definition file version: 1.0\n':
+ sys.stderr.write("\nfile not a palma definition file\n")
+ print l
+ return None
+
+ while l:
+        if not ( l.startswith('%') or l.startswith('\n') ): # skip '%' comments and blank lines
+
+ #acceptor
+ if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
+ if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, fileptr, 'acceptor_limits')
+ if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, fileptr, 'acceptor_penalties') #DEAD
+
+ if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
+ if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
+ if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
+ if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
+ if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, fileptr, 'acceptor_splice_alphas')
+ if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, fileptr, 'acceptor_splice_svs')
+
+ #donor
+ if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
+ if m.donor_limits is None: m.donor_limits=parse_vector(l, fileptr, 'donor_limits')
+ if m.donor_penalties is None: m.donor_penalties=parse_vector(l, fileptr, 'donor_penalties') #DEAD
+
+ if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
+ if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
+ #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
+ if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
+ if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
+ if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, fileptr, 'donor_splice_alphas')
+ if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, fileptr, 'donor_splice_svs')
+
+
+ # intron length
+ if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
+ if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, fileptr, 'intron_len_limits')
+ if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, fileptr, 'intron_len_penalties')
+ if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
+ if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
+ if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
+
+ # gene length
+ if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
+
+ if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, fileptr, 'substitution_matrix')
+
+ l=fileptr.readline()
+
+ sys.stderr.write('done\n')
+ return m
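+
+# Illustrative sketch of the expected file layout (toy values; real files are
+# bz2-compressed and start with the version header checked above):
+#   %palma definition file version: 1.0
+#   acceptor_splice_b=-0.5
+#   acceptor_splice_order=3
+#   intron_len_limits=[0,50,100]
+# Each recognised key is parsed at most once (the `is None` guards above);
+# unrecognised lines are silently skipped.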
+
+def parse_value(line, name):
+    if line.startswith(name):
+        sys.stdout.write('.'); sys.stdout.flush()
+        val = line[line.find('=')+1:-1]
+        if val[0] == "'":
+            # quoted entries are returned as strings
+            return val[1:-1]
+        else:
+            return float(val)
+    else:
+        return None
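+
+# Illustrative sketch (toy lines, not from a real parameter file): the line
+# "acceptor_splice_b=-0.5\n" yields the float -0.5, while a quoted entry such
+# as "intron_len_transform='log'\n" yields the string 'log'; a line whose name
+# does not match yields None, which parse_file uses to try the next key.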
+
+def parse_vector(line, fileptr, name):
+    # local name chosen to avoid shadowing numpy's mat imported above
+    m = parse_matrix(line, fileptr, name)
+    if m is None:
+        return None
+    return numpy.array(m).flatten()
+
+def parse_matrix(line, fileptr, name):
+    if line.startswith(name):
+        sys.stdout.write('.'); sys.stdout.flush()
+        # the matrix may span several lines; keep reading until the closing ']'
+        while line.find(']') < 0:
+            l = fileptr.readline()
+            if not l:    # EOF: avoid looping forever on a truncated file
+                break
+            line += l
+        if line.find(']') < 0:
+            sys.stderr.write("matrix `" + name + "' ended without ']'\n")
+            return None
+        return mat(line[line.find('['):line.find(']')+1])
+    else:
+        return None
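+
+# Illustrative sketch (toy entry, format inferred from the parser): for a line
+# such as "intron_len_limits=[0,50,100]\n" the slice from '[' to ']' inclusive
+# is handed to numpy's mat(); parse_vector then flattens the result into a
+# one-dimensional numpy array.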
+
+def parse_string(line, fileptr, name):
+    if line.startswith(name):
+        sys.stdout.write('.'); sys.stdout.flush()
+        # one support-vector string per line, terminated by a line containing ']'
+        lines = []
+        l = fileptr.readline()
+        while l and l.find(']') < 0:
+            lines.append(l[:-1])
+            l = fileptr.readline()
+        if not l:    # EOF before the closing bracket
+            sys.stderr.write("string ended without ']'\n")
+            return None
+        return lines
+    else:
+        return None
+
+if __name__ == '__main__':
+ #import hotshot, hotshot.stats
+
+ def load():
+ f='/fml/ag-raetsch/share/projects/palma/param_files/arabidopsis_ath1_ss=1_il=1.param.bz2'
+ m=parse_file(f);
+
+ print m.acceptor_bins is None
+ print m.acceptor_limits is None
+ print m.acceptor_penalties is None
+
+ print m.acceptor_splice_b is None
+ print m.acceptor_splice_order is None
+ print m.acceptor_splice_window_left is None
+ print m.acceptor_splice_window_right is None
+ print m.acceptor_splice_alphas is None
+ print m.acceptor_splice_svs is None
+
+ print m.donor_bins is None
+ print m.donor_limits is None
+ print m.donor_penalties is None
+
+ print m.donor_splice_b is None
+ print m.donor_splice_order is None
+ #print m.donor_splice_use_gc is None
+ print m.donor_splice_window_left is None
+ print m.donor_splice_window_right is None
+ print m.donor_splice_alphas is None
+ print m.donor_splice_svs is None
+ print m.intron_len_bins is None
+ print m.intron_len_limits is None
+ print m.intron_len_penalties is None
+ print m.intron_len_min is None
+ print m.intron_len_max is None
+ print m.intron_len_transform is None
+
+ print m.substitution_matrix is None
+
+ print 'intron_len_transform'
+ print m.intron_len_transform
+ print 'substitution_matrix'
+ print m.substitution_matrix
+
+ load()
+
+ #prof = hotshot.Profile("model.prof")
+ #benchtime = prof.runcall(load)
+ #prof.close()
+ #stats = hotshot.stats.load("model.prof")
+ #stats.strip_dirs()
+ #stats.sort_stats('time', 'calls')
+ #stats.print_stats(20)
--- /dev/null
+
+import sys
+import os
+import bz2
+import gzip     # needed by get_splicescores for .gz score files
+import array    # needed by get_splicescores to read packed float32 scores
+import numpy
+import string
+import io_qpalma
+
+from shogun.Classifier import SVM
+from shogun.Features import StringCharFeatures,DNA
+from shogun.Kernel import WeightedDegreeStringKernel
+
+import cPickle
+
+from qpalma.DataProc import *
+from qpalma.TrainingParam import Param
+
+
+class splicesites():
+ """splice site scores (both donor and acceptor)"""
+
+ def __init__(self):
+ self.model = None
+ self.signal = None
+ self.donor_splice_use_gc = None
+
+ def load_model(self, filenames, parameters):
+ """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
+        # check that the parameter file is readable before handing it to the parser
+        try:
+            file(filenames['data_dir'] + filenames['param']).close()
+        except IOError:
+            sys.stderr.write("Can't open PARAM FILE (%s) to read\n" % (filenames['data_dir'] + filenames['param']))
+            sys.exit(2)
+
+        filename = filenames['data_dir'] + filenames['param']
+        self.model = io_qpalma.parse_file(filename)
+
+        self.donor_splice_use_gc = parameters['use_gc']
+        if parameters['use_splicesites']:
+            self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
+
+
+ def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
+ splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2' ;
+ #if os.path.isfile(splicescores_picklefile):
+ if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
+ print 'loading splice site predictions from ' + splicescores_picklefile
+ splicescores = self.load_splicescores_index(splicescores_picklefile)
+ splicescores_precomputed = True
+ else:
+ splicescores = {} ;
+ splicescores_precomputed = False
+
+ #get donor/acceptor signal predictions for all sequences
+ if parameters['use_splicesites']:
+ seqs= dict2seqlist(dna_fasta_dict)
+ self.signal.predict_acceptor_sites_from_seqdict(seqs)
+ self.signal.predict_donor_sites_from_seqdict(seqs)
+
+
+ for counter in xrange(len(dna_fasta_dict.keys())):
+ # DNA and mRNA sequence
+ if parameters['use_splicesites'] and not splicescores_precomputed:
+ seq = seqs[counter]
+ tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]] # dna_sequences is list of strings
+ #assert(seq.sequence == tSeq)
+
+ tName = dna_fasta_dict.keys()[counter]
+ tSize = len(tSeq)
+
+ # Splice site scores
+ if parameters['use_splicesites']:
+ if splicescores_precomputed:
+ assert(splicescores.has_key(tName + '_acceptor_splicescores'))
+ assert(splicescores.has_key(tName + '_donor_splicescores'))
+ else:
+ acc_support_values = [-numpy.inf]*tSize
+ for i in xrange(len(seq.preds['acceptor'].positions)):
+ acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i] ;
+
+ don_support_values = [-numpy.inf]*tSize
+ for i in xrange(len(seq.preds['donor'].positions)):
+ don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i] ;
+ else:
+ don_support_values = [-numpy.inf]*tSize
+ for cnt in range(tSize-1):
+ if (tSeq[cnt] in 'Gg') \
+ and ((tSeq[cnt+1] in 'Tt') \
+ or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
+ don_support_values[cnt] = 1
+ assert(len(don_support_values)==tSize)
+
+ acc_support_values = [-numpy.inf]*tSize
+ for cnt in range(tSize-1):
+ if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
+ acc_support_values[cnt+1] = 1
+ assert(len(acc_support_values)==tSize)
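+                # Illustrative (toy sequence): for tSeq = 'AGGTAA' the donor loop
+                # marks position 2 (the 'G' of 'GT') and the acceptor loop marks
+                # position 1 (the 'G' of 'AG'); all other entries stay at -inf.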
+
+ if not splicescores.has_key(tName + '_acceptor_splicescores'):
+ splicescores[tName + '_acceptor_splicescores'] = acc_support_values
+
+ if not splicescores.has_key(tName + '_donor_splicescores'):
+ splicescores[tName + '_donor_splicescores'] = don_support_values
+
+
+ # write the splice site predictions into a pickle file
+ if (not splicescores_precomputed) and (parameters['use_splicesites']) and parameters['save_splicescores']:
+ print 'saving splice site scores to ' + splicescores_picklefile
+ self.save_splicescores(splicescores_picklefile, splicescores)
+
+
+ return splicescores
+
+
+
+ def get_splicescores(self, index, key, start, end):
+ if not index.has_key('XXX_palma_filename'):
+ seq = index[key] ;
+ if end==start==0:
+ return seq
+ else:
+ return seq[start:end]
+ else:
+ (num, fstart, fend) = index[key] ;
+ filename = index['XXX_palma_filename']
+ if filename.endswith('.bz2'):
+ f = bz2.BZ2File(filename, 'rb');
+ else:
+ if filename.endswith('.gz'):
+ f = gzip.GzipFile(filename, 'rb');
+ else:
+ f = file(filename, 'rb')
+
+            f.seek(fstart + start*4, 0)    # scores are stored as packed 4-byte floats
+            ss = array.array('f')          # note: the array module, not numpy.array
+            if end == start == 0:
+                ss.fromfile(f, num)
+            else:
+                ss.fromfile(f, end-start)
+
+            return ss.tolist()
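+
+    # Illustrative: when the scores live in a flat file, index[key] is a
+    # (num, fstart, fend) triple and index['XXX_palma_filename'] names the
+    # score file; get_splicescores(index, name, 0, 0) then reads all num
+    # float32 values starting at byte offset fstart.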
+
+ def compute_donacc(self,splicescores,tName):
+ """Return two lists of doubles corresponding
+ to the donor and acceptor splice site scores respectively
+ """
+ acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
+ don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)
+
+ return (don_support_values,acc_support_values)
+
+
+
+
+
+class signal_detectors():
+ def __init__(self, model, donor_splice_use_gc):
+ if donor_splice_use_gc:
+ donor_consensus=['GC','GT']
+ else:
+ donor_consensus=['GT']
+
+ self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
+ numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b,
+ (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
+ self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs,
+ numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b,
+ (model.donor_splice_window_left, 0, model.donor_splice_window_right),
+ donor_consensus)
+
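+    # Position convention used below: acceptor candidates are reported with an
+    # offset of 2 (two bases past the start of the 'AG' consensus), donor
+    # candidates with an offset of 0 (at the 'G' of 'GT', or of 'GC' when
+    # donor_splice_use_gc is set).
+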
+ def predict_acceptor_sites_from_seqdict(self, seqs):
+ self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
+ sys.stderr.write("computing svm output for acceptor positions\n")
+ self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
+
+ def predict_donor_sites_from_seqdict(self, seqs):
+ self.donor.get_positions_from_seqdict(seqs, 'donor')
+ sys.stderr.write("computing svm output for donor positions\n")
+ self.donor.get_predictions_from_seqdict(seqs, 'donor')
+
+
+
+class svm_splice_model():
+ """The SVM for predicting splice site scores"""
+
+ def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
+
+ f=StringCharFeatures(DNA)
+ f.set_string_features(traindat)
+ wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
+ wd_kernel.io.set_target_to_stderr()
+
+ self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
+ self.svm.io.set_target_to_stderr()
+ self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
+ #self.svm.set_linadd_enabled(False)
+ #self.svm.set_batch_computation_enabled(true)
+
+ self.window_left=int(window_left)
+ self.window_right=int(window_right)
+
+ self.consensus=consensus
+ self.wd_kernel=wd_kernel
+ self.traindat=f
+ self.offset = offset ;
+
+
+    def get_predictions_from_seqdict(self, seqdic, site):
+        """Build one large test-features object containing the window around
+        every candidate position of every sequence in the seqdict; batching
+        all windows is necessary to compute the SVM outputs efficiently
+        (fast and with low memory overhead)."""
+
+ #seqlen=self.window_right+self.window_left+2
+
+ num=0
+ for s in seqdic:
+ num+= len(s.preds[site].positions)
+
+ #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
+ testdat = num*[[]]
+
+ k=0
+ si = 0 ;
+ for s in seqdic:
+ sequence=s.seq
+ positions=s.preds[site].positions
+ si += 1
+ for j in xrange(len(positions)):
+ if len(positions)>50000 and j%10000==0:
+ sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
+ i=positions[j] - self.offset
+ start = i-self.window_left
+ if start<0:
+ s_start='A'*(-start)
+ start = 0;
+ else:
+ s_start = ''
+ stop = i+self.window_right+2
+ if stop>len(sequence):
+ s_stop = 'A'*(stop-len(sequence) +1)
+ stop=len(sequence) - 1 ;
+ else:
+ s_stop = '' ;
+ s= s_start + sequence[start:stop] + s_stop
+ #print len(s)
+ #s=sequence[i-self.window_left:i+self.window_right+2]
+ #testdat[:,k]=list(s)
+ testdat[k]=s
+ k+=1
+
+ if len(positions)>50000:
+ sys.stderr.write('\n')
+
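+        # Illustrative: with window_left=3, window_right=3 and a candidate at
+        # i=1, start would be -2, so the window is left-padded with 'AA' to
+        # keep the fixed width window_left + window_right + 2 that the string
+        # kernel expects.
+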
+ t=StringCharFeatures(DNA)
+ t.set_string_features(testdat)
+
+ self.wd_kernel.init(self.traindat, t)
+ l=self.svm.classify().get_labels()
+ sys.stderr.write("\n...done...\n")
+
+ k=0
+ for s in seqdic:
+ num=len(s.preds[site].positions)
+ scores= num * [0]
+ for j in xrange(num):
+ scores[j]=l[k]
+ k+=1
+ s.preds[site].set_scores(scores)
+
+
+
+ def get_positions_from_seqdict(self, seqdic, site):
+ for d in seqdic:
+ positions=list()
+ sequence=d.seq
+ for cons in self.consensus:
+ l=sequence.find(cons)
+ while l>-1:
+ #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+ positions.append(l+self.offset)
+ l=sequence.find(cons, l+1)
+ positions.sort()
+ d.preds[site].set_positions(positions)
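+        # Illustrative sketch (toy sequence): 'AAGTAA'.find('GT') == 2, so with
+        # the donor offset of 0 the candidate list becomes [2]; find(cons, l+1)
+        # keeps scanning until it returns -1.
+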
+
+class predictions(object):
+ def __init__(self, positions=None, scores=None):
+ self.positions=positions
+ self.scores=scores
+
+ def set_positions(self, positions):
+ self.positions=positions;
+ def get_positions(self):
+ return self.positions
+
+ def set_scores(self, scores):
+ self.scores=scores
+ def get_scores(self):
+ return self.scores
+
+    def __str__(self):
+        return 'positions: ' + `self.positions` + ' scores: ' + `self.scores`
+ def __repr__(self):
+ return self.__str__()
+
+class sequence(object):
+ def __init__(self, name, seq, (start,end)):
+ assert(start<end<=len(seq))
+ self.start=start
+ self.end=end
+ self.name=name
+ self.seq=seq
+ self.preds=dict()
+ self.preds['acceptor']=predictions()
+ self.preds['donor']=predictions()
+
+ def __str__(self):
+ s="start:" + `self.start`
+ s+=" end:" + `self.end`
+ s+=" name:" + `self.name`
+ s+=" sequence:" + `self.seq[0:10]`
+ s+="... preds:" + `self.preds`
+ return s
+ def __repr__(self):
+ return self.__str__()
+
+def dict2seqlist(dic):
+ """ takes a fasta dict as input and
+ generates a list of sequence objects from it """
+ sequences=list()
+
+ #translate string to ACGT / all non ACGT letters are mapped to A
+ tab=''
+ for i in xrange(256):
+ if chr(i).upper() in 'ACGT':
+ tab+=chr(i).upper()
+ else:
+ tab+='A'
+
+ for seqname in dic:
+ seq=string.translate(dic[seqname], tab)
+ seq=seq.upper()
+ #if end<0:
+ # stop=len(seq)+end
+ #else:
+ # stop=end
+
+ sequences.append(sequence(seqname, seq, (0,len(seq))))
+ #sequences.append(seq)
+
+ return sequences
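+
+# Illustrative sketch: dict2seqlist({'chr1': 'acgtnn'}) yields one sequence
+# object with seq == 'ACGTAA' (non-ACGT letters map to 'A') and empty donor
+# and acceptor prediction slots.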
+
+
+def getDonAccScores(Sequences):
+    """Compute per-position donor and acceptor splice score lists for each
+    sequence in Sequences, using the palma splice site SVMs."""
+
+ filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
+ 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
+ 'dna':'arabi',}
+ parameters = {'use_gc':True,\
+ 'use_splicesites':True,
+ 'save_splicescores':False,}
+ ss = splicesites()
+ ss.load_model(filenames, parameters)
+
+ dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
+ splicescores = ss.predict_splicescores(filenames, parameters, dna)
+
+ Donors = []
+ Acceptors = []
+
+ for exampleIdx in range(len(Sequences)):
+ #print str(exampleIdx)
+ (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
+ Donors.append(don_score)
+ Acceptors.append(acc_score)
+ print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+ print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+
+ #pdb.set_trace()
+
+ return Donors, Acceptors
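+
+# Illustrative usage (hypothetical toy input; requires the shogun toolbox and
+# the hard-coded palma parameter file above to be present):
+#   Donors, Acceptors = getDonAccScores(['ACGT' * 100])
+#   # Donors[0] / Acceptors[0] are per-position score lists of length 400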
+
+def example():
+ """a test example"""
+
+
+ filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
+ 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
+ 'dna':'arabi',}
+ parameters = {'use_gc':True,\
+ 'use_splicesites':True,
+ 'save_splicescores':False,}
+ ss = splicesites()
+ ss.load_model(filenames, parameters)
+
+ ARGS = Param()
+ ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
+ genome_info = cPickle.load(open(ginfo_filename))
+
+ (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
+ = paths_load_data_solexa('training',genome_info,ARGS)
+
+ dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
+ splicescores = ss.predict_splicescores(filenames, parameters, dna)
+
+ for exampleIdx in range(len(Sequences)):
+ print str(exampleIdx)
+ (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
+ print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+ print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+
+
+
+if __name__=='__main__':
+ example()
elif Conf.mode == 'using_quality_scores':
Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+        end = 30    # use only the first 30 examples
+ Sequences = Sequences[:end]
+ Acceptors = Acceptors[:end]
+ Donors = Donors[:end]
+ Exons = Exons[:end]
+ Ests = Ests[:end]
+ Qualities = Qualities[:end]
+ SplitPos = SplitPos[:end]
+
#Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
#Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
#pdb.set_trace()
# Initialize parameter vector / param = numpy.matlib.rand(126,1)
#param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
- param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_82.pickle'
+ param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_20.pickle'
param = load_param(param_filename)
# Set the parameters such as limits penalties for the Plifs
#pdb.set_trace()
up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ print up_off,down_off
if up_off > -1:
total_up_off.append(up_off)
if oldElem == 0 and elem != 0:
newExons.append(pos-1)
- #pdb.set_trace()
+ pdb.set_trace()
up_off = -1
down_off = -1
from qpalma.Plif import Plf
from qpalma.Helpers import *
+from qpalma.tools.splicesites import getDonAccScores
+
def getQualityFeatureCounts(qualityPlifs):
weightQuality = qualityPlifs[0].penalties
for currentPlif in qualityPlifs[1:]:
elif Configuration.mode == 'using_quality_scores':
Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+        end = 30    # use only the first 30 examples
+ Sequences = Sequences[:end]
+ Exons = Exons[:end]
+ Ests = Ests[:end]
+ Qualities = Qualities[:end]
+ SplitPos = SplitPos[:end]
+
+ Donors, Acceptors = getDonAccScores(Sequences)
+
#Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
#Sequences_, Acceptors_, Donors_, Exons_, Ests_, Qualities_ = generateData(100)
# Compute the parameters of the true alignment (but with untrained d,a,h ...)
trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+ #pdb.set_trace()
# Calculate the weights
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
path_loss = [0]*(num_path[exampleIdx])
for pathNr in range(num_path[exampleIdx]):
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+            weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h,
+                newSpliceAlign[pathNr,:].flatten().tolist()[0],
+                don_supp, acc_supp, dec=True)
decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
for qidx in range(Configuration.totalQualSuppPoints):
+++ /dev/null
-
-"""
-Functions for parsing palma parameter files,
-to extract splice site predictors for qpalma.
-"""
-
-import bz2
-import sys
-import numpy
-from numpy import mat,array,chararray,inf
-
-class palma_model():
- """A container for palma parameters"""
-
- #acceptor
- acceptor_bins=None
- acceptor_limits=None
- acceptor_penalties=None
-
- acceptor_splice_b=None
- acceptor_splice_order=None
- acceptor_splice_window_left=None
- acceptor_splice_window_right=None
- acceptor_splice_alphas=None
- acceptor_splice_svs=None
-
- #donor
- donor_bins=None
- donor_limits=None
- donor_penalties=None
-
- donor_splice_b=None
- donor_splice_order=None
- #donor_splice_use_gc=None
- donor_splice_window_left=None
- donor_splice_window_right=None
- donor_splice_alphas=None
- donor_splice_svs=None
-
- # intron length
- intron_len_bins=None
- intron_len_limits=None
- intron_len_penalties=None
- intron_len_min=None
- intron_len_max=None
- intron_len_transform=None
-
- gene_len_max = None
-
- # substitution matrix
- substitution_matrix=None
-
-
-
-def parse_file(filename):
- if filename.endswith('.param.bz2'):
- fileptr=bz2.BZ2File(filename);
- else:
- sys.stderr.write('Expected palma parameter file with ending param.bz2\n')
- m=palma_model()
-
- sys.stdout.write("loading model file"); sys.stdout.flush()
-
- l=fileptr.readline();
-
- if l != '%palma definition file version: 1.0\n':
- sys.stderr.write("\nfile not a palma definition file\n")
- print l
- return None
-
- while l:
- if not ( l.startswith('%') or l.startswith('\n') ): # comment
-
- #acceptor
- if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
- if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, fileptr, 'acceptor_limits')
- if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, fileptr, 'acceptor_penalties') #DEAD
-
- if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
- if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
- if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
- if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
- if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, fileptr, 'acceptor_splice_alphas')
- if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, fileptr, 'acceptor_splice_svs')
-
- #donor
- if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
- if m.donor_limits is None: m.donor_limits=parse_vector(l, fileptr, 'donor_limits')
- if m.donor_penalties is None: m.donor_penalties=parse_vector(l, fileptr, 'donor_penalties') #DEAD
-
- if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
- if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
- #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
- if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
- if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
- if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, fileptr, 'donor_splice_alphas')
- if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, fileptr, 'donor_splice_svs')
-
-
- # intron length
- if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
- if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, fileptr, 'intron_len_limits')
- if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, fileptr, 'intron_len_penalties')
- if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
- if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
- if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
-
- # gene length
- if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
-
- if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, fileptr, 'substitution_matrix')
-
- l=fileptr.readline()
-
- sys.stderr.write('done\n')
- return m
-
-def parse_value(line, name):
- if (line.startswith(name)):
- #print 'found ' + name
- sys.stdout.write('.'); sys.stdout.flush()
- str = line[line.find('=')+1:-1] ;
- if str[0] == "'":
- return str[1:-1]
- else:
- #print float(str)
- return float(str)
- else:
- return None
-
-def parse_vector(line, fileptr, name):
- mat = parse_matrix(line, fileptr, name)
- if mat is None:
- return mat
- else:
- mat = numpy.array(mat).flatten()
- return mat
-
-def parse_matrix(line, fileptr, name):
- if (line.startswith(name)):
- sys.stdout.write('.'); sys.stdout.flush()
- if line.find(']') < 0:
- l=''
- while l is not None and l.find(']') < 0:
- line+=l
- l=fileptr.readline()
- if l is not None and l.find(']') >= 0:
- line+=l
-
- if line.find(']') < 0:
- sys.stderr.write("matrix `" + name + "' ended without ']'\n")
- return None
- else:
- return mat(line[line.find('['):line.find(']')+1])
- else:
- return None
-
-def parse_string(line, fileptr, name):
- if (line.startswith(name)):
- sys.stdout.write('.'); sys.stdout.flush()
- l=''
- lines=[]
- while l is not None and l.find(']') < 0:
- if l:
- lines+=[list(l[:-1])]
- l=fileptr.readline()
-
- if l.find(']') < 0:
- sys.stderr.write("string ended without ']'\n")
- return None
- else:
- #seqlen=len(lines[0])
- num=len(lines)
- #trdat = chararray((seqlen,num),1,order='FORTRAN')
- #for i in xrange(num):
- # trdat[:,i]=lines[i]
- trdat = num*[[]]
- for idx,example in enumerate(lines):
- trdat[idx] = ''.join(example)
- return trdat
- else:
- return None
-
-if __name__ == '__main__':
- #import hotshot, hotshot.stats
-
- def load():
- f='/fml/ag-raetsch/share/projects/palma/param_files/arabidopsis_ath1_ss=1_il=1.param.bz2'
- m=parse_file(f);
-
- print m.acceptor_bins is None
- print m.acceptor_limits is None
- print m.acceptor_penalties is None
-
- print m.acceptor_splice_b is None
- print m.acceptor_splice_order is None
- print m.acceptor_splice_window_left is None
- print m.acceptor_splice_window_right is None
- print m.acceptor_splice_alphas is None
- print m.acceptor_splice_svs is None
-
- print m.donor_bins is None
- print m.donor_limits is None
- print m.donor_penalties is None
-
- print m.donor_splice_b is None
- print m.donor_splice_order is None
- #print m.donor_splice_use_gc is None
- print m.donor_splice_window_left is None
- print m.donor_splice_window_right is None
- print m.donor_splice_alphas is None
- print m.donor_splice_svs is None
- print m.intron_len_bins is None
- print m.intron_len_limits is None
- print m.intron_len_penalties is None
- print m.intron_len_min is None
- print m.intron_len_max is None
- print m.intron_len_transform is None
-
- print m.substitution_matrix is None
-
- print 'intron_len_transform'
- print m.intron_len_transform
- print 'substitution_matrix'
- print m.substitution_matrix
-
- load()
-
- #prof = hotshot.Profile("model.prof")
- #benchtime = prof.runcall(load)
- #prof.close()
- #stats = hotshot.stats.load("model.prof")
- #stats.strip_dirs()
- #stats.sort_stats('time', 'calls')
- #stats.print_stats(20)
+++ /dev/null
-
-import sys
-import os
-import bz2
-import numpy
-import string
-import io_qpalma
-
-from shogun.Classifier import SVM
-from shogun.Features import StringCharFeatures,DNA
-from shogun.Kernel import WeightedDegreeStringKernel
-
-import cPickle
-
-from qpalma.DataProc import *
-from qpalma.TrainingParam import Param
-
-
-class splicesites():
- """splice site scores (both donor and acceptor)"""
-
- def __init__(self):
- self.model = None
- self.signal = None
- self.donor_splice_use_gc = None
-
- def load_model(self, filenames, parameters):
- """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
- try:
- param_file = file(filenames['data_dir']+filenames['param'])
- except IOError:
- sys.stderr.write("Can't open PARAM FILE (%s) to read\n" %(filenames['data_dir']+filenames['param']))
- sys.exit(2)
-
- #<------------------------------------------------------------------------------>
- filename = filenames['data_dir']+filenames['param']
- self.model = io_qpalma.parse_file(filename)
-
- self.donor_splice_use_gc = parameters['use_gc'] ;
- #self.plif=plif(self.model)
- if parameters['use_splicesites']:
- self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
-
-
- def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
- splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2' ;
- #if os.path.isfile(splicescores_picklefile):
- if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
- print 'loading splice site predictions from ' + splicescores_picklefile
- splicescores = self.load_splicescores_index(splicescores_picklefile)
- splicescores_precomputed = True
- else:
- splicescores = {} ;
- splicescores_precomputed = False
-
- #get donor/acceptor signal predictions for all sequences
- if parameters['use_splicesites']:
- seqs= dict2seqlist(dna_fasta_dict)
- self.signal.predict_acceptor_sites_from_seqdict(seqs)
- self.signal.predict_donor_sites_from_seqdict(seqs)
-
-
- for counter in xrange(len(dna_fasta_dict.keys())):
- # DNA and mRNA sequence
- if parameters['use_splicesites'] and not splicescores_precomputed:
- seq = seqs[counter]
- tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]] # dna_sequences is list of strings
- #assert(seq.sequence == tSeq)
-
- tName = dna_fasta_dict.keys()[counter]
- tSize = len(tSeq)
-
- # Splice site scores
- if parameters['use_splicesites']:
- if splicescores_precomputed:
- assert(splicescores.has_key(tName + '_acceptor_splicescores'))
- assert(splicescores.has_key(tName + '_donor_splicescores'))
- else:
- acc_support_values = [-numpy.inf]*tSize
- for i in xrange(len(seq.preds['acceptor'].positions)):
- acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i] ;
-
- don_support_values = [-numpy.inf]*tSize
- for i in xrange(len(seq.preds['donor'].positions)):
- don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i] ;
- else:
- don_support_values = [-numpy.inf]*tSize
- for cnt in range(tSize-1):
- if (tSeq[cnt] in 'Gg') \
- and ((tSeq[cnt+1] in 'Tt') \
- or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
- don_support_values[cnt] = 1
- assert(len(don_support_values)==tSize)
-
- acc_support_values = [-numpy.inf]*tSize
- for cnt in range(tSize-1):
- if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
- acc_support_values[cnt+1] = 1
- assert(len(acc_support_values)==tSize)
-
- if not splicescores.has_key(tName + '_acceptor_splicescores'):
- splicescores[tName + '_acceptor_splicescores'] = acc_support_values
-
- if not splicescores.has_key(tName + '_donor_splicescores'):
- splicescores[tName + '_donor_splicescores'] = don_support_values
-
-
- # write the splice site predictions into a pickle file
- if (not splicescores_precomputed) and (parameters['use_splicesites']) and parameters['save_splicescores']:
- print 'saving splice site scores to ' + splicescores_picklefile
- self.save_splicescores(splicescores_picklefile, splicescores)
-
-
- return splicescores
-
-
-
- def get_splicescores(self, index, key, start, end):
- if not index.has_key('XXX_palma_filename'):
- seq = index[key] ;
- if end==start==0:
- return seq
- else:
- return seq[start:end]
- else:
- (num, fstart, fend) = index[key] ;
- filename = index['XXX_palma_filename']
- if filename.endswith('.bz2'):
- f = bz2.BZ2File(filename, 'rb');
- else:
- if filename.endswith('.gz'):
- f = gzip.GzipFile(filename, 'rb');
- else:
- f = file(filename, 'rb')
-
- f.seek(fstart + start*4, 0) # sizeof float = 4!?
- ss = numpy.array('f') ;
- if end==start==0:
- ss.read(f, num)
- else:
- ss.read(f, end-start)
-
- return ss.tolist()
-
- def compute_donacc(self,splicescores,tName):
- """Return two lists of doubles corresponding
- to the donor and acceptor splice site scores respectively
- """
- acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
- don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)
-
- return (don_support_values,acc_support_values)
-
-
-
-
-
-class signal_detectors():
- def __init__(self, model, donor_splice_use_gc):
- if donor_splice_use_gc:
- donor_consensus=['GC','GT']
- else:
- donor_consensus=['GT']
-
- self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
- numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b,
- (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
- self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs,
- numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b,
- (model.donor_splice_window_left, 0, model.donor_splice_window_right),
- donor_consensus)
-
- def predict_acceptor_sites_from_seqdict(self, seqs):
- self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
- sys.stderr.write("computing svm output for acceptor positions\n")
- self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
-
- def predict_donor_sites_from_seqdict(self, seqs):
- self.donor.get_positions_from_seqdict(seqs, 'donor')
- sys.stderr.write("computing svm output for donor positions\n")
- self.donor.get_predictions_from_seqdict(seqs, 'donor')
-
-
-
-class svm_splice_model():
- """The SVM for predicting splice site scores"""
-
- def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
-
- f=StringCharFeatures(DNA)
- f.set_string_features(traindat)
- wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
- wd_kernel.io.set_target_to_stderr()
-
- self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
- self.svm.io.set_target_to_stderr()
- self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
- #self.svm.set_linadd_enabled(False)
- #self.svm.set_batch_computation_enabled(true)
-
- self.window_left=int(window_left)
- self.window_right=int(window_right)
-
- self.consensus=consensus
- self.wd_kernel=wd_kernel
- self.traindat=f
- self.offset = offset ;
-
-
- def get_predictions_from_seqdict(self, seqdic, site):
- """ we need to generate a huge test features object
- containing all locations found in each seqdict-sequence
- and each location (this is necessary to efficiently
- (==fast,low memory) compute the splice outputs
- """
-
- #seqlen=self.window_right+self.window_left+2
-
- num=0
- for s in seqdic:
- num+= len(s.preds[site].positions)
-
- #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
- testdat = num*[[]]
-
- k=0
- si = 0 ;
- for s in seqdic:
- sequence=s.seq
- positions=s.preds[site].positions
- si += 1
- for j in xrange(len(positions)):
- if len(positions)>50000 and j%10000==0:
- sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
- i=positions[j] - self.offset
- start = i-self.window_left
- if start<0:
- s_start='A'*(-start)
- start = 0;
- else:
- s_start = ''
- stop = i+self.window_right+2
- if stop>len(sequence):
- s_stop = 'A'*(stop-len(sequence) +1)
- stop=len(sequence) - 1 ;
- else:
- s_stop = '' ;
- s= s_start + sequence[start:stop] + s_stop
- #print len(s)
- #s=sequence[i-self.window_left:i+self.window_right+2]
- #testdat[:,k]=list(s)
- testdat[k]=s
- k+=1
-
- if len(positions)>50000:
- sys.stderr.write('\n')
-
- t=StringCharFeatures(DNA)
- t.set_string_features(testdat)
-
- self.wd_kernel.init(self.traindat, t)
- l=self.svm.classify().get_labels()
- sys.stderr.write("\n...done...\n")
-
- k=0
- for s in seqdic:
- num=len(s.preds[site].positions)
- scores= num * [0]
- for j in xrange(num):
- scores[j]=l[k]
- k+=1
- s.preds[site].set_scores(scores)
-
-
-
- def get_positions_from_seqdict(self, seqdic, site):
- for d in seqdic:
- positions=list()
- sequence=d.seq
- for cons in self.consensus:
- l=sequence.find(cons)
- while l>-1:
- #if l<len(sequence)-self.window_right-2 and l>self.window_left:
- positions.append(l+self.offset)
- l=sequence.find(cons, l+1)
- positions.sort()
- d.preds[site].set_positions(positions)
-
-class predictions(object):
- def __init__(self, positions=None, scores=None):
- self.positions=positions
- self.scores=scores
-
- def set_positions(self, positions):
- self.positions=positions;
- def get_positions(self):
- return self.positions
-
- def set_scores(self, scores):
- self.scores=scores
- def get_scores(self):
- return self.scores
-
- def __str__(self):
- return 'positions: ' + `self.positions` + 'scores: ' + `self.scores`
- def __repr__(self):
- return self.__str__()
-
-class sequence(object):
- def __init__(self, name, seq, (start,end)):
- assert(start<end<=len(seq))
- self.start=start
- self.end=end
- self.name=name
- self.seq=seq
- self.preds=dict()
- self.preds['acceptor']=predictions()
- self.preds['donor']=predictions()
-
- def __str__(self):
- s="start:" + `self.start`
- s+=" end:" + `self.end`
- s+=" name:" + `self.name`
- s+=" sequence:" + `self.seq[0:10]`
- s+="... preds:" + `self.preds`
- return s
- def __repr__(self):
- return self.__str__()
-
-def dict2seqlist(dic):
- """ takes a fasta dict as input and
- generates a list of sequence objects from it """
- sequences=list()
-
- #translate string to ACGT / all non ACGT letters are mapped to A
- tab=''
- for i in xrange(256):
- if chr(i).upper() in 'ACGT':
- tab+=chr(i).upper()
- else:
- tab+='A'
-
- for seqname in dic:
- seq=string.translate(dic[seqname], tab)
- seq=seq.upper()
- #if end<0:
- # stop=len(seq)+end
- #else:
- # stop=end
-
- sequences.append(sequence(seqname, seq, (0,len(seq))))
- #sequences.append(seq)
-
- return sequences
-
-
-
-def example():
- """a test example"""
-
-
- filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
- 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
- 'dna':'arabi',}
- parameters = {'use_gc':True,\
- 'use_splicesites':True,
- 'save_splicescores':False,}
- ss = splicesites()
- ss.load_model(filenames, parameters)
-
- ARGS = Param()
- ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
- genome_info = cPickle.load(open(ginfo_filename))
-
- (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
- = paths_load_data_solexa('training',genome_info,ARGS)
-
- dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
- splicescores = ss.predict_splicescores(filenames, parameters, dna)
-
- for exampleIdx in range(len(Sequences)):
- print str(exampleIdx)
- (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
- print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
- print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
-
-
-
-if __name__=='__main__':
- example()