+ added splicesite scores to training
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 16:55:02 +0000 (16:55 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 16:55:02 +0000 (16:55 +0000)
+ moved splicesite stuff to tools dir of the qpalma module
TODO
+ find index +1 bug in the decoded SpliceAlign vectors

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7546 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/computeSpliceWeights.py
qpalma/tools/io_qpalma.py [new file with mode: 0644]
qpalma/tools/splicesites.py [new file with mode: 0644]
scripts/qpalma_predict.py
scripts/qpalma_train.py
tools/io_qpalma.py [deleted file]
tools/splicesites.py [deleted file]

index 7fc8667..59520be 100644 (file)
@@ -46,14 +46,20 @@ def calculateWeights(plf, scores):
 
    return currentWeight
 
-def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
+def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp,dec=False):
    ####################################################################################
    # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
    ####################################################################################
 
    # Picke die Positionen raus, an denen eine Donorstelle ist
-   DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
-   assert not ( -inf in DonorScores )
+   try:
+      if dec:
+         DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos] == 1]
+      else:
+         DonorScores = [elem for pos,elem in enumerate(don_supp) if pos > 0 and SpliceAlign[pos-1] == 1]
+      assert not ( -inf in DonorScores )
+   except:
+      pdb.set_trace()
 
    #print 'donor'
    weightDon = calculateWeights(d,DonorScores)
@@ -63,8 +69,14 @@ def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
    ####################################################################################
 
    #Den Vektor Acceptorstellen durchgehen und die Gewichtsvektoren belasten:
-   AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
-   assert not ( -inf in AcceptorScores )
+   try:
+      if dec:
+         AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
+      else:
+         AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos] == 2]
+      assert not ( -inf in AcceptorScores )
+   except:
+      pdb.set_trace()
 
    #print 'acceptor'
    weightAcc = calculateWeights(a,AcceptorScores)
diff --git a/qpalma/tools/io_qpalma.py b/qpalma/tools/io_qpalma.py
new file mode 100644 (file)
index 0000000..0f9bf29
--- /dev/null
@@ -0,0 +1,235 @@
+
+"""
+Functions for parsing palma parameter files,
+to extract splice site predictors for qpalma.
+"""
+
+import bz2
+import sys
+import numpy
+from numpy import mat,array,chararray,inf
+
+class palma_model():
+    """A container for palma parameters"""
+    
+    #acceptor
+    acceptor_bins=None
+    acceptor_limits=None
+    acceptor_penalties=None
+
+    acceptor_splice_b=None
+    acceptor_splice_order=None
+    acceptor_splice_window_left=None
+    acceptor_splice_window_right=None
+    acceptor_splice_alphas=None
+    acceptor_splice_svs=None
+
+    #donor
+    donor_bins=None
+    donor_limits=None
+    donor_penalties=None
+    
+    donor_splice_b=None
+    donor_splice_order=None
+    #donor_splice_use_gc=None
+    donor_splice_window_left=None
+    donor_splice_window_right=None
+    donor_splice_alphas=None
+    donor_splice_svs=None
+
+    # intron length
+    intron_len_bins=None
+    intron_len_limits=None
+    intron_len_penalties=None
+    intron_len_min=None
+    intron_len_max=None
+    intron_len_transform=None
+
+    gene_len_max = None
+    
+    # substitution matrix
+    substitution_matrix=None 
+
+
+
+def parse_file(filename):
+    if filename.endswith('.param.bz2'):
+        fileptr=bz2.BZ2File(filename);
+    else:
+        sys.stderr.write('Expected palma parameter file with ending param.bz2\n')
+    m=palma_model()
+
+    sys.stdout.write("loading model file"); sys.stdout.flush()
+
+    l=fileptr.readline();
+
+    if l != '%palma definition file version: 1.0\n':
+        sys.stderr.write("\nfile not a palma definition file\n")
+        print l
+        return None
+
+    while l:
+        if not ( l.startswith('%') or l.startswith('\n') ): # comment
+            
+            #acceptor
+            if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
+            if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, fileptr, 'acceptor_limits')
+            if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, fileptr, 'acceptor_penalties') #DEAD
+
+            if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
+            if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
+            if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
+            if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
+            if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, fileptr, 'acceptor_splice_alphas')
+            if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, fileptr, 'acceptor_splice_svs')
+
+            #donor
+            if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
+            if m.donor_limits is None: m.donor_limits=parse_vector(l, fileptr, 'donor_limits')
+            if m.donor_penalties is None: m.donor_penalties=parse_vector(l, fileptr, 'donor_penalties') #DEAD
+
+            if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
+            if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
+            #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
+            if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
+            if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
+            if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, fileptr, 'donor_splice_alphas')
+            if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, fileptr, 'donor_splice_svs')
+
+
+            # intron length
+            if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
+            if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, fileptr, 'intron_len_limits')
+            if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, fileptr, 'intron_len_penalties')
+            if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
+            if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
+            if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
+
+            # gene length
+            if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
+
+            if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, fileptr, 'substitution_matrix')
+
+        l=fileptr.readline()
+
+    sys.stderr.write('done\n')
+    return m
+
+def parse_value(line, name):
+    if (line.startswith(name)):
+        #print 'found ' + name
+        sys.stdout.write('.'); sys.stdout.flush()
+        str = line[line.find('=')+1:-1] ;
+        if str[0] == "'":
+            return str[1:-1]
+        else:
+            #print float(str)
+            return float(str)
+    else:
+        return None
+
+def parse_vector(line, fileptr, name):
+    mat = parse_matrix(line, fileptr, name)
+    if mat is None:
+     return mat
+    else:
+     mat = numpy.array(mat).flatten()
+     return mat 
+
+def parse_matrix(line, fileptr, name):
+    if (line.startswith(name)):
+        sys.stdout.write('.'); sys.stdout.flush()
+        if line.find(']') < 0:
+            l=''
+            while l is not None and l.find(']') < 0:
+                line+=l
+                l=fileptr.readline()
+            if l is not None and l.find(']') >= 0:
+                line+=l
+
+        if line.find(']') < 0:
+            sys.stderr.write("matrix `" + name + "' ended without ']'\n")
+            return None
+        else:
+            return mat(line[line.find('['):line.find(']')+1])
+    else:
+        return None
+
+def parse_string(line, fileptr, name):
+    if (line.startswith(name)):
+        sys.stdout.write('.'); sys.stdout.flush()
+        l=''
+        lines=[]
+        while l is not None and l.find(']') < 0:
+            if l:
+                lines+=[list(l[:-1])]
+            l=fileptr.readline()
+
+        if l.find(']') < 0:
+            sys.stderr.write("string ended without ']'\n")
+            return None
+        else:
+            #seqlen=len(lines[0])
+            num=len(lines)
+            #trdat = chararray((seqlen,num),1,order='FORTRAN')
+            #for i in xrange(num):
+            #    trdat[:,i]=lines[i]
+            trdat = num*[[]]
+            for idx,example in enumerate(lines):
+                trdat[idx] = ''.join(example)
+            return trdat
+    else:
+        return None
+
+if __name__ == '__main__':
+    #import hotshot, hotshot.stats
+
+    def load():
+        f='/fml/ag-raetsch/share/projects/palma/param_files/arabidopsis_ath1_ss=1_il=1.param.bz2'
+        m=parse_file(f);
+
+        print m.acceptor_bins is None
+        print m.acceptor_limits is None
+        print m.acceptor_penalties is None
+
+        print m.acceptor_splice_b is None
+        print m.acceptor_splice_order is None
+        print m.acceptor_splice_window_left is None
+        print m.acceptor_splice_window_right is None
+        print m.acceptor_splice_alphas is None
+        print m.acceptor_splice_svs is None
+
+        print m.donor_bins is None
+        print m.donor_limits is None
+        print m.donor_penalties is None
+
+        print m.donor_splice_b is None
+        print m.donor_splice_order is None
+        #print m.donor_splice_use_gc is None
+        print m.donor_splice_window_left is None
+        print m.donor_splice_window_right is None
+        print m.donor_splice_alphas is None
+        print m.donor_splice_svs is None
+        print m.intron_len_bins is None
+        print m.intron_len_limits is None
+        print m.intron_len_penalties is None
+        print m.intron_len_min is None
+        print m.intron_len_max is None
+        print m.intron_len_transform is None
+
+        print m.substitution_matrix is None
+
+        print 'intron_len_transform'
+        print m.intron_len_transform
+        print 'substitution_matrix'
+        print m.substitution_matrix
+
+    load()
+
+    #prof = hotshot.Profile("model.prof")
+    #benchtime = prof.runcall(load)
+    #prof.close()
+    #stats = hotshot.stats.load("model.prof")
+    #stats.strip_dirs()
+    #stats.sort_stats('time', 'calls')
+    #stats.print_stats(20)
diff --git a/qpalma/tools/splicesites.py b/qpalma/tools/splicesites.py
new file mode 100644 (file)
index 0000000..6e0a4a3
--- /dev/null
@@ -0,0 +1,420 @@
+
+import sys
+import os
+import bz2
+import numpy
+import string
+import io_qpalma
+
+from shogun.Classifier import SVM
+from shogun.Features import StringCharFeatures,DNA
+from shogun.Kernel import WeightedDegreeStringKernel
+
+import cPickle
+
+from qpalma.DataProc import *
+from qpalma.TrainingParam import Param
+
+
+class splicesites():
+    """splice site scores (both donor and acceptor)"""
+
+    def __init__(self):
+        self.model = None
+        self.signal = None
+        self.donor_splice_use_gc = None
+
+    def load_model(self, filenames, parameters):
+        """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
+        try:
+            param_file = file(filenames['data_dir']+filenames['param'])
+        except IOError:
+            sys.stderr.write("Can't open PARAM FILE (%s) to read\n" %(filenames['data_dir']+filenames['param']))
+            sys.exit(2)
+
+        #<------------------------------------------------------------------------------>
+        filename = filenames['data_dir']+filenames['param']
+        self.model = io_qpalma.parse_file(filename)
+
+        self.donor_splice_use_gc = parameters['use_gc'] ;
+        #self.plif=plif(self.model)
+        if parameters['use_splicesites']:
+            self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
+
+
+    def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
+        splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2' ;
+        #if os.path.isfile(splicescores_picklefile):
+        if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
+            print 'loading splice site predictions from ' + splicescores_picklefile
+            splicescores = self.load_splicescores_index(splicescores_picklefile)
+            splicescores_precomputed = True
+        else:
+            splicescores = {} ;
+            splicescores_precomputed = False 
+                        
+            #get donor/acceptor signal predictions for all sequences
+            if parameters['use_splicesites']:
+                seqs= dict2seqlist(dna_fasta_dict)
+                self.signal.predict_acceptor_sites_from_seqdict(seqs)
+                self.signal.predict_donor_sites_from_seqdict(seqs)
+
+                
+        for counter in xrange(len(dna_fasta_dict.keys())):
+            # DNA and mRNA sequence
+            if parameters['use_splicesites'] and not splicescores_precomputed:
+                seq = seqs[counter]
+            tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]] # dna_sequences is list of strings
+            #assert(seq.sequence == tSeq)
+                        
+            tName = dna_fasta_dict.keys()[counter] 
+            tSize = len(tSeq)
+                        
+            # Splice site scores
+            if parameters['use_splicesites']:
+                if splicescores_precomputed:
+                    assert(splicescores.has_key(tName + '_acceptor_splicescores'))
+                    assert(splicescores.has_key(tName + '_donor_splicescores'))
+                else:
+                    acc_support_values = [-numpy.inf]*tSize
+                    for i in xrange(len(seq.preds['acceptor'].positions)):
+                        acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i] ;
+                                                
+                    don_support_values = [-numpy.inf]*tSize
+                    for i in xrange(len(seq.preds['donor'].positions)):
+                        don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i] ;
+            else:
+                don_support_values = [-numpy.inf]*tSize
+                for cnt in range(tSize-1):
+                    if (tSeq[cnt] in 'Gg') \
+                           and ((tSeq[cnt+1] in 'Tt') \
+                                or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
+                        don_support_values[cnt] = 1
+                assert(len(don_support_values)==tSize)
+
+                acc_support_values = [-numpy.inf]*tSize
+                for cnt in range(tSize-1):
+                    if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
+                        acc_support_values[cnt+1] = 1
+                assert(len(acc_support_values)==tSize)
+
+            if not splicescores.has_key(tName + '_acceptor_splicescores'):
+                splicescores[tName + '_acceptor_splicescores'] = acc_support_values 
+                                
+            if not splicescores.has_key(tName + '_donor_splicescores'):
+                splicescores[tName + '_donor_splicescores'] = don_support_values 
+
+
+        # write the splice site predictions into a pickle file
+        if (not splicescores_precomputed) and (parameters['use_splicesites']) and parameters['save_splicescores']:
+            print 'saving splice site scores to ' + splicescores_picklefile
+            self.save_splicescores(splicescores_picklefile, splicescores)
+
+                
+        return splicescores
+
+
+
+    def get_splicescores(self, index, key, start, end):
+        if not index.has_key('XXX_palma_filename'):
+            seq = index[key] ;
+            if end==start==0:
+                return seq
+            else:
+                return seq[start:end]
+        else:
+            (num, fstart, fend) = index[key] ;
+            filename = index['XXX_palma_filename']
+            if filename.endswith('.bz2'):
+                f = bz2.BZ2File(filename, 'rb');
+            else:
+                if filename.endswith('.gz'):
+                    f = gzip.GzipFile(filename, 'rb');
+                else:
+                    f = file(filename, 'rb')
+
+            f.seek(fstart + start*4, 0) # sizeof float = 4!?
+            ss = numpy.array('f') ;
+            if end==start==0:
+                ss.read(f, num)
+            else:
+                ss.read(f, end-start)
+
+            return ss.tolist()
+
+    def compute_donacc(self,splicescores,tName):
+        """Return two lists of doubles corresponding
+        to the donor and acceptor splice site scores respectively
+        """
+        acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
+        don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)
+
+        return (don_support_values,acc_support_values)
+
+
+
+
+
+class signal_detectors():
+    def __init__(self, model, donor_splice_use_gc):
+        if donor_splice_use_gc:
+            donor_consensus=['GC','GT']
+        else:
+            donor_consensus=['GT']
+        
+        self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
+                numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b, 
+                (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
+        self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs, 
+                numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b, 
+                (model.donor_splice_window_left, 0, model.donor_splice_window_right),
+                donor_consensus)
+
+    def predict_acceptor_sites_from_seqdict(self, seqs):
+        self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
+        sys.stderr.write("computing svm output for acceptor positions\n")
+        self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
+
+    def predict_donor_sites_from_seqdict(self, seqs): 
+        self.donor.get_positions_from_seqdict(seqs, 'donor')
+        sys.stderr.write("computing svm output for donor positions\n")
+        self.donor.get_predictions_from_seqdict(seqs, 'donor')
+
+
+
+class svm_splice_model():
+    """The SVM for predicting splice site scores"""
+    
+    def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
+
+        f=StringCharFeatures(DNA)
+        f.set_string_features(traindat)
+        wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
+        wd_kernel.io.set_target_to_stderr()
+
+        self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
+        self.svm.io.set_target_to_stderr()
+        self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
+        #self.svm.set_linadd_enabled(False)
+        #self.svm.set_batch_computation_enabled(true)
+
+        self.window_left=int(window_left)
+        self.window_right=int(window_right)
+
+        self.consensus=consensus
+        self.wd_kernel=wd_kernel
+        self.traindat=f
+        self.offset = offset ;
+
+
+    def get_predictions_from_seqdict(self, seqdic, site):
+        """ we need to generate a huge test features object 
+            containing all locations found in each seqdict-sequence
+            and each location (this is necessary to efficiently
+            (==fast,low memory) compute the splice outputs
+        """
+
+        #seqlen=self.window_right+self.window_left+2
+
+        num=0
+        for s in seqdic:
+            num+= len(s.preds[site].positions)
+
+        #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
+        testdat = num*[[]]
+
+        k=0
+        si = 0 ;
+        for s in seqdic:
+            sequence=s.seq
+            positions=s.preds[site].positions
+            si += 1 
+            for j in xrange(len(positions)):
+                if len(positions)>50000 and j%10000==0:
+                    sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
+                i=positions[j] - self.offset
+                start = i-self.window_left
+                if start<0:
+                    s_start='A'*(-start)
+                    start =  0;
+                else:
+                    s_start = ''
+                stop = i+self.window_right+2
+                if stop>len(sequence):
+                    s_stop = 'A'*(stop-len(sequence) +1)
+                    stop=len(sequence) - 1 ;
+                else:
+                    s_stop = '' ;
+                s= s_start + sequence[start:stop] + s_stop
+                #print len(s)
+                #s=sequence[i-self.window_left:i+self.window_right+2]
+                #testdat[:,k]=list(s)
+                testdat[k]=s
+                k+=1
+                
+        if len(positions)>50000:
+            sys.stderr.write('\n')
+
+        t=StringCharFeatures(DNA)
+        t.set_string_features(testdat)
+
+        self.wd_kernel.init(self.traindat, t)
+        l=self.svm.classify().get_labels()
+        sys.stderr.write("\n...done...\n")
+
+        k=0
+        for s in seqdic:
+            num=len(s.preds[site].positions)
+            scores= num * [0]
+            for j in xrange(num):
+                scores[j]=l[k]
+                k+=1
+            s.preds[site].set_scores(scores)
+
+
+
+    def get_positions_from_seqdict(self, seqdic, site):
+        for d in seqdic:
+            positions=list()
+            sequence=d.seq
+            for cons in self.consensus:
+                l=sequence.find(cons)
+                while l>-1:
+                    #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+                    positions.append(l+self.offset)
+                    l=sequence.find(cons, l+1)
+            positions.sort()
+            d.preds[site].set_positions(positions)
+
+class predictions(object):
+    def __init__(self, positions=None, scores=None):
+        self.positions=positions
+        self.scores=scores
+
+    def set_positions(self, positions):
+        self.positions=positions;
+    def get_positions(self):
+        return self.positions
+
+    def set_scores(self, scores):
+        self.scores=scores
+    def get_scores(self):
+        return self.scores
+
+    def __str__(self):
+        return 'positions: ' + `self.positions` + 'scores: ' + `self.scores`
+    def __repr__(self):
+        return self.__str__()
+
+class sequence(object):
+    def __init__(self, name, seq, (start,end)):
+        assert(start<end<=len(seq))
+        self.start=start
+        self.end=end
+        self.name=name
+        self.seq=seq
+        self.preds=dict()
+        self.preds['acceptor']=predictions()
+        self.preds['donor']=predictions()
+
+    def __str__(self):
+        s="start:" + `self.start` 
+        s+=" end:" + `self.end`
+        s+=" name:" + `self.name`
+        s+=" sequence:" + `self.seq[0:10]`
+        s+="... preds:" + `self.preds`
+        return s
+    def __repr__(self):
+        return self.__str__()
+
+def dict2seqlist(dic):
+    """ takes a fasta dict as input and
+    generates a list of sequence objects from it """
+    sequences=list()
+
+    #translate string to ACGT / all non ACGT letters are mapped to A
+    tab=''
+    for i in xrange(256):
+        if chr(i).upper() in 'ACGT':
+            tab+=chr(i).upper()
+        else:
+            tab+='A'
+
+    for seqname in dic:
+        seq=string.translate(dic[seqname], tab) 
+        seq=seq.upper()
+        #if end<0:
+        #    stop=len(seq)+end
+        #else:
+        #    stop=end
+
+        sequences.append(sequence(seqname, seq, (0,len(seq))))
+        #sequences.append(seq)
+        
+    return sequences
+
+
+def getDonAccScores(Sequences):
+    """a test example"""
+    
+
+    filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
+                 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
+                 'dna':'arabi',}
+    parameters = {'use_gc':True,\
+                  'use_splicesites':True,
+                  'save_splicescores':False,}
+    ss = splicesites()
+    ss.load_model(filenames, parameters)    
+
+    dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
+    splicescores = ss.predict_splicescores(filenames, parameters, dna)
+
+    Donors = []
+    Acceptors = []
+    
+    for exampleIdx in range(len(Sequences)):
+        #print str(exampleIdx)
+        (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
+        Donors.append(don_score)
+        Acceptors.append(acc_score)
+        print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+        print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+
+    #pdb.set_trace()
+
+    return Donors, Acceptors
+
+def example():
+    """a test example"""
+    
+
+    filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
+                 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
+                 'dna':'arabi',}
+    parameters = {'use_gc':True,\
+                  'use_splicesites':True,
+                  'save_splicescores':False,}
+    ss = splicesites()
+    ss.load_model(filenames, parameters)    
+
+    ARGS = Param()
+    ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
+    genome_info = cPickle.load(open(ginfo_filename))
+    
+    (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
+                = paths_load_data_solexa('training',genome_info,ARGS)
+
+    dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
+    splicescores = ss.predict_splicescores(filenames, parameters, dna)
+    
+    for exampleIdx in range(len(Sequences)):
+        print str(exampleIdx)
+        (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
+        print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+        print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+        
+
+
+if __name__=='__main__':
+    example()
index dc10256..d340a73 100644 (file)
@@ -72,6 +72,15 @@ class QPalma:
       elif Conf.mode == 'using_quality_scores':
          Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
 
+         end = 30
+         Sequences   = Sequences[:end]
+         Acceptors   = Acceptors[:end]
+         Donors      = Donors[:end]
+         Exons       = Exons[:end]
+         Ests        = Ests[:end]
+         Qualities   = Qualities[:end]
+         SplitPos    = SplitPos[:end]
+
          #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          #pdb.set_trace()
@@ -96,7 +105,7 @@ class QPalma:
 
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
       #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_82.pickle'
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_20.pickle'
       param = load_param(param_filename)
 
       # Set the parameters such as limits penalties for the Plifs
@@ -301,6 +310,7 @@ class QPalma:
          #pdb.set_trace()
       
          up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         print up_off,down_off 
 
          if up_off > -1:
             total_up_off.append(up_off)
@@ -396,7 +406,7 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
       if oldElem == 0 and elem != 0:
          newExons.append(pos-1)
 
-   #pdb.set_trace()
+   pdb.set_trace()
    up_off   = -1
    down_off = -1
 
index 0c83f16..92f9021 100644 (file)
@@ -35,6 +35,8 @@ import qpalma.Configuration
 from qpalma.Plif import Plf
 from qpalma.Helpers import *
 
+from qpalma.tools.splicesites import getDonAccScores
+
 def getQualityFeatureCounts(qualityPlifs):
    weightQuality = qualityPlifs[0].penalties
    for currentPlif in qualityPlifs[1:]:
@@ -71,6 +73,15 @@ class QPalma:
       elif Configuration.mode == 'using_quality_scores':
          Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
 
+         end = 30
+         Sequences   = Sequences[:end]
+         Exons       = Exons[:end]
+         Ests        = Ests[:end]
+         Qualities   = Qualities[:end]
+         SplitPos    = SplitPos[:end]
+
+         Donors, Acceptors = getDonAccScores(Sequences)
+
          #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
 
          #Sequences_, Acceptors_, Donors_, Exons_, Ests_, Qualities_ = generateData(100)
@@ -168,6 +179,7 @@ class QPalma:
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
             
+            #pdb.set_trace()
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
@@ -307,7 +319,9 @@ class QPalma:
             path_loss   = [0]*(num_path[exampleIdx])
 
             for pathNr in range(num_path[exampleIdx]):
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,
+               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,
+               acc_supp,True)
 
                decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
                for qidx in range(Configuration.totalQualSuppPoints):
diff --git a/tools/io_qpalma.py b/tools/io_qpalma.py
deleted file mode 100644 (file)
index 0f9bf29..0000000
+++ /dev/null
@@ -1,235 +0,0 @@
-
-"""
-Functions for parsing palma parameter files,
-to extract splice site predictors for qpalma.
-"""
-
-import bz2
-import sys
-import numpy
-from numpy import mat,array,chararray,inf
-
-class palma_model():
-    """A container for palma parameters"""
-    
-    #acceptor
-    acceptor_bins=None
-    acceptor_limits=None
-    acceptor_penalties=None
-
-    acceptor_splice_b=None
-    acceptor_splice_order=None
-    acceptor_splice_window_left=None
-    acceptor_splice_window_right=None
-    acceptor_splice_alphas=None
-    acceptor_splice_svs=None
-
-    #donor
-    donor_bins=None
-    donor_limits=None
-    donor_penalties=None
-    
-    donor_splice_b=None
-    donor_splice_order=None
-    #donor_splice_use_gc=None
-    donor_splice_window_left=None
-    donor_splice_window_right=None
-    donor_splice_alphas=None
-    donor_splice_svs=None
-
-    # intron length
-    intron_len_bins=None
-    intron_len_limits=None
-    intron_len_penalties=None
-    intron_len_min=None
-    intron_len_max=None
-    intron_len_transform=None
-
-    gene_len_max = None
-    
-    # substitution matrix
-    substitution_matrix=None 
-
-
-
-def parse_file(filename):
-    if filename.endswith('.param.bz2'):
-        fileptr=bz2.BZ2File(filename);
-    else:
-        sys.stderr.write('Expected palma parameter file with ending param.bz2\n')
-    m=palma_model()
-
-    sys.stdout.write("loading model file"); sys.stdout.flush()
-
-    l=fileptr.readline();
-
-    if l != '%palma definition file version: 1.0\n':
-        sys.stderr.write("\nfile not a palma definition file\n")
-        print l
-        return None
-
-    while l:
-        if not ( l.startswith('%') or l.startswith('\n') ): # comment
-            
-            #acceptor
-            if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
-            if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, fileptr, 'acceptor_limits')
-            if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, fileptr, 'acceptor_penalties') #DEAD
-
-            if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
-            if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
-            if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
-            if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
-            if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, fileptr, 'acceptor_splice_alphas')
-            if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, fileptr, 'acceptor_splice_svs')
-
-            #donor
-            if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
-            if m.donor_limits is None: m.donor_limits=parse_vector(l, fileptr, 'donor_limits')
-            if m.donor_penalties is None: m.donor_penalties=parse_vector(l, fileptr, 'donor_penalties') #DEAD
-
-            if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
-            if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
-            #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
-            if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
-            if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
-            if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, fileptr, 'donor_splice_alphas')
-            if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, fileptr, 'donor_splice_svs')
-
-
-            # intron length
-            if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
-            if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, fileptr, 'intron_len_limits')
-            if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, fileptr, 'intron_len_penalties')
-            if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
-            if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
-            if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
-
-            # gene length
-            if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
-
-            if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, fileptr, 'substitution_matrix')
-
-        l=fileptr.readline()
-
-    sys.stderr.write('done\n')
-    return m
-
-def parse_value(line, name):
-    if (line.startswith(name)):
-        #print 'found ' + name
-        sys.stdout.write('.'); sys.stdout.flush()
-        str = line[line.find('=')+1:-1] ;
-        if str[0] == "'":
-            return str[1:-1]
-        else:
-            #print float(str)
-            return float(str)
-    else:
-        return None
-
-def parse_vector(line, fileptr, name):
-    mat = parse_matrix(line, fileptr, name)
-    if mat is None:
-     return mat
-    else:
-     mat = numpy.array(mat).flatten()
-     return mat 
-
-def parse_matrix(line, fileptr, name):
-    if (line.startswith(name)):
-        sys.stdout.write('.'); sys.stdout.flush()
-        if line.find(']') < 0:
-            l=''
-            while l is not None and l.find(']') < 0:
-                line+=l
-                l=fileptr.readline()
-            if l is not None and l.find(']') >= 0:
-                line+=l
-
-        if line.find(']') < 0:
-            sys.stderr.write("matrix `" + name + "' ended without ']'\n")
-            return None
-        else:
-            return mat(line[line.find('['):line.find(']')+1])
-    else:
-        return None
-
-def parse_string(line, fileptr, name):
-    if (line.startswith(name)):
-        sys.stdout.write('.'); sys.stdout.flush()
-        l=''
-        lines=[]
-        while l is not None and l.find(']') < 0:
-            if l:
-                lines+=[list(l[:-1])]
-            l=fileptr.readline()
-
-        if l.find(']') < 0:
-            sys.stderr.write("string ended without ']'\n")
-            return None
-        else:
-            #seqlen=len(lines[0])
-            num=len(lines)
-            #trdat = chararray((seqlen,num),1,order='FORTRAN')
-            #for i in xrange(num):
-            #    trdat[:,i]=lines[i]
-            trdat = num*[[]]
-            for idx,example in enumerate(lines):
-                trdat[idx] = ''.join(example)
-            return trdat
-    else:
-        return None
-
-if __name__ == '__main__':
-    #import hotshot, hotshot.stats
-
-    def load():
-        f='/fml/ag-raetsch/share/projects/palma/param_files/arabidopsis_ath1_ss=1_il=1.param.bz2'
-        m=parse_file(f);
-
-        print m.acceptor_bins is None
-        print m.acceptor_limits is None
-        print m.acceptor_penalties is None
-
-        print m.acceptor_splice_b is None
-        print m.acceptor_splice_order is None
-        print m.acceptor_splice_window_left is None
-        print m.acceptor_splice_window_right is None
-        print m.acceptor_splice_alphas is None
-        print m.acceptor_splice_svs is None
-
-        print m.donor_bins is None
-        print m.donor_limits is None
-        print m.donor_penalties is None
-
-        print m.donor_splice_b is None
-        print m.donor_splice_order is None
-        #print m.donor_splice_use_gc is None
-        print m.donor_splice_window_left is None
-        print m.donor_splice_window_right is None
-        print m.donor_splice_alphas is None
-        print m.donor_splice_svs is None
-        print m.intron_len_bins is None
-        print m.intron_len_limits is None
-        print m.intron_len_penalties is None
-        print m.intron_len_min is None
-        print m.intron_len_max is None
-        print m.intron_len_transform is None
-
-        print m.substitution_matrix is None
-
-        print 'intron_len_transform'
-        print m.intron_len_transform
-        print 'substitution_matrix'
-        print m.substitution_matrix
-
-    load()
-
-    #prof = hotshot.Profile("model.prof")
-    #benchtime = prof.runcall(load)
-    #prof.close()
-    #stats = hotshot.stats.load("model.prof")
-    #stats.strip_dirs()
-    #stats.sort_stats('time', 'calls')
-    #stats.print_stats(20)
diff --git a/tools/splicesites.py b/tools/splicesites.py
deleted file mode 100644 (file)
index 7750f75..0000000
+++ /dev/null
@@ -1,390 +0,0 @@
-
-import sys
-import os
-import bz2
-import numpy
-import string
-import io_qpalma
-
-from shogun.Classifier import SVM
-from shogun.Features import StringCharFeatures,DNA
-from shogun.Kernel import WeightedDegreeStringKernel
-
-import cPickle
-
-from qpalma.DataProc import *
-from qpalma.TrainingParam import Param
-
-
-class splicesites():
-    """splice site scores (both donor and acceptor)"""
-
-    def __init__(self):
-        self.model = None
-        self.signal = None
-        self.donor_splice_use_gc = None
-
-    def load_model(self, filenames, parameters):
-        """Load donor SVM, acceptor SVM, intron length score and set 'use_gc' parameter"""
-        try:
-            param_file = file(filenames['data_dir']+filenames['param'])
-        except IOError:
-            sys.stderr.write("Can't open PARAM FILE (%s) to read\n" %(filenames['data_dir']+filenames['param']))
-            sys.exit(2)
-
-        #<------------------------------------------------------------------------------>
-        filename = filenames['data_dir']+filenames['param']
-        self.model = io_qpalma.parse_file(filename)
-
-        self.donor_splice_use_gc = parameters['use_gc'] ;
-        #self.plif=plif(self.model)
-        if parameters['use_splicesites']:
-            self.signal = signal_detectors(self.model, self.donor_splice_use_gc)
-
-
-    def predict_splicescores(self, filenames, parameters, dna_fasta_dict):
-        splicescores_picklefile = filenames['dna'] + '.palma_splicesites.bz2' ;
-        #if os.path.isfile(splicescores_picklefile):
-        if parameters['use_splicesites'] and os.path.isfile(splicescores_picklefile):
-            print 'loading splice site predictions from ' + splicescores_picklefile
-            splicescores = self.load_splicescores_index(splicescores_picklefile)
-            splicescores_precomputed = True
-        else:
-            splicescores = {} ;
-            splicescores_precomputed = False 
-                        
-            #get donor/acceptor signal predictions for all sequences
-            if parameters['use_splicesites']:
-                seqs= dict2seqlist(dna_fasta_dict)
-                self.signal.predict_acceptor_sites_from_seqdict(seqs)
-                self.signal.predict_donor_sites_from_seqdict(seqs)
-
-                
-        for counter in xrange(len(dna_fasta_dict.keys())):
-            # DNA and mRNA sequence
-            if parameters['use_splicesites'] and not splicescores_precomputed:
-                seq = seqs[counter]
-            tSeq = dna_fasta_dict[dna_fasta_dict.keys()[counter]] # dna_sequences is list of strings
-            #assert(seq.sequence == tSeq)
-                        
-            tName = dna_fasta_dict.keys()[counter] 
-            tSize = len(tSeq)
-                        
-            # Splice site scores
-            if parameters['use_splicesites']:
-                if splicescores_precomputed:
-                    assert(splicescores.has_key(tName + '_acceptor_splicescores'))
-                    assert(splicescores.has_key(tName + '_donor_splicescores'))
-                else:
-                    acc_support_values = [-numpy.inf]*tSize
-                    for i in xrange(len(seq.preds['acceptor'].positions)):
-                        acc_support_values[seq.preds['acceptor'].positions[i]-1] = seq.preds['acceptor'].scores[i] ;
-                                                
-                    don_support_values = [-numpy.inf]*tSize
-                    for i in xrange(len(seq.preds['donor'].positions)):
-                        don_support_values[seq.preds['donor'].positions[i]] = seq.preds['donor'].scores[i] ;
-            else:
-                don_support_values = [-numpy.inf]*tSize
-                for cnt in range(tSize-1):
-                    if (tSeq[cnt] in 'Gg') \
-                           and ((tSeq[cnt+1] in 'Tt') \
-                                or ((tSeq[cnt+1] in 'cC') and parameters['use_gc'])):
-                        don_support_values[cnt] = 1
-                assert(len(don_support_values)==tSize)
-
-                acc_support_values = [-numpy.inf]*tSize
-                for cnt in range(tSize-1):
-                    if (tSeq[cnt+1] in 'Gg') and (tSeq[cnt] in 'aA'):
-                        acc_support_values[cnt+1] = 1
-                assert(len(acc_support_values)==tSize)
-
-            if not splicescores.has_key(tName + '_acceptor_splicescores'):
-                splicescores[tName + '_acceptor_splicescores'] = acc_support_values 
-                                
-            if not splicescores.has_key(tName + '_donor_splicescores'):
-                splicescores[tName + '_donor_splicescores'] = don_support_values 
-
-
-        # write the splice site predictions into a pickle file
-        if (not splicescores_precomputed) and (parameters['use_splicesites']) and parameters['save_splicescores']:
-            print 'saving splice site scores to ' + splicescores_picklefile
-            self.save_splicescores(splicescores_picklefile, splicescores)
-
-                
-        return splicescores
-
-
-
-    def get_splicescores(self, index, key, start, end):
-        if not index.has_key('XXX_palma_filename'):
-            seq = index[key] ;
-            if end==start==0:
-                return seq
-            else:
-                return seq[start:end]
-        else:
-            (num, fstart, fend) = index[key] ;
-            filename = index['XXX_palma_filename']
-            if filename.endswith('.bz2'):
-                f = bz2.BZ2File(filename, 'rb');
-            else:
-                if filename.endswith('.gz'):
-                    f = gzip.GzipFile(filename, 'rb');
-                else:
-                    f = file(filename, 'rb')
-
-            f.seek(fstart + start*4, 0) # sizeof float = 4!?
-            ss = numpy.array('f') ;
-            if end==start==0:
-                ss.read(f, num)
-            else:
-                ss.read(f, end-start)
-
-            return ss.tolist()
-
-    def compute_donacc(self,splicescores,tName):
-        """Return two lists of doubles corresponding
-        to the donor and acceptor splice site scores respectively
-        """
-        acc_support_values = self.get_splicescores(splicescores, tName + '_acceptor_splicescores', 0, 0)
-        don_support_values = self.get_splicescores(splicescores, tName + '_donor_splicescores', 0, 0)
-
-        return (don_support_values,acc_support_values)
-
-
-
-
-
-class signal_detectors():
-    def __init__(self, model, donor_splice_use_gc):
-        if donor_splice_use_gc:
-            donor_consensus=['GC','GT']
-        else:
-            donor_consensus=['GT']
-        
-        self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
-                numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b, 
-                (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
-        self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs, 
-                numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b, 
-                (model.donor_splice_window_left, 0, model.donor_splice_window_right),
-                donor_consensus)
-
-    def predict_acceptor_sites_from_seqdict(self, seqs):
-        self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
-        sys.stderr.write("computing svm output for acceptor positions\n")
-        self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
-
-    def predict_donor_sites_from_seqdict(self, seqs): 
-        self.donor.get_positions_from_seqdict(seqs, 'donor')
-        sys.stderr.write("computing svm output for donor positions\n")
-        self.donor.get_predictions_from_seqdict(seqs, 'donor')
-
-
-
-class svm_splice_model():
-    """The SVM for predicting splice site scores"""
-    
-    def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
-
-        f=StringCharFeatures(DNA)
-        f.set_string_features(traindat)
-        wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
-        wd_kernel.io.set_target_to_stderr()
-
-        self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
-        self.svm.io.set_target_to_stderr()
-        self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
-        #self.svm.set_linadd_enabled(False)
-        #self.svm.set_batch_computation_enabled(true)
-
-        self.window_left=int(window_left)
-        self.window_right=int(window_right)
-
-        self.consensus=consensus
-        self.wd_kernel=wd_kernel
-        self.traindat=f
-        self.offset = offset ;
-
-
-    def get_predictions_from_seqdict(self, seqdic, site):
-        """ we need to generate a huge test features object 
-            containing all locations found in each seqdict-sequence
-            and each location (this is necessary to efficiently
-            (==fast,low memory) compute the splice outputs
-        """
-
-        #seqlen=self.window_right+self.window_left+2
-
-        num=0
-        for s in seqdic:
-            num+= len(s.preds[site].positions)
-
-        #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
-        testdat = num*[[]]
-
-        k=0
-        si = 0 ;
-        for s in seqdic:
-            sequence=s.seq
-            positions=s.preds[site].positions
-            si += 1 
-            for j in xrange(len(positions)):
-                if len(positions)>50000 and j%10000==0:
-                    sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
-                i=positions[j] - self.offset
-                start = i-self.window_left
-                if start<0:
-                    s_start='A'*(-start)
-                    start =  0;
-                else:
-                    s_start = ''
-                stop = i+self.window_right+2
-                if stop>len(sequence):
-                    s_stop = 'A'*(stop-len(sequence) +1)
-                    stop=len(sequence) - 1 ;
-                else:
-                    s_stop = '' ;
-                s= s_start + sequence[start:stop] + s_stop
-                #print len(s)
-                #s=sequence[i-self.window_left:i+self.window_right+2]
-                #testdat[:,k]=list(s)
-                testdat[k]=s
-                k+=1
-                
-        if len(positions)>50000:
-            sys.stderr.write('\n')
-
-        t=StringCharFeatures(DNA)
-        t.set_string_features(testdat)
-
-        self.wd_kernel.init(self.traindat, t)
-        l=self.svm.classify().get_labels()
-        sys.stderr.write("\n...done...\n")
-
-        k=0
-        for s in seqdic:
-            num=len(s.preds[site].positions)
-            scores= num * [0]
-            for j in xrange(num):
-                scores[j]=l[k]
-                k+=1
-            s.preds[site].set_scores(scores)
-
-
-
-    def get_positions_from_seqdict(self, seqdic, site):
-        for d in seqdic:
-            positions=list()
-            sequence=d.seq
-            for cons in self.consensus:
-                l=sequence.find(cons)
-                while l>-1:
-                    #if l<len(sequence)-self.window_right-2 and l>self.window_left:
-                    positions.append(l+self.offset)
-                    l=sequence.find(cons, l+1)
-            positions.sort()
-            d.preds[site].set_positions(positions)
-
-class predictions(object):
-    def __init__(self, positions=None, scores=None):
-        self.positions=positions
-        self.scores=scores
-
-    def set_positions(self, positions):
-        self.positions=positions;
-    def get_positions(self):
-        return self.positions
-
-    def set_scores(self, scores):
-        self.scores=scores
-    def get_scores(self):
-        return self.scores
-
-    def __str__(self):
-        return 'positions: ' + `self.positions` + 'scores: ' + `self.scores`
-    def __repr__(self):
-        return self.__str__()
-
-class sequence(object):
-    def __init__(self, name, seq, (start,end)):
-        assert(start<end<=len(seq))
-        self.start=start
-        self.end=end
-        self.name=name
-        self.seq=seq
-        self.preds=dict()
-        self.preds['acceptor']=predictions()
-        self.preds['donor']=predictions()
-
-    def __str__(self):
-        s="start:" + `self.start` 
-        s+=" end:" + `self.end`
-        s+=" name:" + `self.name`
-        s+=" sequence:" + `self.seq[0:10]`
-        s+="... preds:" + `self.preds`
-        return s
-    def __repr__(self):
-        return self.__str__()
-
-def dict2seqlist(dic):
-    """ takes a fasta dict as input and
-    generates a list of sequence objects from it """
-    sequences=list()
-
-    #translate string to ACGT / all non ACGT letters are mapped to A
-    tab=''
-    for i in xrange(256):
-        if chr(i).upper() in 'ACGT':
-            tab+=chr(i).upper()
-        else:
-            tab+='A'
-
-    for seqname in dic:
-        seq=string.translate(dic[seqname], tab) 
-        seq=seq.upper()
-        #if end<0:
-        #    stop=len(seq)+end
-        #else:
-        #    stop=end
-
-        sequences.append(sequence(seqname, seq, (0,len(seq))))
-        #sequences.append(seq)
-        
-    return sequences
-
-
-
-def example():
-    """a test example"""
-    
-
-    filenames = {'data_dir':'/fml/ag-raetsch/share/projects/palma/param_files/',\
-                 'param':'arabidopsis_ath1_ss=1_il=1.param.bz2',
-                 'dna':'arabi',}
-    parameters = {'use_gc':True,\
-                  'use_splicesites':True,
-                  'save_splicescores':False,}
-    ss = splicesites()
-    ss.load_model(filenames, parameters)    
-
-    ARGS = Param()
-    ginfo_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/qpalma/genome_info.pickle'
-    genome_info = cPickle.load(open(ginfo_filename))
-    
-    (Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos) \
-                = paths_load_data_solexa('training',genome_info,ARGS)
-
-    dna = dict((str(ix),Sequences[ix]) for ix in xrange(len(Sequences)))
-    splicescores = ss.predict_splicescores(filenames, parameters, dna)
-    
-    for exampleIdx in range(len(Sequences)):
-        print str(exampleIdx)
-        (don_score,acc_score) = ss.compute_donacc(splicescores,str(exampleIdx))
-        print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
-        print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
-        
-
-
-if __name__=='__main__':
-    example()