+ renaming business
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 16:49:48 +0000 (16:49 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 16:49:48 +0000 (16:49 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7585 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/numpyUtils.py [deleted file]
scripts/qpalma.py [new file with mode: 0644]
scripts/qpalma_predict.py [deleted file]
scripts/qpalma_train.py [deleted file]

diff --git a/qpalma/numpyUtils.py b/qpalma/numpyUtils.py
deleted file mode 100644 (file)
index 175b385..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import numpy as ny
-import numpy.matlib as nyml
diff --git a/scripts/qpalma.py b/scripts/qpalma.py
new file mode 100644 (file)
index 0000000..9392639
--- /dev/null
@@ -0,0 +1,786 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+# The QPalma project aims at extending the Palma project 
+# to be able to use Solexa reads toegether with their 
+# quality scores.
+# 
+# This file represents the conversion of the main matlab 
+# training loop for Palma to python.
+# 
+# Author: Fabio De Bona
+# 
+###########################################################
+
+import sys
+import subprocess
+import scipy.io
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+import QPalmaDP
+
+import qpalma
+import qpalma.Configuration
+
+from qpalma.SIQP_CPX import SIQPSolver
+from qpalma.DataProc import *
+from qpalma.generateEvaluationData import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.export_param import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf
+
+from qpalma.tools.splicesites import getDonAccScores
+
+class para:
+   pass
+
+class QPalma:
+   """
+   A training method for the QPalma project
+   """
+   
+   def __init__(self):
+      self.ARGS = Param()
+      self.logfh = open('qpalma.log','w+')
+
+      #gen_file= '%s/genome.config' % self.ARGS.basedir
+      #ginfo_filename = 'genome_info.pickle'
+      #self.genome_info = fetch_genome_info(ginfo_filename)
+      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+
+      #self.ARGS.train_with_splicesitescoreinformation = False
+
+      # Load the whole dataset 
+      if Configuration.mode == 'normal':
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         self.use_quality_scores = False
+
+      elif Configuration.mode == 'using_quality_scores':
+
+         filename = 'real_dset_%s'% 'recent'
+         if True:# not os.path.exists(filename):
+            Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+
+            end = 1000
+            self.Sequences   = Sequences[:end]
+            self.Exons       = Exons[:end]
+            self.Ests        = Ests[:end]
+            self.Qualities   = Qualities[:end]
+            self.Acceptors   = Acceptors[:end]
+            self.Donors      = Donors[:end]
+
+            self.Donors, self.Acceptors = getDonAccScores(Sequences)
+            #dset = Dataset()
+         else:
+            dset = cPickle.load(open(filename))
+            Sequences = dset.Sequences
+            Acceptors = dset.Acceptors
+            Donors = dset.Donors
+            Exons =  dset.Exons
+            Ests = dset.Ests
+            Qualities = dset.Qualities
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         #Qualities = []
+         #for i in range(len(Ests)):
+         #   Qualities.append([40]*len(Ests[i]))
+         self.use_quality_scores = True
+      else:
+         assert(False)
+
+      # number of training instances
+      N = len(Sequences) 
+      self.numExamples = N
+      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
+      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+      self.plog('Number of training examples: %d\n'% N)
+      print 'Number of features: %d\n'% Configuration.numFeatures
+
+   def plog(self,string):
+      self.logfh.write(string)
+      self.logfh.flush()
+
+   def train(self):
+      beg = Configuration.training_begin
+      end = Configuration.training_end
+
+      Sequences   = self.Sequences[beg:end]
+      Exons       = self.Exons[beg:end]
+      Ests        = self.Ests[beg:end]
+      Qualities   = self.Qualities[beg:end]
+      Acceptors   = self.Acceptors[beg:end]
+      Donors      = self.Donors[beg:end]
+
+      iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
+      remove_duplicate_scores = Configuration.remove_duplicate_scores 
+      print_matrix            = Configuration.print_matrix 
+      anzpath                 = Configuration.anzpath 
+
+      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
+      param = Configuration.fixedParam 
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      # delete splicesite-score-information
+      #if not self.ARGS.train_with_splicesitescoreinformation:
+      #   for i in range(len(Acceptors)):
+      #      if Acceptors[i] > -20:
+      #         Acceptors[i] = 1
+      #      if Donors[i] >-20:
+      #         Donors[i] = 1
+
+      # Initialize solver 
+      if Configuration.USE_OPT:
+         self.plog('Initializing problem...\n')
+         solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
+
+      # stores the number of alignments done for each example (best path, second-best path etc.)
+      num_path = [anzpath]*self.numExamples
+      # stores the gap for each example
+      gap      = [0.0]*self.numExamples
+
+      #############################################################################################
+      # Training
+      #############################################################################################
+      self.plog('Starting training...\n')
+
+      donSP       = Configuration.numDonSuppPoints
+      accSP       = Configuration.numAccSuppPoints
+      lengthSP    = Configuration.numLengthSuppPoints
+      mmatrixSP   = Configuration.sizeMatchmatrix[0]\
+      *Configuration.sizeMatchmatrix[1]
+      numq        = Configuration.numQualSuppPoints
+      totalQualSP = Configuration.totalQualSuppPoints
+
+      currentPhi = zeros((Configuration.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      iteration_nr = 0
+      param_idx = 0
+      const_added_ctr = 0
+      while True:
+         if iteration_nr == iteration_steps:
+            break
+
+         for exampleIdx in range(self.numExamples):
+            if (exampleIdx%100) == 0:
+               print 'Current example nr %d' % exampleIdx
+
+            dna = Sequences[exampleIdx] 
+            est = Ests[exampleIdx] 
+
+            if Configuration.mode == 'normal':
+               quality = [40]*len(est)
+
+            if Configuration.mode == 'using_quality_scores':
+               quality = Qualities[exampleIdx]
+
+            exons = Exons[exampleIdx] 
+            # NoiseMatrix = Noises[exampleIdx] 
+            don_supp = Donors[exampleIdx] 
+            acc_supp = Acceptors[exampleIdx] 
+
+            # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+            
+            #print 'trueWeights' 
+            # Calculate the weights
+            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+            #pdb.set_trace()
+            if Configuration.mode == 'using_quality_scores':
+               totalQualityPenalties = param[-totalQualSP:]
+               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+            # Calculate w'phi(x,y) the total score of the alignment
+            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+            # The allWeights vector is supposed to store the weight parameter
+            # of the true alignment as well as the weight parameters of the
+            # num_path[exampleIdx] other alignments
+            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
+            allWeights[:,0] = trueWeight[:,0]
+
+            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+            AlignmentScores[0] = trueAlignmentScore
+
+            ################## Calculate wrong alignment(s) ######################
+
+            # Compute donor, acceptor with penalty_lookup_new
+            # returns two double lists
+            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+            #myalign wants the acceptor site on the g of the ag
+            acceptor = acceptor[1:]
+            acceptor.append(-inf)
+
+            # for now we don't use donor/acceptor scores
+
+            #donor = [-inf] * len(donor)
+            #acceptor = [-inf] * len(donor)
+
+            dna = str(dna)
+            est = str(est)
+            dna_len = len(dna)
+            est_len = len(est)
+
+            ps = h.convert2SWIG()
+
+            prb = QPalmaDP.createDoubleArrayFromList(quality)
+            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+            mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
+
+            d_len = len(donor)
+            donor = QPalmaDP.createDoubleArrayFromList(donor)
+            a_len = len(acceptor)
+            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+            # Create the alignment object representing the interface to the C/C++ code.
+            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
+
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+            #print 'Calling myalign...'
+            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
+             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+             print_matrix)
+
+            #print 'After calling myalign...'
+            #print 'Calling getAlignmentResults...'
+
+            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+            c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+            c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
+
+            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+            c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+            #print 'After calling getAlignmentResults...'
+
+            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+            newEstAlign    = zeros((est_len*num_path[exampleIdx],1))
+            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newDPScores    = zeros((num_path[exampleIdx],1))
+            newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
+
+            #print 'newSpliceAlign'
+            for i in range(dna_len*num_path[exampleIdx]):
+               newSpliceAlign[i] = c_SpliceAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'newEstAlign'
+            for i in range(est_len*num_path[exampleIdx]):
+               newEstAlign[i] = c_EstAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'weightMatch'
+            for i in range(mm_len*num_path[exampleIdx]):
+               newWeightMatch[i] = c_WeightMatch[i]
+            #   print '%f' % (weightMatch[i])
+
+            #print 'ViterbiScores'
+            for i in range(num_path[exampleIdx]):
+               newDPScores[i] = c_DPScores[i]
+
+
+            if self.use_quality_scores:
+               for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
+                  newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+            #  equals palma up to here
+
+            #print "Calling destructors"
+            del c_SpliceAlign
+            del c_EstAlign
+            del c_WeightMatch
+            del c_DPScores
+            del c_qualityPlifsFeatures
+            del currentAlignment
+
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+            # Calculate weights of the respective alignments Note that we are
+            # calculating n-best alignments without hamming loss, so we
+            # have to keep track which of the n-best alignments correspond to
+            # the true one in order not to incorporate a true alignment in the
+            # constraints. To keep track of the true and false alignments we
+            # define an array true_map with a boolean indicating the
+            # equivalence to the true alignment for each decoded alignment.
+            true_map = [0]*(num_path[exampleIdx]+1)
+            true_map[0] = 1
+            path_loss   = [0]*(num_path[exampleIdx])
+
+            for pathNr in range(num_path[exampleIdx]):
+               #print 'decodedWeights' 
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+               acc_supp)
+
+               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+               for qidx in range(Configuration.totalQualSuppPoints):
+                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
+
+               #pdb.set_trace()
+
+               path_loss[pathNr] = 0
+               # sum up positionwise loss between alignments
+               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
+                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+                     path_loss[pathNr] += 1
+
+               #pdb.set_trace()
+
+               # Gewichte in restliche Zeilen der Matrix speichern
+               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
+               allWeights[:,pathNr+1] = wp
+
+               hpen = mat(h.penalties).reshape(len(h.penalties),1)
+               dpen = mat(d.penalties).reshape(len(d.penalties),1)
+               apen = mat(a.penalties).reshape(len(a.penalties),1)
+               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+               # Check wether scalar product + loss equals viterbi score
+               #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
+
+               distinct_scores = False
+               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+                  distinct_scores = True
+               
+               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
+               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+                  pdb.set_trace()
+
+               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
+               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+                  true_map[pathNr+1] = 1
+
+               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+
+               # the true label sequence should not have a larger score than the maximal one WHYYYYY?
+               # this means that all n-best paths are to close to each other 
+               # we have to extend the n-best search to a (n+1)-best
+               if len([elem for elem in true_map if elem == 1]) == len(true_map):
+                  num_path[exampleIdx] = num_path[exampleIdx]+1
+
+               # Choose true and first false alignment for extending A
+               firstFalseIdx = -1
+               for map_idx,elem in enumerate(true_map):
+                  if elem == 0:
+                     firstFalseIdx = map_idx
+                     break
+
+               # if there is at least one useful false alignment add the
+               # corresponding constraints to the optimization problem
+               if firstFalseIdx != -1:
+                  trueWeights       = allWeights[:,0]
+                  firstFalseWeights = allWeights[:,firstFalseIdx]
+                  differenceVector  = trueWeights - firstFalseWeights
+                  #pdb.set_trace()
+
+                  if Configuration.USE_OPT:
+                     const_added = solver.addConstraint(differenceVector, exampleIdx)
+                     const_added_ctr += 1
+               #
+               # end of one example processing 
+               #
+
+            # call solver every nth example //added constraint
+            if exampleIdx != 0 and exampleIdx % 100 == 0 and Configuration.USE_OPT:
+               objValue,w,self.slacks = solver.solve()
+      
+               print "objValue is %f" % objValue
+
+               sum_xis = 0
+               for elem in self.slacks:
+                  sum_xis +=  elem
+   
+               for i in range(len(param)):
+                  param[i] = w[i]
+
+               #pdb.set_trace()
+               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+               param_idx += 1
+               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+         #
+         # end of one iteration through all examples
+         #
+         iteration_nr += 1
+
+      #
+      # end of optimization 
+      #  
+      print 'Training completed'
+
+      pa = para()
+      pa.h = h
+      pa.d = d
+      pa.a = a
+      pa.mmatrix = mmatrix
+      pa.qualityPlifs = qualityPlifs
+
+      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+      #cPickle.dump(pa,open('elegans.param','w+'))
+      self.logfh.close()
+
+
+
+   def predict(self):
+      beg = Configuration.prediction_begin
+      end = Configuration.prediction_end
+
+      Sequences   = self.Sequences[beg:end]
+      Exons       = self.Exons[beg:end]
+      Ests        = self.Ests[beg:end]
+      Qualities   = self.Qualities[beg:end]
+      Acceptors   = self.Acceptors[beg:end]
+      Donors      = self.Donors[beg:end]
+
+      remove_duplicate_scores = Conf.remove_duplicate_scores 
+      print_matrix            = Conf.print_matrix 
+      anzpath                 = Conf.anzpath 
+
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
+      param = load_param(param_filename)
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      #############################################################################################
+      # Prediction
+      #############################################################################################
+      self.plog('Starting prediction...\n')
+
+      donSP       = Conf.numDonSuppPoints
+      accSP       = Conf.numAccSuppPoints
+      lengthSP    = Conf.numLengthSuppPoints
+      mmatrixSP   = Conf.sizeMatchmatrix[0]\
+      *Conf.sizeMatchmatrix[1]
+      numq        = Conf.numQualSuppPoints
+      totalQualSP = Conf.totalQualSuppPoints
+
+      currentPhi = zeros((Conf.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      total_up_off = []
+      total_down_off = []
+
+      for exampleIdx in range(self.numExamples):
+         dna = Sequences[exampleIdx] 
+         est = Ests[exampleIdx] 
+         currentSplitPos = SplitPos[exampleIdx]
+
+         if Conf.mode == 'normal':
+            quality = [40]*len(est)
+
+         if Conf.mode == 'using_quality_scores':
+            quality = Qualities[exampleIdx]
+
+         exons = Exons[exampleIdx] 
+         # NoiseMatrix = Noises[exampleIdx] 
+         don_supp = Donors[exampleIdx] 
+         acc_supp = Acceptors[exampleIdx] 
+
+         if exons[-1,1] > len(dna):
+            continue
+
+         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+         
+         # Calculate the weights
+         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+         if Conf.mode == 'using_quality_scores':
+            totalQualityPenalties = param[-totalQualSP:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+         # Calculate w'phi(x,y) the total score of the alignment
+         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+         # The allWeights vector is supposed to store the weight parameter
+         # of the true alignment as well as the weight parameters of the
+         # 1 other alignments
+         allWeights = zeros((Conf.numFeatures,1+1))
+         allWeights[:,0] = trueWeight[:,0]
+
+         AlignmentScores = [0.0]*(1+1)
+         AlignmentScores[0] = trueAlignmentScore
+
+         ################## Calculate wrong alignment(s) ######################
+
+         # Compute donor, acceptor with penalty_lookup_new
+         # returns two double lists
+         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+         #myalign wants the acceptor site on the g of the ag
+         acceptor = acceptor[1:]
+         acceptor.append(-inf)
+
+         dna = str(dna)
+         est = str(est)
+         dna_len = len(dna)
+         est_len = len(est)
+
+         ps = h.convert2SWIG()
+
+         prb = QPalmaDP.createDoubleArrayFromList(quality)
+         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+         d_len = len(donor)
+         donor = QPalmaDP.createDoubleArrayFromList(donor)
+         a_len = len(acceptor)
+         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+         # Create the alignment object representing the interface to the C/C++ code.
+         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+         currentAlignment.myalign(1, dna, dna_len,\
+          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+          print_matrix)
+
+         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+         newSpliceAlign = zeros((dna_len,1))
+         newEstAlign    = zeros((est_len,1))
+         newWeightMatch = zeros((mm_len,1))
+         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+         #print 'newSpliceAlign'
+         for i in range(dna_len):
+            newSpliceAlign[i] = c_SpliceAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'newEstAlign'
+         for i in range(est_len):
+            newEstAlign[i] = c_EstAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'weightMatch'
+         for i in range(mm_len):
+            newWeightMatch[i] = c_WeightMatch[i]
+         #   print '%f' % (weightMatch[i])
+
+         newDPScores = c_DPScores[0]
+
+         for i in range(Conf.totalQualSuppPoints):
+            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+         #print "Calling destructors"
+         del c_SpliceAlign
+         del c_EstAlign
+         del c_WeightMatch
+         del c_DPScores
+         del c_qualityPlifsFeatures
+         del currentAlignment
+
+         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+         newWeightMatch = newWeightMatch.reshape(1,mm_len)
+         # Calculate weights of the respective alignments Note that we are
+         # calculating n-best alignments without hamming loss, so we
+         # have to keep track which of the n-best alignments correspond to
+         # the true one in order not to incorporate a true alignment in the
+         # constraints. To keep track of the true and false alignments we
+         # define an array true_map with a boolean indicating the
+         # equivalence to the true alignment for each decoded alignment.
+         true_map = [0]*2
+         true_map[0] = 1
+         pathNr = 0
+
+         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+         for qidx in range(Conf.totalQualSuppPoints):
+            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+         # Gewichte in restliche Zeilen der Matrix speichern
+         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+         allWeights[:,pathNr+1] = wp
+
+         hpen = mat(h.penalties).reshape(len(h.penalties),1)
+         dpen = mat(d.penalties).reshape(len(d.penalties),1)
+         apen = mat(a.penalties).reshape(len(a.penalties),1)
+         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+         # if the pathNr-best alignment is very close to the true alignment consider it as true
+         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+            true_map[pathNr+1] = 1
+
+         up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         #print up_off,down_off 
+
+         if up_off > -1:
+            total_up_off.append(up_off)
+            total_down_off.append(down_off)
+
+      total_up = 0
+      total_down = 0 
+      for idx in range(len(total_up_off)):
+         total_up    += total_up_off[idx]
+         total_down  += total_down_off[idx]
+         
+      total_up /= len(total_up_off)
+      total_down /= len(total_down_off)
+
+      print 'Mean up_off is %f' % total_up
+      print 'Mean down_off is %f' % total_down
+      print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
+      print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+
+      print 'Prediction completed'
+      self.logfh.close()
+
+def fetch_genome_info(ginfo_filename):
+   if not os.path.exists(ginfo_filename):
+      cmd = ['']*4
+      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+      cmd[3] = 'save genome_info.mat genome_info'  
+      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      assert err == '', 'An error occured!\n%s'%err
+
+      ginfo = scipy.io.loadmat('genome_info.mat')
+      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+      return ginfo['genome_info']
+
+   else:
+      return cPickle.load(open(ginfo_filename))
+
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+   donSP       = Conf.numDonSuppPoints
+   accSP       = Conf.numAccSuppPoints
+   lengthSP    = Conf.numLengthSuppPoints
+   mmatrixSP   = Conf.sizeMatchmatrix[0]\
+   *Conf.sizeMatchmatrix[1]
+
+
+   param = zeros((Conf.numFeatures,1))
+   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+   for idx in range(len(qualityPlifs)):
+      currentPlif       = qualityPlifs[idx]
+      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+   return param
+
+def load_param(filename):
+   param = None 
+   #try:
+   #   para = cPickle.load(open(filename))
+   #except:
+   #   print 'Error: Could not open parameter file!'
+   #   sys.exit(1)
+   #
+   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+   param = cPickle.load(open(filename))
+   return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
+   newExons = []
+   oldElem = -1
+   SpliceAlign = SpliceAlign.flatten().tolist()[0]
+   SpliceAlign.append(-1)
+   for pos,elem in enumerate(SpliceAlign):
+      if pos == 0:
+         oldElem = -1
+      else:
+         oldElem = SpliceAlign[pos-1]
+
+      if oldElem != 0 and elem == 0: # start of exon
+         newExons.append(pos-1)
+
+      if oldElem == 0 and elem != 0: # end of exon
+         newExons.append(pos)
+
+   up_off   = -1
+   down_off = -1
+
+   if len(newExons) != 4:
+      acc = 0.0
+   else:
+      e1_begin,e1_end = newExons[0],newExons[1]
+      e2_begin,e2_end = newExons[2],newExons[3]
+
+      up_off   = int(math.fabs(e1_end - exons[0,1]))
+      down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+   #pdb.set_trace()
+
+   return up_off,down_off
+
+if __name__ == '__main__':
+
+   qpalma = QPalma()
+   if len(sys.argv) != 2:
+      print 'You have to choose between training or prediction mode:'
+      print 'python qpalma. py (train|predict)' 
+
+   mode = sys.argv[1]
+   assert mode in ['train','predict']
+
+   if mode == 'train':
+      qpalma.train()
+      
+   if mode == 'predict':
+      qpalma.predict()
+
diff --git a/scripts/qpalma_predict.py b/scripts/qpalma_predict.py
deleted file mode 100644 (file)
index 67bfd63..0000000
+++ /dev/null
@@ -1,436 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# 
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-
-import qpalma
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-
-from qpalma.generateEvaluationData import *
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.export_param import *
-
-import qpalma.Configuration
-from qpalma.Plif import Plf
-from qpalma.Helpers import *
-
-from qpalma.tools.splicesites import getDonAccScores
-
-def getQualityFeatureCounts(qualityPlifs):
-   weightQuality = qualityPlifs[0].penalties
-   for currentPlif in qualityPlifs[1:]:
-      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
-   return weightQuality 
-
-class QPalma:
-   """
-   A training method for the QPalma project
-   """
-   
-   def __init__(self):
-      self.ARGS = Param()
-      self.logfh = open('qpalma.log','w+')
-
-      #gen_file= '%s/genome.config' % self.ARGS.basedir
-      #ginfo_filename = 'genome_info.pickle'
-      #self.genome_info = fetch_genome_info(ginfo_filename)
-      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
-      #self.ARGS.train_with_splicesitescoreinformation = False
-
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-   def predict(self):
-      # Load the whole dataset 
-      if Configuration.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         use_quality_scores = False
-
-      elif Confguration.mode == 'using_quality_scores':
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
-         paths_load_data_solexa('training',None,self.ARGS,True)
-
-         begin = 0
-         end = 1000
-         Sequences   = Sequences[begin:end]
-         Exons       = Exons[begin:end]
-         Ests        = Ests[begin:end]
-         Qualities   = Qualities[begin:end]
-         Acceptors   = Acceptors[begin:end]
-         Donors      = Donors[begin:end]
-
-         pdb.set_trace()
-
-         Donors, Acceptors = getDonAccScores(Sequences)
-
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
-         use_quality_scores = True
-      else:
-         assert(False)
-
-      # number of training instances
-      N = len(Sequences) 
-      self.numExamples = N
-      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
-      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
-      self.plog('Number of training examples: %d\n'% N)
-      print 'Number of features: %d\n'% Conf.numFeatures
-
-      remove_duplicate_scores = Conf.remove_duplicate_scores 
-      print_matrix            = Conf.print_matrix 
-      anzpath                 = Conf.anzpath 
-
-      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
-      param = load_param(param_filename)
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*N 
-
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
-
-      donSP       = Conf.numDonSuppPoints
-      accSP       = Conf.numAccSuppPoints
-      lengthSP    = Conf.numLengthSuppPoints
-      mmatrixSP   = Conf.sizeMatchmatrix[0]\
-      *Conf.sizeMatchmatrix[1]
-      numq        = Conf.numQualSuppPoints
-      totalQualSP = Conf.totalQualSuppPoints
-
-      currentPhi = zeros((Conf.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      total_up_off = []
-      total_down_off = []
-
-      #for exampleIdx in range(self.numExamples):
-      for exampleIdx in range(200):
-
-         dna = Sequences[exampleIdx] 
-         est = Ests[exampleIdx] 
-         currentSplitPos = SplitPos[exampleIdx]
-
-         if Conf.mode == 'normal':
-            quality = [40]*len(est)
-
-         if Conf.mode == 'using_quality_scores':
-            quality = Qualities[exampleIdx]
-
-         exons = Exons[exampleIdx] 
-         # NoiseMatrix = Noises[exampleIdx] 
-         don_supp = Donors[exampleIdx] 
-         acc_supp = Acceptors[exampleIdx] 
-
-         if exons[-1,1] > len(dna):
-            continue
-
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-         
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-         if Conf.mode == 'using_quality_scores':
-            totalQualityPenalties = param[-totalQualSP:]
-            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-         # Calculate w'phi(x,y) the total score of the alignment
-         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-         # The allWeights vector is supposed to store the weight parameter
-         # of the true alignment as well as the weight parameters of the
-         # 1 other alignments
-         allWeights = zeros((Conf.numFeatures,1+1))
-         allWeights[:,0] = trueWeight[:,0]
-
-         AlignmentScores = [0.0]*(1+1)
-         AlignmentScores[0] = trueAlignmentScore
-
-         ################## Calculate wrong alignment(s) ######################
-
-         # Compute donor, acceptor with penalty_lookup_new
-         # returns two double lists
-         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-         #myalign wants the acceptor site on the g of the ag
-         acceptor = acceptor[1:]
-         acceptor.append(-inf)
-
-         # for now we don't use donor/acceptor scores
-         #donor = [-inf] * len(donor)
-         #acceptor = [-inf] * len(donor)
-
-         dna = str(dna)
-         est = str(est)
-         dna_len = len(dna)
-         est_len = len(est)
-
-         ps = h.convert2SWIG()
-
-         prb = QPalmaDP.createDoubleArrayFromList(quality)
-         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
-         d_len = len(donor)
-         donor = QPalmaDP.createDoubleArrayFromList(donor)
-         a_len = len(acceptor)
-         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-         # Create the alignment object representing the interface to the C/C++ code.
-         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
-
-         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
-         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-         currentAlignment.myalign(1, dna, dna_len,\
-          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-          print_matrix)
-
-         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
-         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
-         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
-         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
-         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
-         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-         newSpliceAlign = zeros((dna_len,1))
-         newEstAlign    = zeros((est_len,1))
-         newWeightMatch = zeros((mm_len,1))
-         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
-         #print 'newSpliceAlign'
-         for i in range(dna_len):
-            newSpliceAlign[i] = c_SpliceAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'newEstAlign'
-         for i in range(est_len):
-            newEstAlign[i] = c_EstAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'weightMatch'
-         for i in range(mm_len):
-            newWeightMatch[i] = c_WeightMatch[i]
-         #   print '%f' % (weightMatch[i])
-
-         newDPScores = c_DPScores[0]
-
-         for i in range(Conf.totalQualSuppPoints):
-            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-         #  equals palma up to here
-
-         #print "Calling destructors"
-         del c_SpliceAlign
-         del c_EstAlign
-         del c_WeightMatch
-         del c_DPScores
-         del c_qualityPlifsFeatures
-         del currentAlignment
-
-         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
-         newWeightMatch = newWeightMatch.reshape(1,mm_len)
-         # Calculate weights of the respective alignments Note that we are
-         # calculating n-best alignments without hamming loss, so we
-         # have to keep track which of the n-best alignments correspond to
-         # the true one in order not to incorporate a true alignment in the
-         # constraints. To keep track of the true and false alignments we
-         # define an array true_map with a boolean indicating the
-         # equivalence to the true alignment for each decoded alignment.
-         true_map = [0]*2
-         true_map[0] = 1
-         pathNr = 0
-
-         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
-         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-         for qidx in range(Conf.totalQualSuppPoints):
-            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
-         # Gewichte in restliche Zeilen der Matrix speichern
-         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
-         allWeights[:,pathNr+1] = wp
-
-         hpen = mat(h.penalties).reshape(len(h.penalties),1)
-         dpen = mat(d.penalties).reshape(len(d.penalties),1)
-         apen = mat(a.penalties).reshape(len(a.penalties),1)
-         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
-         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-         # Check wether scalar product + loss equals viterbi score
-         print '%f vs. %f' % (newDPScores, AlignmentScores[pathNr+1])
-
-         # if the pathNr-best alignment is very close to the true alignment consider it as true
-         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-            true_map[pathNr+1] = 1
-
-         #pdb.set_trace()
-      
-         up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #print up_off,down_off 
-
-         if up_off > -1:
-            total_up_off.append(up_off)
-            total_down_off.append(down_off)
-
-      total_up = 0
-      total_down = 0 
-      for idx in range(len(total_up_off)):
-         total_up    += total_up_off[idx]
-         total_down  += total_down_off[idx]
-         
-      total_up /= len(total_up_off)
-      total_down /= len(total_down_off)
-
-      print 'Mean up_off is %f' % total_up
-      print 'Mean down_off is %f' % total_down
-      print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
-      print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
-      #print total_up_off
-      #print total_down_off
-
-      print 'Prediction completed'
-      self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
-   if not os.path.exists(ginfo_filename):
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
-      return ginfo['genome_info']
-
-   else:
-      return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-
-
-   param = zeros((Conf.numFeatures,1))
-   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-   for idx in range(len(qualityPlifs)):
-      currentPlif       = qualityPlifs[idx]
-      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
-      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
-   return param
-
-def load_param(filename):
-   param = None 
-   #try:
-   #   para = cPickle.load(open(filename))
-   #except:
-   #   print 'Error: Could not open parameter file!'
-   #   sys.exit(1)
-   #
-   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
-   param = cPickle.load(open(filename))
-   return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
-   newExons = []
-   oldElem = -1
-   SpliceAlign = SpliceAlign.flatten().tolist()[0]
-   SpliceAlign.append(-1)
-   for pos,elem in enumerate(SpliceAlign):
-      if pos == 0:
-         oldElem = -1
-      else:
-         oldElem = SpliceAlign[pos-1]
-
-      if oldElem != 0 and elem == 0: # start of exon
-         newExons.append(pos-1)
-
-      if oldElem == 0 and elem != 0: # end of exon
-         newExons.append(pos)
-
-   up_off   = -1
-   down_off = -1
-
-   if len(newExons) != 4:
-      acc = 0.0
-   else:
-      e1_begin,e1_end = newExons[0],newExons[1]
-      e2_begin,e2_end = newExons[2],newExons[3]
-
-      up_off   = int(math.fabs(e1_end - exons[0,1]))
-      down_off = int(math.fabs(e2_begin - exons[1,0]))
-
-   #pdb.set_trace()
-
-   return up_off,down_off
-
-if __name__ == '__main__':
-   qpalma = QPalma()
-   qpalma.run()
diff --git a/scripts/qpalma_train.py b/scripts/qpalma_train.py
deleted file mode 100644 (file)
index 9392639..0000000
+++ /dev/null
@@ -1,786 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# The QPalma project aims at extending the Palma project 
-# to be able to use Solexa reads toegether with their 
-# quality scores.
-# 
-# This file represents the conversion of the main matlab 
-# training loop for Palma to python.
-# 
-# Author: Fabio De Bona
-# 
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-
-import qpalma
-import qpalma.Configuration
-
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-from qpalma.generateEvaluationData import *
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.export_param import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.tools.splicesites import getDonAccScores
-
-class para:
-   pass
-
-class QPalma:
-   """
-   A training method for the QPalma project
-   """
-   
-   def __init__(self):
-      self.ARGS = Param()
-      self.logfh = open('qpalma.log','w+')
-
-      #gen_file= '%s/genome.config' % self.ARGS.basedir
-      #ginfo_filename = 'genome_info.pickle'
-      #self.genome_info = fetch_genome_info(ginfo_filename)
-      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
-      #self.ARGS.train_with_splicesitescoreinformation = False
-
-      # Load the whole dataset 
-      if Configuration.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         self.use_quality_scores = False
-
-      elif Configuration.mode == 'using_quality_scores':
-
-         filename = 'real_dset_%s'% 'recent'
-         if True:# not os.path.exists(filename):
-            Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
-
-            end = 1000
-            self.Sequences   = Sequences[:end]
-            self.Exons       = Exons[:end]
-            self.Ests        = Ests[:end]
-            self.Qualities   = Qualities[:end]
-            self.Acceptors   = Acceptors[:end]
-            self.Donors      = Donors[:end]
-
-            self.Donors, self.Acceptors = getDonAccScores(Sequences)
-            #dset = Dataset()
-         else:
-            dset = cPickle.load(open(filename))
-            Sequences = dset.Sequences
-            Acceptors = dset.Acceptors
-            Donors = dset.Donors
-            Exons =  dset.Exons
-            Ests = dset.Ests
-            Qualities = dset.Qualities
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
-      # number of training instances
-      N = len(Sequences) 
-      self.numExamples = N
-      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
-      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
-      self.plog('Number of training examples: %d\n'% N)
-      print 'Number of features: %d\n'% Configuration.numFeatures
-
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-   def train(self):
-      beg = Configuration.training_begin
-      end = Configuration.training_end
-
-      Sequences   = self.Sequences[beg:end]
-      Exons       = self.Exons[beg:end]
-      Ests        = self.Ests[beg:end]
-      Qualities   = self.Qualities[beg:end]
-      Acceptors   = self.Acceptors[beg:end]
-      Donors      = self.Donors[beg:end]
-
-      iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
-      remove_duplicate_scores = Configuration.remove_duplicate_scores 
-      print_matrix            = Configuration.print_matrix 
-      anzpath                 = Configuration.anzpath 
-
-      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      param = Configuration.fixedParam 
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      # delete splicesite-score-information
-      #if not self.ARGS.train_with_splicesitescoreinformation:
-      #   for i in range(len(Acceptors)):
-      #      if Acceptors[i] > -20:
-      #         Acceptors[i] = 1
-      #      if Donors[i] >-20:
-      #         Donors[i] = 1
-
-      # Initialize solver 
-      if Configuration.USE_OPT:
-         self.plog('Initializing problem...\n')
-         solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
-
-      # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*self.numExamples
-      # stores the gap for each example
-      gap      = [0.0]*self.numExamples
-
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
-
-      donSP       = Configuration.numDonSuppPoints
-      accSP       = Configuration.numAccSuppPoints
-      lengthSP    = Configuration.numLengthSuppPoints
-      mmatrixSP   = Configuration.sizeMatchmatrix[0]\
-      *Configuration.sizeMatchmatrix[1]
-      numq        = Configuration.numQualSuppPoints
-      totalQualSP = Configuration.totalQualSuppPoints
-
-      currentPhi = zeros((Configuration.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      iteration_nr = 0
-      param_idx = 0
-      const_added_ctr = 0
-      while True:
-         if iteration_nr == iteration_steps:
-            break
-
-         for exampleIdx in range(self.numExamples):
-            if (exampleIdx%100) == 0:
-               print 'Current example nr %d' % exampleIdx
-
-            dna = Sequences[exampleIdx] 
-            est = Ests[exampleIdx] 
-
-            if Configuration.mode == 'normal':
-               quality = [40]*len(est)
-
-            if Configuration.mode == 'using_quality_scores':
-               quality = Qualities[exampleIdx]
-
-            exons = Exons[exampleIdx] 
-            # NoiseMatrix = Noises[exampleIdx] 
-            don_supp = Donors[exampleIdx] 
-            acc_supp = Acceptors[exampleIdx] 
-
-            # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-            
-            #print 'trueWeights' 
-            # Calculate the weights
-            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-            #pdb.set_trace()
-            if Configuration.mode == 'using_quality_scores':
-               totalQualityPenalties = param[-totalQualSP:]
-               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-            # Calculate w'phi(x,y) the total score of the alignment
-            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-            # The allWeights vector is supposed to store the weight parameter
-            # of the true alignment as well as the weight parameters of the
-            # num_path[exampleIdx] other alignments
-            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
-            allWeights[:,0] = trueWeight[:,0]
-
-            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
-            AlignmentScores[0] = trueAlignmentScore
-
-            ################## Calculate wrong alignment(s) ######################
-
-            # Compute donor, acceptor with penalty_lookup_new
-            # returns two double lists
-            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-            #myalign wants the acceptor site on the g of the ag
-            acceptor = acceptor[1:]
-            acceptor.append(-inf)
-
-            # for now we don't use donor/acceptor scores
-
-            #donor = [-inf] * len(donor)
-            #acceptor = [-inf] * len(donor)
-
-            dna = str(dna)
-            est = str(est)
-            dna_len = len(dna)
-            est_len = len(est)
-
-            ps = h.convert2SWIG()
-
-            prb = QPalmaDP.createDoubleArrayFromList(quality)
-            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-            mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
-
-            d_len = len(donor)
-            donor = QPalmaDP.createDoubleArrayFromList(donor)
-            a_len = len(acceptor)
-            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-            # Create the alignment object representing the interface to the C/C++ code.
-            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
-
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-            #print 'Calling myalign...'
-            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
-             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-             print_matrix)
-
-            #print 'After calling myalign...'
-            #print 'Calling getAlignmentResults...'
-
-            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
-            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
-            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
-            c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
-
-            c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
-
-            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-            c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-            #print 'After calling getAlignmentResults...'
-
-            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
-            newEstAlign    = zeros((est_len*num_path[exampleIdx],1))
-            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
-            newDPScores    = zeros((num_path[exampleIdx],1))
-            newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
-
-            #print 'newSpliceAlign'
-            for i in range(dna_len*num_path[exampleIdx]):
-               newSpliceAlign[i] = c_SpliceAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'newEstAlign'
-            for i in range(est_len*num_path[exampleIdx]):
-               newEstAlign[i] = c_EstAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'weightMatch'
-            for i in range(mm_len*num_path[exampleIdx]):
-               newWeightMatch[i] = c_WeightMatch[i]
-            #   print '%f' % (weightMatch[i])
-
-            #print 'ViterbiScores'
-            for i in range(num_path[exampleIdx]):
-               newDPScores[i] = c_DPScores[i]
-
-
-            if self.use_quality_scores:
-               for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
-                  newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-            #  equals palma up to here
-
-            #print "Calling destructors"
-            del c_SpliceAlign
-            del c_EstAlign
-            del c_WeightMatch
-            del c_DPScores
-            del c_qualityPlifsFeatures
-            del currentAlignment
-
-            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
-            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
-            # Calculate weights of the respective alignments Note that we are
-            # calculating n-best alignments without hamming loss, so we
-            # have to keep track which of the n-best alignments correspond to
-            # the true one in order not to incorporate a true alignment in the
-            # constraints. To keep track of the true and false alignments we
-            # define an array true_map with a boolean indicating the
-            # equivalence to the true alignment for each decoded alignment.
-            true_map = [0]*(num_path[exampleIdx]+1)
-            true_map[0] = 1
-            path_loss   = [0]*(num_path[exampleIdx])
-
-            for pathNr in range(num_path[exampleIdx]):
-               #print 'decodedWeights' 
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
-               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
-               acc_supp)
-
-               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
-               for qidx in range(Configuration.totalQualSuppPoints):
-                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
-
-               #pdb.set_trace()
-
-               path_loss[pathNr] = 0
-               # sum up positionwise loss between alignments
-               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
-                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
-                     path_loss[pathNr] += 1
-
-               #pdb.set_trace()
-
-               # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
-               allWeights[:,pathNr+1] = wp
-
-               hpen = mat(h.penalties).reshape(len(h.penalties),1)
-               dpen = mat(d.penalties).reshape(len(d.penalties),1)
-               apen = mat(a.penalties).reshape(len(a.penalties),1)
-               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
-               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-               # Check wether scalar product + loss equals viterbi score
-               #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
-
-               distinct_scores = False
-               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
-                  distinct_scores = True
-               
-               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
-               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
-                  pdb.set_trace()
-
-               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
-               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-                  true_map[pathNr+1] = 1
-
-               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
-
-               # the true label sequence should not have a larger score than the maximal one WHYYYYY?
-               # this means that all n-best paths are to close to each other 
-               # we have to extend the n-best search to a (n+1)-best
-               if len([elem for elem in true_map if elem == 1]) == len(true_map):
-                  num_path[exampleIdx] = num_path[exampleIdx]+1
-
-               # Choose true and first false alignment for extending A
-               firstFalseIdx = -1
-               for map_idx,elem in enumerate(true_map):
-                  if elem == 0:
-                     firstFalseIdx = map_idx
-                     break
-
-               # if there is at least one useful false alignment add the
-               # corresponding constraints to the optimization problem
-               if firstFalseIdx != -1:
-                  trueWeights       = allWeights[:,0]
-                  firstFalseWeights = allWeights[:,firstFalseIdx]
-                  differenceVector  = trueWeights - firstFalseWeights
-                  #pdb.set_trace()
-
-                  if Configuration.USE_OPT:
-                     const_added = solver.addConstraint(differenceVector, exampleIdx)
-                     const_added_ctr += 1
-               #
-               # end of one example processing 
-               #
-
-            # call solver every nth example //added constraint
-            if exampleIdx != 0 and exampleIdx % 100 == 0 and Configuration.USE_OPT:
-               objValue,w,self.slacks = solver.solve()
-      
-               print "objValue is %f" % objValue
-
-               sum_xis = 0
-               for elem in self.slacks:
-                  sum_xis +=  elem
-   
-               for i in range(len(param)):
-                  param[i] = w[i]
-
-               #pdb.set_trace()
-               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-               param_idx += 1
-               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-         #
-         # end of one iteration through all examples
-         #
-         iteration_nr += 1
-
-      #
-      # end of optimization 
-      #  
-      print 'Training completed'
-
-      pa = para()
-      pa.h = h
-      pa.d = d
-      pa.a = a
-      pa.mmatrix = mmatrix
-      pa.qualityPlifs = qualityPlifs
-
-      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-      #cPickle.dump(pa,open('elegans.param','w+'))
-      self.logfh.close()
-
-
-
-   def predict(self):
-      beg = Configuration.prediction_begin
-      end = Configuration.prediction_end
-
-      Sequences   = self.Sequences[beg:end]
-      Exons       = self.Exons[beg:end]
-      Ests        = self.Ests[beg:end]
-      Qualities   = self.Qualities[beg:end]
-      Acceptors   = self.Acceptors[beg:end]
-      Donors      = self.Donors[beg:end]
-
-      remove_duplicate_scores = Conf.remove_duplicate_scores 
-      print_matrix            = Conf.print_matrix 
-      anzpath                 = Conf.anzpath 
-
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
-      param = load_param(param_filename)
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      #############################################################################################
-      # Prediction
-      #############################################################################################
-      self.plog('Starting prediction...\n')
-
-      donSP       = Conf.numDonSuppPoints
-      accSP       = Conf.numAccSuppPoints
-      lengthSP    = Conf.numLengthSuppPoints
-      mmatrixSP   = Conf.sizeMatchmatrix[0]\
-      *Conf.sizeMatchmatrix[1]
-      numq        = Conf.numQualSuppPoints
-      totalQualSP = Conf.totalQualSuppPoints
-
-      currentPhi = zeros((Conf.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      total_up_off = []
-      total_down_off = []
-
-      for exampleIdx in range(self.numExamples):
-         dna = Sequences[exampleIdx] 
-         est = Ests[exampleIdx] 
-         currentSplitPos = SplitPos[exampleIdx]
-
-         if Conf.mode == 'normal':
-            quality = [40]*len(est)
-
-         if Conf.mode == 'using_quality_scores':
-            quality = Qualities[exampleIdx]
-
-         exons = Exons[exampleIdx] 
-         # NoiseMatrix = Noises[exampleIdx] 
-         don_supp = Donors[exampleIdx] 
-         acc_supp = Acceptors[exampleIdx] 
-
-         if exons[-1,1] > len(dna):
-            continue
-
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-         
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-         if Conf.mode == 'using_quality_scores':
-            totalQualityPenalties = param[-totalQualSP:]
-            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-         # Calculate w'phi(x,y) the total score of the alignment
-         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-         # The allWeights vector is supposed to store the weight parameter
-         # of the true alignment as well as the weight parameters of the
-         # 1 other alignments
-         allWeights = zeros((Conf.numFeatures,1+1))
-         allWeights[:,0] = trueWeight[:,0]
-
-         AlignmentScores = [0.0]*(1+1)
-         AlignmentScores[0] = trueAlignmentScore
-
-         ################## Calculate wrong alignment(s) ######################
-
-         # Compute donor, acceptor with penalty_lookup_new
-         # returns two double lists
-         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-         #myalign wants the acceptor site on the g of the ag
-         acceptor = acceptor[1:]
-         acceptor.append(-inf)
-
-         dna = str(dna)
-         est = str(est)
-         dna_len = len(dna)
-         est_len = len(est)
-
-         ps = h.convert2SWIG()
-
-         prb = QPalmaDP.createDoubleArrayFromList(quality)
-         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
-         d_len = len(donor)
-         donor = QPalmaDP.createDoubleArrayFromList(donor)
-         a_len = len(acceptor)
-         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-         # Create the alignment object representing the interface to the C/C++ code.
-         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
-         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
-         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-         currentAlignment.myalign(1, dna, dna_len,\
-          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-          print_matrix)
-
-         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
-         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
-         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
-         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
-         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
-         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-         newSpliceAlign = zeros((dna_len,1))
-         newEstAlign    = zeros((est_len,1))
-         newWeightMatch = zeros((mm_len,1))
-         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
-         #print 'newSpliceAlign'
-         for i in range(dna_len):
-            newSpliceAlign[i] = c_SpliceAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'newEstAlign'
-         for i in range(est_len):
-            newEstAlign[i] = c_EstAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'weightMatch'
-         for i in range(mm_len):
-            newWeightMatch[i] = c_WeightMatch[i]
-         #   print '%f' % (weightMatch[i])
-
-         newDPScores = c_DPScores[0]
-
-         for i in range(Conf.totalQualSuppPoints):
-            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-         #print "Calling destructors"
-         del c_SpliceAlign
-         del c_EstAlign
-         del c_WeightMatch
-         del c_DPScores
-         del c_qualityPlifsFeatures
-         del currentAlignment
-
-         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
-         newWeightMatch = newWeightMatch.reshape(1,mm_len)
-         # Calculate weights of the respective alignments Note that we are
-         # calculating n-best alignments without hamming loss, so we
-         # have to keep track which of the n-best alignments correspond to
-         # the true one in order not to incorporate a true alignment in the
-         # constraints. To keep track of the true and false alignments we
-         # define an array true_map with a boolean indicating the
-         # equivalence to the true alignment for each decoded alignment.
-         true_map = [0]*2
-         true_map[0] = 1
-         pathNr = 0
-
-         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
-         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-         for qidx in range(Conf.totalQualSuppPoints):
-            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
-         # Gewichte in restliche Zeilen der Matrix speichern
-         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
-         allWeights[:,pathNr+1] = wp
-
-         hpen = mat(h.penalties).reshape(len(h.penalties),1)
-         dpen = mat(d.penalties).reshape(len(d.penalties),1)
-         apen = mat(a.penalties).reshape(len(a.penalties),1)
-         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-         # if the pathNr-best alignment is very close to the true alignment consider it as true
-         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-            true_map[pathNr+1] = 1
-
-         up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #print up_off,down_off 
-
-         if up_off > -1:
-            total_up_off.append(up_off)
-            total_down_off.append(down_off)
-
-      total_up = 0
-      total_down = 0 
-      for idx in range(len(total_up_off)):
-         total_up    += total_up_off[idx]
-         total_down  += total_down_off[idx]
-         
-      total_up /= len(total_up_off)
-      total_down /= len(total_down_off)
-
-      print 'Mean up_off is %f' % total_up
-      print 'Mean down_off is %f' % total_down
-      print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
-      print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
-
-      print 'Prediction completed'
-      self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
-   if not os.path.exists(ginfo_filename):
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
-      return ginfo['genome_info']
-
-   else:
-      return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-
-
-   param = zeros((Conf.numFeatures,1))
-   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-   for idx in range(len(qualityPlifs)):
-      currentPlif       = qualityPlifs[idx]
-      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
-      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
-   return param
-
-def load_param(filename):
-   param = None 
-   #try:
-   #   para = cPickle.load(open(filename))
-   #except:
-   #   print 'Error: Could not open parameter file!'
-   #   sys.exit(1)
-   #
-   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
-   param = cPickle.load(open(filename))
-   return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
-   newExons = []
-   oldElem = -1
-   SpliceAlign = SpliceAlign.flatten().tolist()[0]
-   SpliceAlign.append(-1)
-   for pos,elem in enumerate(SpliceAlign):
-      if pos == 0:
-         oldElem = -1
-      else:
-         oldElem = SpliceAlign[pos-1]
-
-      if oldElem != 0 and elem == 0: # start of exon
-         newExons.append(pos-1)
-
-      if oldElem == 0 and elem != 0: # end of exon
-         newExons.append(pos)
-
-   up_off   = -1
-   down_off = -1
-
-   if len(newExons) != 4:
-      acc = 0.0
-   else:
-      e1_begin,e1_end = newExons[0],newExons[1]
-      e2_begin,e2_end = newExons[2],newExons[3]
-
-      up_off   = int(math.fabs(e1_end - exons[0,1]))
-      down_off = int(math.fabs(e2_begin - exons[1,0]))
-
-   #pdb.set_trace()
-
-   return up_off,down_off
-
-if __name__ == '__main__':
-
-   qpalma = QPalma()
-   if len(sys.argv) != 2:
-      print 'You have to choose between training or prediction mode:'
-      print 'python qpalma. py (train|predict)' 
-
-   mode = sys.argv[1]
-   assert mode in ['train','predict']
-
-   if mode == 'train':
-      qpalma.train()
-      
-   if mode == 'predict':
-      qpalma.predict()
-