+ added some text and references to the documentation
[qpalma.git] / scripts / qpalma_main.py
index ed94ab1..e70e6d2 100644 (file)
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-###########################################################
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
 #
-# The QPalma project aims at extending the Palma project 
-# to be able to use Solexa reads toegether with their 
-# quality scores.
-# 
-# This file represents the conversion of the main matlab 
-# training loop for Palma to python.
-# 
-# Author: Fabio De Bona
-# 
-###########################################################
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
 
-import sys
-import subprocess
-import scipy.io
-import pdb
+import array
+import cPickle
 import os.path
+import pdb
+import sys
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
 
+import numpy
 from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
 
+#from qpalma.SIQP_CPX import SIQPSolver
+#from qpalma.SIQP_CVXOPT import SIQPSolver
+
 import QPalmaDP
 import qpalma
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-from qpalma.generateEvaluationData import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.export_param import *
 from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
+from qpalma.Plif import Plf,compute_donacc
 
-from qpalma.tools.splicesites import getDonAccScores
-from qpalma.Configuration import *
+from Utils import pprint_alignment, get_alignment
 
-class para:
+class SpliceSiteException:
    pass
 
+
+def getData(training_set,exampleKey,run):
+   """ This function...  """
+
+   currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
+   id,chr,strand,up_cut,down_cut = currentSeqInfo
+
+   est = original_est
+   est = "".join(est)
+   est = est.lower()
+   est = unbracket_est(est)
+   est = est.replace('-','')
+
+   assert len(est) == run['read_size'], pdb.set_trace()
+   est_len = len(est)
+
+   #original_est = OriginalEsts[exampleIdx]
+   original_est = "".join(original_est)
+   original_est = original_est.lower()
+
+   dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
+
+   #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+
+   original_exons = currentExons
+   exons = original_exons - (up_cut-1)
+   exons[0,0] -= 1
+   exons[1,0] -= 1
+
+   if exons.shape == (2,2):
+      fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+     
+      donor_elem = dna[exons[0,1]:exons[0,1]+2]
+      acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+
+      if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
+         print 'invalid donor in example %d'% exampleKey
+         raise SpliceSiteException
+
+      if not ( acceptor_elem == 'ag' ):
+         print 'invalid acceptor in example %d'% exampleKey
+         raise SpliceSiteException
+
+      assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
+
+   return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
+
+
 class QPalma:
    """
-   A training method for the QPalma project
+   This class wraps the training and prediction functions for 
+   the alignment.
    """
    
-   def __init__(self):
+   def __init__(self,run,seqInfo,dmode=False):
       self.ARGS = Param()
+      self.qpalma_debug_mode = dmode
+      self.run = run
+      self.seqInfo = seqInfo
 
-      #gen_file= '%s/genome.config' % self.ARGS.basedir
-      #ginfo_filename = 'genome_info.pickle'
-      #self.genome_info = fetch_genome_info(ginfo_filename)
-      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-      #self.ARGS.train_with_splicesitescoreinformation = False
 
-      # Load the whole dataset 
-      if Conf.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         self.use_quality_scores = False
+   def plog(self,string):
+      self.logfh.write(string)
+      self.logfh.flush()
+
+
+   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
+      """
+      Given the needed input this method calls the QPalma C module which
+      calculates a dynamic programming in order to obtain an alignment
+      """
+
+      dna_len = len(dna)
+      est_len = len(est)
 
-      elif Conf.mode == 'using_quality_scores':
+      prb = QPalmaDP.createDoubleArrayFromList(quality)
+      chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
 
-         filename = 'real_dset_%s'% 'recent'
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+      matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+      mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
 
-         #end = 1000
-         self.Sequences   = Sequences
-         self.Exons       = Exons
-         self.Ests        = Ests
-         self.Qualities   = Qualities
-         self.SplitPos    = SplitPos
+      d_len = len(donor)
+      donor = QPalmaDP.createDoubleArrayFromList(donor)
+      a_len = len(acceptor)
+      acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
-         self.Donors, self.Acceptors = getDonAccScores(Sequences)
+      # Create the alignment object representing the interface to the C/C++ code.
+      currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
+      c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+      # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+      currentAlignment.myalign( current_num_path, dna, dna_len,\
+       est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+       acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
+       self.run['print_matrix'] )
 
-         #self.Acceptors   = self.Acceptors[:end]
-         #self.Donors      = self.Donors[:end]
+      if prediction_mode:
+         # part that is only needed for prediction
+         result_len = currentAlignment.getResultLength()
+         dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+      else:
+         dna_array = None
+         est_array = None
+
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResultsNew()
+
+      return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+      newQualityPlifsFeatures, dna_array, est_array
+
+
+   def init_train(self,training_set):
+      full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
 
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
+      #assert not os.path.exists(full_working_path)
+      if not os.path.exists(full_working_path):
+         os.mkdir(full_working_path)
+
+      assert os.path.exists(full_working_path)
+
+      # ATTENTION: Changing working directory
+      os.chdir(full_working_path)
+
+      self.logfh = open('_qpalma_train.log','w+')
+      cPickle.dump(self.run,open('run_obj.pickle','w+'))
+
+      self.plog("Settings are:\n")
+      self.plog("%s\n"%str(self.run))
+
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
          self.use_quality_scores = True
       else:
          assert(False)
 
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
 
-   def train(self):
-      self.logfh = open('_qpalma_train.log','w+')
+   def setUpSolver(self):
+      # Initialize solver 
+      self.plog('Initializing problem...\n')
+      
+      try:
+         solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
+      except:
+         self.plog('Got no license. Telling queue to reschedule job...\n')
+         sys.exit(99)
+
+      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+
+      return solver
 
-      beg = Conf.training_begin
-      end = Conf.training_end
-
-      Sequences   = self.Sequences[beg:end]
-      Exons       = self.Exons[beg:end]
-      Ests        = self.Ests[beg:end]
-      Qualities   = self.Qualities[beg:end]
-      Acceptors   = self.Acceptors[beg:end]
-      Donors      = self.Donors[beg:end]
-
-      # number of training instances
-      N = numExamples = len(Sequences) 
-      assert len(Exons) == N and len(Ests) == N\
-      and len(Qualities) == N and len(Acceptors) == N\
-      and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
+
+   def train(self,training_set):
+      numExamples = len(training_set)
       self.plog('Number of training examples: %d\n'% numExamples)
 
       self.noImprovementCtr = 0
       self.oldObjValue = 1e8
 
-      iteration_steps         = Conf.iter_steps ; #upper bound on iteration steps
-      remove_duplicate_scores = Conf.remove_duplicate_scores 
-      print_matrix            = Conf.print_matrix 
-      anzpath                 = Conf.anzpath 
+      iteration_steps         = self.run['iter_steps']
+      remove_duplicate_scores = self.run['remove_duplicate_scores']
+      print_matrix            = self.run['print_matrix']
+      anzpath                 = self.run['anzpath']
 
-      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      param = Conf.fixedParam 
+      # Initialize parameter vector
+      param = numpy.matlib.rand(run['numFeatures'],1)
+   
+      lengthSP    = self.run['numLengthSuppPoints']
+      donSP       = self.run['numDonSuppPoints']
+      accSP       = self.run['numAccSuppPoints']
+      mmatrixSP   = self.run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = self.run['numQualSuppPoints']
+      totalQualSP = self.run['totalQualSuppPoints']
 
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+      # no intron length model
+      if not self.run['enable_intron_length']:
+         param[:lengthSP] *= 0.0
 
-      # delete splicesite-score-information
-      #if not self.ARGS.train_with_splicesitescoreinformation:
-      #   for i in range(len(Acceptors)):
-      #      if Acceptors[i] > -20:
-      #         Acceptors[i] = 1
-      #      if Donors[i] >-20:
-      #         Donors[i] = 1
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
 
-      # Initialize solver 
-      if Conf.USE_OPT:
-         self.plog('Initializing problem...\n')
-         solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
+      solver = self.setUpSolver()
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*numExamples
+
       # stores the gap for each example
       gap      = [0.0]*numExamples
 
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
-
-      donSP       = Conf.numDonSuppPoints
-      accSP       = Conf.numAccSuppPoints
-      lengthSP    = Conf.numLengthSuppPoints
-      mmatrixSP   = Conf.sizeMatchmatrix[0]\
-      *Conf.sizeMatchmatrix[1]
-      numq        = Conf.numQualSuppPoints
-      totalQualSP = Conf.totalQualSuppPoints
-
-      currentPhi = zeros((Conf.numFeatures,1))
+      currentPhi = zeros((run['numFeatures'],1))
       totalQualityPenalties = zeros((totalQualSP,1))
 
+      numConstPerRound = run['numConstraintsPerRound']
+      solver_call_ctr = 0
+
+      suboptimal_example = 0
       iteration_nr = 0
       param_idx = 0
       const_added_ctr = 0
+
+      featureVectors = zeros((run['numFeatures'],numExamples))
+
+      self.plog('Starting training...\n')
+      # the main training loop
       while True:
          if iteration_nr == iteration_steps:
             break
 
-         for exampleIdx in range(numExamples):
-            if (exampleIdx%100) == 0:
-               print 'Current example nr %d' % exampleIdx
+         for exampleIdx,example_key in enumerate(training_set.keys()):
+            print 'Current example %d' % example_key
+            try:
+               dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
+               getData(training_set,example_key,run)
+            except SpliceSiteException:
+               continue
 
-            dna = Sequences[exampleIdx] 
-            est = Ests[exampleIdx] 
+            dna_len = len(dna)
 
-            if Conf.mode == 'normal':
-               quality = [40]*len(est)
+            if run['enable_quality_scores']:
+               quality = currentQualities[quality_index]
+            else:
+               quality = [40]*len(read)
 
-            if Conf.mode == 'using_quality_scores':
-               quality = Qualities[exampleIdx]
+            if not run['enable_splice_signals']:
+               for idx,elem in enumerate(don_supp):
+                  if elem != -inf:
+                     don_supp[idx] = 0.0
 
-            exons = Exons[exampleIdx] 
-            # NoiseMatrix = Noises[exampleIdx] 
-            don_supp = Donors[exampleIdx] 
-            acc_supp = Acceptors[exampleIdx] 
+               for idx,elem in enumerate(acc_supp):
+                  if elem != -inf:
+                     acc_supp[idx] = 0.0
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-            
-            #print 'trueWeights' 
+            if run['mode'] == 'using_quality_scores':
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
+               computeSpliceAlignWithQuality(dna, exons, est, original_est,\
+               quality, qualityPlifs,run)
+            else:
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+            dna_calc = dna_calc.replace('-','')
+
+            #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
             # Calculate the weights
-            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+            trueWeightDon, trueWeightAcc, trueWeightIntron =\
+            computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
-            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+            currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
 
-            #pdb.set_trace()
-            if Conf.mode == 'using_quality_scores':
+            if run['mode'] == 'using_quality_scores':
                totalQualityPenalties = param[-totalQualSP:]
-               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+               currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
 
             # Calculate w'phi(x,y) the total score of the alignment
             trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
@@ -210,175 +302,80 @@ class QPalma:
             # The allWeights vector is supposed to store the weight parameter
             # of the true alignment as well as the weight parameters of the
             # num_path[exampleIdx] other alignments
-            allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
+            allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
             allWeights[:,0] = trueWeight[:,0]
 
             AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
             AlignmentScores[0] = trueAlignmentScore
 
             ################## Calculate wrong alignment(s) ######################
-
             # Compute donor, acceptor with penalty_lookup_new
             # returns two double lists
             donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
-            #myalign wants the acceptor site on the g of the ag
-            acceptor = acceptor[1:]
-            acceptor.append(-inf)
-
-            dna = str(dna)
-            est = str(est)
-            dna_len = len(dna)
-            est_len = len(est)
-
-            # check that splice site scores are at dna positions as expected by
-            # the dynamic programming component
-            for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
-               assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
-               or dna[d_pos+1] == 't'), pdb.set_trace()
-                
-            for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
-               assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
-
             ps = h.convert2SWIG()
 
-            prb = QPalmaDP.createDoubleArrayFromList(quality)
-            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-            mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
-            d_len = len(donor)
-            donor = QPalmaDP.createDoubleArrayFromList(donor)
-            a_len = len(acceptor)
-            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-            # Create the alignment object representing the interface to the C/C++ code.
-            currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-            #print 'Calling myalign...'
-            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
-             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-             print_matrix)
-
-            #print 'After calling myalign...'
-            #print 'Calling getAlignmentResults...'
-
-            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
-            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
-            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
-            c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
-
-            c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
-
-            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-            c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-            #print 'After calling getAlignmentResults...'
-
-            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
-            newEstAlign    = zeros((est_len*num_path[exampleIdx],1))
-            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
-            newDPScores    = zeros((num_path[exampleIdx],1))
-            newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
-
-            #print 'newSpliceAlign'
-            for i in range(dna_len*num_path[exampleIdx]):
-               newSpliceAlign[i] = c_SpliceAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'newEstAlign'
-            for i in range(est_len*num_path[exampleIdx]):
-               newEstAlign[i] = c_EstAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'weightMatch'
-            for i in range(mm_len*num_path[exampleIdx]):
-               newWeightMatch[i] = c_WeightMatch[i]
-            #   print '%f' % (weightMatch[i])
-
-            #print 'ViterbiScores'
-            for i in range(num_path[exampleIdx]):
-               newDPScores[i] = c_DPScores[i]
-
-
-            if self.use_quality_scores:
-               for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
-                  newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-            #  equals palma up to here
-
-            #print "Calling destructors"
-            del c_SpliceAlign
-            del c_EstAlign
-            del c_WeightMatch
-            del c_DPScores
-            del c_qualityPlifsFeatures
-            del currentAlignment
+            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+            newQualityPlifsFeatures, unneeded1, unneeded2 =\
+            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
+            mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
             newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
             newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
-            # Calculate weights of the respective alignments Note that we are
-            # calculating n-best alignments without hamming loss, so we
-            # have to keep track which of the n-best alignments correspond to
-            # the true one in order not to incorporate a true alignment in the
+
+            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
+            # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without 
+            # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order 
+            # not to incorporate a true alignment in the
             # constraints. To keep track of the true and false alignments we
             # define an array true_map with a boolean indicating the
             # equivalence to the true alignment for each decoded alignment.
             true_map = [0]*(num_path[exampleIdx]+1)
             true_map[0] = 1
-            path_loss   = [0]*(num_path[exampleIdx])
 
             for pathNr in range(num_path[exampleIdx]):
-               #print 'decodedWeights' 
                weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
                h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
                acc_supp)
-
-               decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-               for qidx in range(Conf.totalQualSuppPoints):
-                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
-
-               #pdb.set_trace()
-
-               path_loss[pathNr] = 0
-               # sum up positionwise loss between alignments
-               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
-                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
-                     path_loss[pathNr] += 1
-
-               #pdb.set_trace()
-
+              
+               decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
+               decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
                # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
-               allWeights[:,pathNr+1] = wp
+               allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
 
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
-               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
 
-               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+               featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
 
-               # Check wether scalar product + loss equals viterbi score
-               #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                distinct_scores = False
                if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
                   distinct_scores = True
-               
-               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
+
+               # Check wether scalar product + loss equals viterbi score
                if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+                  self.plog("Scalar prod. + loss not equals Viterbi output!\n")
                   pdb.set_trace()
 
-               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
+               self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+
+               # if the pathNr-best alignment is very close to the true alignment consider it as true
                if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
                   true_map[pathNr+1] = 1
 
-               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+               if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+                  print "suboptimal_example %d\n" %exampleIdx
+                  #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
+                  #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
+
+                  #pdb.set_trace()
+                  suboptimal_example += 1
+                  self.plog("suboptimal_example %d\n" %exampleIdx)
 
                # the true label sequence should not have a larger score than the maximal one WHYYYYY?
                # this means that all n-best paths are to close to each other 
@@ -386,31 +383,43 @@ class QPalma:
                if len([elem for elem in true_map if elem == 1]) == len(true_map):
                   num_path[exampleIdx] = num_path[exampleIdx]+1
 
-               # Choose true and first false alignment for extending A
+               # Choose true and first false alignment for extending
                firstFalseIdx = -1
                for map_idx,elem in enumerate(true_map):
                   if elem == 0:
                      firstFalseIdx = map_idx
                      break
 
+               if False:
+                  self.plog("Is considered as: %d\n" % true_map[1])
+
+                  #result_len = currentAlignment.getResultLength()
+
+                  dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+
+                  _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
+                  _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
-                  trueWeights       = allWeights[:,0]
                   firstFalseWeights = allWeights[:,firstFalseIdx]
-                  differenceVector  = trueWeights - firstFalseWeights
+                  differenceVector  = trueWeight - firstFalseWeights
                   #pdb.set_trace()
 
-                  if Conf.USE_OPT:
-                     const_added = solver.addConstraint(differenceVector, exampleIdx)
-                     const_added_ctr += 1
-               #
+                  const_added = solver.addConstraint(differenceVector, exampleIdx)
+                  const_added_ctr += 1
+
                # end of one example processing 
-               #
 
             # call solver every nth example //added constraint
-            if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
+            if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
                objValue,w,self.slacks = solver.solve()
+               solver_call_ctr += 1
+
+               if solver_call_ctr == 5:
+                  numConstPerRound = 200
+                  self.plog('numConstPerRound is now %d\n'% numConstPerRound)
 
                if math.fabs(objValue - self.oldObjValue) <= 1e-6:
                   self.noImprovementCtr += 1
@@ -424,447 +433,236 @@ class QPalma:
                sum_xis = 0
                for elem in self.slacks:
                   sum_xis +=  elem
+
+               print 'sum of slacks is %f'% sum_xis 
+               self.plog('sum of slacks is %f\n'% sum_xis)
    
                for i in range(len(param)):
                   param[i] = w[i]
 
-               #pdb.set_trace()
                cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
                param_idx += 1
-               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+               [h,d,a,mmatrix,qualityPlifs] =\
+               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
 
-         #
-         # end of one iteration through all examples
-         #
-         if self.noImprovementCtr == numExamples+1:
-            break
+         ##############################################
+         # end of one iteration through all examples  #
+         ##############################################
+
+         self.plog("suboptimal rounds %d\n" %suboptimal_example)
+
+         if self.noImprovementCtr == numExamples*2:
+            FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
          iteration_nr += 1
 
       #
       # end of optimization 
       #  
-      print 'Training completed'
+      FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
-      pa = para()
-      pa.h = h
-      pa.d = d
-      pa.a = a
-      pa.mmatrix = mmatrix
-      pa.qualityPlifs = qualityPlifs
 
-      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-      #cPickle.dump(pa,open('elegans.param','w+'))
+   def FinalizeTraining(self,vector,name):
+      self.plog("Training completed")
+      cPickle.dump(param,open(name,'w+'))
       self.logfh.close()
-
-   def evaluate(self,param_filename):
-      beg = Conf.prediction_begin
-      end = Conf.prediction_end
-
-      # predict on training set
-      print '##### Prediction on the training set #####'
-      self.predict(param_filename,0,beg)
-      
-      # predict on test set
-      print '##### Prediction on the test set #####'
-      self.predict(param_filename,beg,end)
+      sys.exit(0)
    
-      pdb.set_trace()
-
-   def predict(self,param_filename,beg,end):
-
-      self.logfh = open('_qpalma_predict.log','w+')
-
-      Sequences   = self.Sequences[beg:end]
-      Exons       = self.Exons[beg:end]
-      Ests        = self.Ests[beg:end]
-      Qualities   = self.Qualities[beg:end]
-      Acceptors   = self.Acceptors[beg:end]
-      Donors      = self.Donors[beg:end]
-      SplitPos    = self.SplitPos[beg:end]
-
-      # number of training instances
-      N = numExamples = len(Sequences) 
-      assert len(Exons) == N and len(Ests) == N\
-      and len(Qualities) == N and len(Acceptors) == N\
-      and len(Donors) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
-      self.plog('Number of training examples: %d\n'% numExamples)
-
-      self.noImprovementCtr = 0
-      self.oldObjValue = 1e8
-
-      remove_duplicate_scores = Conf.remove_duplicate_scores 
-      print_matrix            = Conf.print_matrix 
-      anzpath                 = Conf.anzpath 
-
-      #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
-      param = load_param(param_filename)
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      #############################################################################################
-      # Prediction
-      #############################################################################################
-      self.plog('Starting prediction...\n')
-
-      donSP       = Conf.numDonSuppPoints
-      accSP       = Conf.numAccSuppPoints
-      lengthSP    = Conf.numLengthSuppPoints
-      mmatrixSP   = Conf.sizeMatchmatrix[0]\
-      *Conf.sizeMatchmatrix[1]
-      numq        = Conf.numQualSuppPoints
-      totalQualSP = Conf.totalQualSuppPoints
-
-      currentPhi = zeros((Conf.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
 
-      exon1Begin = []
-      exon1End = []
-      exon2Begin = []
-      exon2End = []
-      allWrongExons = []
-
-      for exampleIdx in range(numExamples):
-         dna = Sequences[exampleIdx] 
-         est = Ests[exampleIdx] 
-         currentSplitPos = SplitPos[exampleIdx]
-
-         if Conf.mode == 'normal':
-            quality = [40]*len(est)
+###############################################################################
+#
+# End of the code needed for training 
+#
+# Begin of code for prediction
+#
+###############################################################################
 
-         if Conf.mode == 'using_quality_scores':
-            quality = Qualities[exampleIdx]
+   def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
+      """
+      Performing a prediction takes...
+      """
+      self.set_name = set_name
 
-         exons = Exons[exampleIdx] 
-         # NoiseMatrix = Noises[exampleIdx] 
-         don_supp = Donors[exampleIdx] 
-         acc_supp = Acceptors[exampleIdx] 
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      full_working_path = self.run['result_dir']
 
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-         
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+      print 'full_working_path is %s' % full_working_path 
 
-         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+      #assert not os.path.exists(full_working_path)
+      if not os.path.exists(full_working_path):
+         os.mkdir(full_working_path)
 
-         if Conf.mode == 'using_quality_scores':
-            totalQualityPenalties = param[-totalQualSP:]
-            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+      assert os.path.exists(full_working_path)
 
-         # Calculate w'phi(x,y) the total score of the alignment
-         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+      # ATTENTION: Changing working directory
+      os.chdir(full_working_path)
 
-         # The allWeights vector is supposed to store the weight parameter
-         # of the true alignment as well as the weight parameters of the
-         # 1 other alignments
-         allWeights = zeros((Conf.numFeatures,1+1))
-         allWeights[:,0] = trueWeight[:,0]
+      self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
-         AlignmentScores = [0.0]*(1+1)
-         AlignmentScores[0] = trueAlignmentScore
+      # number of prediction instances
+      self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
-         ################## Calculate wrong alignment(s) ######################
+      # load dataset and fetch instances that shall be predicted
+      dataset = cPickle.load(open(dataset_fn))
 
-         # Compute donor, acceptor with penalty_lookup_new
-         # returns two double lists
-         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+      prediction_set = {}
+      for key in prediction_keys:
+         prediction_set[key] = dataset[key]
 
-         #myalign wants the acceptor site on the g of the ag
-         acceptor = acceptor[1:]
-         acceptor.append(-inf)
+      # we do not need the full dataset anymore
+      del dataset
+   
+      # Load parameter vector to predict with
+      param = cPickle.load(open(param_fn))
 
-         dna = str(dna)
-         est = str(est)
-         dna_len = len(dna)
-         est_len = len(est)
+      self.predict(prediction_set,param)
 
-         ps = h.convert2SWIG()
 
-         prb = QPalmaDP.createDoubleArrayFromList(quality)
-         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+   def predict(self,prediction_set,param):
+      """
+      This method...
+      """
 
-         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
 
-         d_len = len(donor)
-         donor = QPalmaDP.createDoubleArrayFromList(donor)
-         a_len = len(acceptor)
-         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
 
-         # Create the alignment object representing the interface to the C/C++ code.
-         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+      # Set the parameters such as limits/penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] =\
+      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
 
-         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+      if not self.qpalma_debug_mode:
+         self.plog('Starting prediction...\n')
 
-         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-         currentAlignment.myalign(1, dna, dna_len,\
-          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-          print_matrix)
+      self.problem_ctr = 0
 
-         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
-         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
-         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
-         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
+      # where we store the predictions
+      allPredictions = []
 
-         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+      # we take the first quality vector of the tuple of quality vectors
+      quality_index = 0
 
-         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+      # beginning of the prediction loop
+      for example_key in prediction_set.keys():
+         print 'Current example %d' % example_key
+         for example in prediction_set[example_key]:
 
-         newSpliceAlign = zeros((dna_len,1))
-         newEstAlign    = zeros((est_len,1))
-         newWeightMatch = zeros((mm_len,1))
-         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+            currentSeqInfo,read,currentQualities = example
 
-         for i in range(dna_len):
-            newSpliceAlign[i] = c_SpliceAlign[i]
+            id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
+            currentSeqInfo 
 
-         for i in range(est_len):
-            newEstAlign[i] = c_EstAlign[i]
+            if not self.qpalma_debug_mode:
+               self.plog('Loading example id: %d...\n'% int(id))
 
-         for i in range(mm_len):
-            newWeightMatch[i] = c_WeightMatch[i]
+            if self.run['enable_quality_scores']:
+               quality = currentQualities[quality_index]
+            else:
+               quality = [40]*len(read)
 
-         newDPScores = c_DPScores[0]
+            try:
+               currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+            except:
+               self.problem_ctr += 1
+               continue
 
-         for i in range(Conf.totalQualSuppPoints):
-            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+            if not self.run['enable_splice_signals']:
+               for idx,elem in enumerate(currentDon):
+                  if elem != -inf:
+                     currentDon[idx] = 0.0
 
-         del c_SpliceAlign
-         del c_EstAlign
-         del c_WeightMatch
-         del c_DPScores
-         del c_qualityPlifsFeatures
-         del currentAlignment
+               for idx,elem in enumerate(currentAcc):
+                  if elem != -inf:
+                     currentAcc[idx] = 0.0
 
-         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
-         newWeightMatch = newWeightMatch.reshape(1,mm_len)
-         true_map = [0]*2
-         true_map[0] = 1
-         pathNr = 0
+            current_prediction = self.calc_alignment(currentDNASeq, read,\
+            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
 
-         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+            current_prediction['id']         = id
+            current_prediction['chr']        = chromo
+            current_prediction['strand']     = strand
+            current_prediction['start_pos']  = genomicSeq_start
 
-         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-         for qidx in range(Conf.totalQualSuppPoints):
-            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+            allPredictions.append(current_prediction)
 
-         # Gewichte in restliche Zeilen der Matrix speichern
-         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
-         allWeights[:,pathNr+1] = wp
+      if not self.qpalma_debug_mode:
+         self.FinalizePrediction(allPredictions)
+      else:
+         return allPredictions
 
-         hpen = mat(h.penalties).reshape(len(h.penalties),1)
-         dpen = mat(d.penalties).reshape(len(d.penalties),1)
-         apen = mat(a.penalties).reshape(len(a.penalties),1)
-         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
-         # if the pathNr-best alignment is very close to the true alignment consider it as true
-         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-            true_map[pathNr+1] = 1
+   def FinalizePrediction(self,allPredictions):
+      """ End of the prediction loop we save all predictions in a pickle file and exit """
 
-         e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
+      self.plog('Prediction completed\n')
+      mes =  'Problem ctr %d' % self.problem_ctr
+      self.plog(mes+'\n')
+      self.logfh.close()
+      sys.exit(0)
 
-         if e1_b_off != None:
-            exon1Begin.append(e1_b_off)
-            exon1End.append(e1_e_off)
-            exon2Begin.append(e2_b_off)
-            exon2End.append(e2_e_off)
-         else:
-            allWrongExons.append((newExons,exons))
 
-      e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
-      e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
+   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+      """
+      Given two sequences and the parameters we calculate on alignment
+      """
 
-      print 'Total num. of Examples: %d' % numExamples
-      print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
-      print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
-      print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
+      run = self.run
+      donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
-      print 'Prediction completed'
-      self.logfh.close()
+      if '-' in read:
+         self.plog('found gap\n')
+         read = read.replace('-','')
+         assert len(read) == Conf.read_size
 
-   def evaluatePositions(self,eBegin,eEnd):
-      eBegin_pos = [elem for elem in eBegin if elem == 0]
-      eBegin_neg = [elem for elem in eBegin if elem != 0]
-      eEnd_pos = [elem for elem in eEnd if elem == 0]
-      eEnd_neg = [elem for elem in eEnd if elem != 0]
-
-      mean_eBegin_neg = 0
-      for idx in range(len(eBegin_neg)):
-         mean_eBegin_neg += eBegin_neg[idx]
-         
-      mean_eBegin_neg /= 1.0*len(eBegin_neg)
-
-      mean_eEnd_neg = 0
-      for idx in range(len(eEnd_neg)):
-         mean_eEnd_neg += eEnd_neg[idx]
-
-      mean_eEnd_neg /= 1.0*len(eEnd_neg)
-
-      return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
-
-
-def fetch_genome_info(ginfo_filename):
-   if not os.path.exists(ginfo_filename):
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
-      return ginfo['genome_info']
-
-   else:
-      return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-
-
-   param = zeros((Conf.numFeatures,1))
-   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-   for idx in range(len(qualityPlifs)):
-      currentPlif       = qualityPlifs[idx]
-      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
-      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
-   return param
-
-def load_param(filename):
-   param = None 
-   #try:
-   #   para = cPickle.load(open(filename))
-   #except:
-   #   print 'Error: Could not open parameter file!'
-   #   sys.exit(1)
-   #
-   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
-   param = cPickle.load(open(filename))
-   return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
-   newExons = []
-   oldElem = -1
-   SpliceAlign = SpliceAlign.flatten().tolist()[0]
-   SpliceAlign.append(-1)
-   for pos,elem in enumerate(SpliceAlign):
-      if pos == 0:
-         oldElem = -1
-      else:
-         oldElem = SpliceAlign[pos-1]
+      ps = h.convert2SWIG()
 
-      if oldElem != 0 and elem == 0: # start of exon
-         newExons.append(pos)
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+      newQualityPlifsFeatures, dna_array, read_array =\
+      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
 
-      if oldElem == 0 and elem != 0: # end of exon
-         newExons.append(pos)
+      mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
-   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
+      true_map    = [0]*2
+      true_map[0] = 1
+      pathNr      = 0
 
-   if len(newExons) == 4:
-      e1_begin,e1_end = newExons[0],newExons[1]
-      e2_begin,e2_end = newExons[2],newExons[3]
+      _newSpliceAlign   = array.array('B',newSpliceAlign)
+      _newEstAlign      = array.array('B',newEstAlign)
+       
+      #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) 
 
-   #elif len(newExons) > 4 :
-   #   e1_begin,e1_end = newExons[0],newExons[1]
-   #   e2_begin,e2_end = newExons[-2],newExons[-1]
-   else:
-      return None,None,None,None,newExons
+      dna_array   = array.array('B',dna_array)
+      read_array  = array.array('B',read_array)
 
-   e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
-   e1_e_off = int(math.fabs(e1_end - exons[0,1]))
+      newExons = self.calculatePredictedExons(newSpliceAlign)
 
-   e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
-   e2_e_off = int(math.fabs(e2_end - exons[1,1]))
+      current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
+      'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+      'dna_array':dna_array, 'read_array':read_array }
 
-   return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+      return current_prediction
 
-def calcStat(Acceptor,Donor,Exons):
-   maxAcc = -100
-   minAcc = 100
-   maxDon = -100
-   minDon = 100
 
-   acc_vec_pos = []
-   acc_vec_neg = []
-   don_vec_pos = []
-   don_vec_neg = []
+   def calculatePredictedExons(self,SpliceAlign):
+      newExons = []
+      oldElem = -1
+      SpliceAlign.append(-1)
+      for pos,elem in enumerate(SpliceAlign):
+         if pos == 0:
+            oldElem = -1
+         else:
+            oldElem = SpliceAlign[pos-1]
 
-   for jdx in range(len(Acceptors)):
-      currentExons = Exons[jdx]
-      currentAcceptor = Acceptors[jdx]
-      currentAcceptor = currentAcceptor[1:]
-      currentAcceptor.append(-inf)
-      currentDonor = Donors[jdx]
+         if oldElem != 0 and elem == 0: # start of exon
+            newExons.append(pos)
 
-      for idx,elem in enumerate(currentAcceptor):
-         if idx == (int(currentExons[1,0])-1): # acceptor site
-            acc_vec_pos.append(elem)
-         else:
-            acc_vec_neg.append(elem)
+         if oldElem == 0 and elem != 0: # end of exon
+            newExons.append(pos)
 
-      for idx,elem in enumerate(currentDonor):
-         if idx == (int(currentExons[0,1])): # donor site
-            don_vec_pos.append(elem)
-         else:
-            don_vec_neg.append(elem)
-
-   acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
-   acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
-   don_pos = [elem for elem in don_vec_pos if elem != -inf]
-   don_neg = [elem for elem in don_vec_neg if elem != -inf]
-
-   pdb.set_trace()
-
-   for idx in range(len(Sequences)):
-      acc = [elem for elem in Acceptors[idx] if elem != -inf]
-      maxAcc = max(max(acc),maxAcc)
-      minAcc = min(min(acc),minAcc)
-
-      don = [elem for elem in Donors[idx] if elem != -inf]
-      maxDon = max(max(don),maxDon)
-      minDon = min(min(don),minDon)
-
-if __name__ == '__main__':
-
-   qpalma = QPalma()
-
-   if len(sys.argv) == 2:
-      mode = sys.argv[1]
-      assert mode == 'train'
-      qpalma.train()
-
-   elif len(sys.argv) == 3:
-      mode = sys.argv[1]
-      param_filename = sys.argv[2]
-      assert mode == 'predict' 
-      assert os.path.exists(param_filename)
-      qpalma.evaluate(param_filename)
-   else:
-      print 'You have to choose between training or prediction mode:'
-      print 'python qpalma. py (train|predict) <param_file>' 
+      return newExons