+ added some text and references to the documentation
[qpalma.git] / scripts / qpalma_main.py
index 8af76e2..e70e6d2 100644 (file)
@@ -1,78 +1,47 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-###########################################################
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
 #
-# The QPalma project aims at extending the Palma project 
-# to be able to use Solexa reads together with their 
-# quality scores.
-# 
-# This file represents the conversion of the main matlab 
-# training loop for Palma to Python.
-# 
-# Author: Fabio De Bona
-# 
-###########################################################
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
 
-import sys
+import array
 import cPickle
-import pdb
-import re
 import os.path
+import pdb
+import sys
 
-from compile_dataset import getSpliceScores, get_seq_and_scores
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
 
-import QPalmaDP
-import qpalma
-
-from qpalma.SIQP_CPX import SIQPSolver
+#from qpalma.SIQP_CPX import SIQPSolver
 #from qpalma.SIQP_CVXOPT import SIQPSolver
 
-from qpalma.DataProc import *
+import QPalmaDP
+import qpalma
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
 from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
+from qpalma.Plif import Plf,compute_donacc
 
-from qpalma.Configuration import *
-
-# this two imports are needed for the load genomic resp. interval query
-# functions
-from Genefinding import *
-from genome_utils import load_genomic
-from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
+from Utils import pprint_alignment, get_alignment
 
 class SpliceSiteException:
    pass
 
 
-def unbracket_est(est):
-   new_est = ''
-   e = 0
-
-   while True:
-      if e >= len(est):
-         break
-
-      if est[e] == '[':
-         new_est += est[e+2]
-         e += 4
-      else:
-         new_est += est[e]
-         e += 1
-
-   return "".join(new_est).lower()
-
-
 def getData(training_set,exampleKey,run):
-   currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
+   """ This function...  """
+
+   currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
    id,chr,strand,up_cut,down_cut = currentSeqInfo
 
    est = original_est
@@ -91,14 +60,8 @@ def getData(training_set,exampleKey,run):
    dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
 
-   # splice score is located at g of ag
-   ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
-   assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
-
-   gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
-   assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
+   #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
 
-   #original_exons = Exons[exampleIdx]
    original_exons = currentExons
    exons = original_exons - (up_cut-1)
    exons[0,0] -= 1
@@ -123,15 +86,17 @@ def getData(training_set,exampleKey,run):
    return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
 
 
-
 class QPalma:
    """
    This class wraps the training and prediction functions for 
    the alignment.
    """
    
-   def __init__(self):
+   def __init__(self,run,seqInfo,dmode=False):
       self.ARGS = Param()
+      self.qpalma_debug_mode = dmode
+      self.run = run
+      self.seqInfo = seqInfo
 
 
    def plog(self,string):
@@ -144,7 +109,6 @@ class QPalma:
       Given the needed input this method calls the QPalma C module which
       calculates a dynamic programming in order to obtain an alignment
       """
-      run = self.run
 
       dna_len = len(dna)
       est_len = len(est)
@@ -153,7 +117,7 @@ class QPalma:
       chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
 
       matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-      mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+      mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
 
       d_len = len(donor)
       donor = QPalmaDP.createDoubleArrayFromList(donor)
@@ -161,80 +125,31 @@ class QPalma:
       acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
       # Create the alignment object representing the interface to the C/C++ code.
-      currentAlignment = QPalmaDP.Alignment(run['numQualPlifs'],run['numQualSuppPoints'], self.use_quality_scores)
+      currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
       c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
       # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
       currentAlignment.myalign( current_num_path, dna, dna_len,\
        est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-       acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-       print_matrix)
-
-      c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
-      c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
-      c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*current_num_path))
-      c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*current_num_path)
-
-      c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(run['totalQualSuppPoints']*current_num_path))
+       acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
+       self.run['print_matrix'] )
 
       if prediction_mode:
          # part that is only needed for prediction
          result_len = currentAlignment.getResultLength()
-         c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
-         c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
-
-         currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
-
-         dna_array = [0.0]*result_len
-         est_array = [0.0]*result_len
-
-         for r_idx in range(result_len):
-            dna_array[r_idx] = c_dna_array[r_idx]
-            est_array[r_idx] = c_est_array[r_idx]
-
+         dna_array,est_array = currentAlignment.getAlignmentArraysNew()
       else:
          dna_array = None
          est_array = None
 
-      currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-      c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-      newSpliceAlign = zeros((current_num_path*dna_len,1))
-      newEstAlign    = zeros((est_len*current_num_path,1))
-      newWeightMatch = zeros((current_num_path*mm_len,1))
-      newDPScores    = zeros((current_num_path,1))
-      newQualityPlifsFeatures = zeros((run['totalQualSuppPoints']*current_num_path,1))
-
-      for i in range(dna_len*current_num_path):
-         newSpliceAlign[i] = c_SpliceAlign[i]
-
-      for i in range(est_len*current_num_path):
-         newEstAlign[i] = c_EstAlign[i]
-
-      for i in range(mm_len*current_num_path):
-         newWeightMatch[i] = c_WeightMatch[i]
-
-      for i in range(current_num_path):
-         newDPScores[i] = c_DPScores[i]
-
-      if self.use_quality_scores:
-         for i in range(run['totalQualSuppPoints']*current_num_path):
-            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-      del c_SpliceAlign
-      del c_EstAlign
-      del c_WeightMatch
-      del c_DPScores
-      del c_qualityPlifsFeatures
-      del currentAlignment
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResultsNew()
 
       return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, est_array
 
 
-   def train(self,run,training_set):
-      self.run = run
-
-      full_working_path = os.path.join(run['alignment_dir'],run['name'])
+   def init_train(self,training_set):
+      full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
 
       #assert not os.path.exists(full_working_path)
       if not os.path.exists(full_working_path):
@@ -246,10 +161,10 @@ class QPalma:
       os.chdir(full_working_path)
 
       self.logfh = open('_qpalma_train.log','w+')
-      cPickle.dump(run,open('run_obj.pickle','w+'))
+      cPickle.dump(self.run,open('run_obj.pickle','w+'))
 
       self.plog("Settings are:\n")
-      self.plog("%s\n"%str(run))
+      self.plog("%s\n"%str(self.run))
 
       if self.run['mode'] == 'normal':
          self.use_quality_scores = False
@@ -258,56 +173,60 @@ class QPalma:
          self.use_quality_scores = True
       else:
          assert(False)
-   
+
+
+   def setUpSolver(self):
+      # Initialize solver 
+      self.plog('Initializing problem...\n')
+      
+      try:
+         solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
+      except:
+         self.plog('Got no license. Telling queue to reschedule job...\n')
+         sys.exit(99)
+
+      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+
+      return solver
+
+
+   def train(self,training_set):
       numExamples = len(training_set)
       self.plog('Number of training examples: %d\n'% numExamples)
 
       self.noImprovementCtr = 0
       self.oldObjValue = 1e8
 
-      iteration_steps         = run['iter_steps']
-      remove_duplicate_scores = run['remove_duplicate_scores']
-      print_matrix            = run['print_matrix']
-      anzpath                 = run['anzpath']
+      iteration_steps         = self.run['iter_steps']
+      remove_duplicate_scores = self.run['remove_duplicate_scores']
+      print_matrix            = self.run['print_matrix']
+      anzpath                 = self.run['anzpath']
 
-      # Initialize parameter vector  / 
-      #param = Conf.fixedParam[:run['numFeatures']]
+      # Initialize parameter vector
       param = numpy.matlib.rand(run['numFeatures'],1)
    
-      lengthSP    = run['numLengthSuppPoints']
-      donSP       = run['numDonSuppPoints']
-      accSP       = run['numAccSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = run['numQualSuppPoints']
-      totalQualSP = run['totalQualSuppPoints']
+      lengthSP    = self.run['numLengthSuppPoints']
+      donSP       = self.run['numDonSuppPoints']
+      accSP       = self.run['numAccSuppPoints']
+      mmatrixSP   = self.run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = self.run['numQualSuppPoints']
+      totalQualSP = self.run['totalQualSuppPoints']
 
       # no intron length model
-      if not run['enable_intron_length']:
+      if not self.run['enable_intron_length']:
          param[:lengthSP] *= 0.0
 
       # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
 
-      # Initialize solver 
-      self.plog('Initializing problem...\n')
-      
-      try:
-         solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
-      except:
-         self.plog('Got no license. Telling queue to reschedule job...\n')
-         sys.exit(99)
-
-      #solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-      #solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+      solver = self.setUpSolver()
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*numExamples
+
       # stores the gap for each example
       gap      = [0.0]*numExamples
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
 
       currentPhi = zeros((run['numFeatures'],1))
       totalQualityPenalties = zeros((totalQualSP,1))
@@ -322,6 +241,7 @@ class QPalma:
 
       featureVectors = zeros((run['numFeatures'],numExamples))
 
+      self.plog('Starting training...\n')
       # the main training loop
       while True:
          if iteration_nr == iteration_steps:
@@ -337,14 +257,10 @@ class QPalma:
 
             dna_len = len(dna)
 
-            if run['mode'] == 'normal':
-               quality = [40]*len(est)
-
-            if run['mode'] == 'using_quality_scores':
-               quality = currentQualities[0]
-
-            if not run['enable_quality_scores']:
-               quality = [40]*len(est)
+            if run['enable_quality_scores']:
+               quality = currentQualities[quality_index]
+            else:
+               quality = [40]*len(read)
 
             if not run['enable_splice_signals']:
                for idx,elem in enumerate(don_supp):
@@ -397,33 +313,20 @@ class QPalma:
             # returns two double lists
             donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
-            #myalign wants the acceptor site on the g of the ag
-            #acceptor = acceptor[1:]
-            #acceptor.append(-inf)
-
-            #donor = [-inf] + donor[:-1]
-
             ps = h.convert2SWIG()
 
-            _newSpliceAlign, _newEstAlign, _newWeightMatch, _newDPScores,\
-            _newQualityPlifsFeatures, unneeded1, unneeded2 =\
+            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+            newQualityPlifsFeatures, unneeded1, unneeded2 =\
             self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
             mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
-            newSpliceAlign = _newSpliceAlign
-            newEstAlign    = _newEstAlign
-            newWeightMatch = _newWeightMatch
-            newDPScores    = _newDPScores
-            newQualityPlifsFeatures = _newQualityPlifsFeatures
-
             newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
             newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
 
             newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
-            # Calculate weights of the respective alignments. Note that we are
-            # calculating n-best alignments without hamming loss, so we
-            # have to keep track which of the n-best alignments correspond to
-            # the true one in order not to incorporate a true alignment in the
+            # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without 
+            # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order 
+            # not to incorporate a true alignment in the
             # constraints. To keep track of the true and false alignments we
             # define an array true_map with a boolean indicating the
             # equivalence to the true alignment for each decoded alignment.
@@ -490,27 +393,13 @@ class QPalma:
                if False:
                   self.plog("Is considered as: %d\n" % true_map[1])
 
-                  result_len = currentAlignment.getResultLength()
-                  c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
-                  c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+                  #result_len = currentAlignment.getResultLength()
 
-                  currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
-
-                  dna_array = [0.0]*result_len
-                  est_array = [0.0]*result_len
-
-                  for r_idx in range(result_len):
-                     dna_array[r_idx] = c_dna_array[r_idx]
-                     est_array[r_idx] = c_est_array[r_idx]
+                  dna_array,est_array = currentAlignment.getAlignmentArraysNew()
 
                   _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
                   _newEstAlign = newEstAlign[0].flatten().tolist()[0]
 
-                  #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-                  #self.plog(line1+'\n')
-                  #self.plog(line2+'\n')
-                  #self.plog(line3+'\n')
-
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
                if firstFalseIdx != -1:
@@ -518,13 +407,10 @@ class QPalma:
                   differenceVector  = trueWeight - firstFalseWeights
                   #pdb.set_trace()
 
-                  #print 'NOT ADDING ANY CONSTRAINTS'
                   const_added = solver.addConstraint(differenceVector, exampleIdx)
-
                   const_added_ctr += 1
-               #
+
                # end of one example processing 
-               #
 
             # call solver every nth example //added constraint
             if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
@@ -557,27 +443,31 @@ class QPalma:
                cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
                param_idx += 1
                [h,d,a,mmatrix,qualityPlifs] =\
-               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
+               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
 
-         #
-         # end of one iteration through all examples
-         #
+         ##############################################
+         # end of one iteration through all examples  #
+         ##############################################
 
          self.plog("suboptimal rounds %d\n" %suboptimal_example)
 
          if self.noImprovementCtr == numExamples*2:
-            break
+            FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
          iteration_nr += 1
 
       #
       # end of optimization 
       #  
-      print 'Training completed'
+      FinalizeTraining(param,'param_%d.pickle'%param_idx)
 
-      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-      self.logfh.close()
 
+   def FinalizeTraining(self,vector,name):
+      self.plog("Training completed")
+      cPickle.dump(param,open(name,'w+'))
+      self.logfh.close()
+      sys.exit(0)
+   
 
 ###############################################################################
 #
@@ -587,13 +477,14 @@ class QPalma:
 #
 ###############################################################################
 
-   def predict(self,run,dataset_fn,prediction_keys,param,set_name):
+   def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
       """
       Performing a prediction takes...
       """
-      self.run = run
+      self.set_name = set_name
 
-      full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      full_working_path = self.run['result_dir']
 
       print 'full_working_path is %s' % full_working_path 
 
@@ -608,14 +499,6 @@ class QPalma:
 
       self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
 
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
       # number of prediction instances
       self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
 
@@ -628,69 +511,66 @@ class QPalma:
 
       # we do not need the full dataset anymore
       del dataset
+   
+      # Load parameter vector to predict with
+      param = cPickle.load(open(param_fn))
 
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] =\
-      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
+      self.predict(prediction_set,param)
 
-      #############################################################################################
-      # Prediction
-      #############################################################################################
-      self.plog('Starting prediction...\n')
 
-      donSP       = self.run['numDonSuppPoints']
-      accSP       = self.run['numAccSuppPoints']
-      lengthSP    = self.run['numLengthSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = self.run['numQualSuppPoints']
-      totalQualSP = self.run['totalQualSuppPoints']
+   def predict(self,prediction_set,param):
+      """
+      This method...
+      """
 
-      totalQualityPenalties = zeros((totalQualSP,1))
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
 
-      problem_ctr = 0
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
+
+      # Set the parameters such as limits/penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] =\
+      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+      if not self.qpalma_debug_mode:
+         self.plog('Starting prediction...\n')
+
+      self.problem_ctr = 0
 
       # where we store the predictions
       allPredictions = []
 
+      # we take the first quality vector of the tuple of quality vectors
+      quality_index = 0
+
       # beginning of the prediction loop
       for example_key in prediction_set.keys():
          print 'Current example %d' % example_key
-
          for example in prediction_set[example_key]:
 
-            currentSeqInfo,original_est,currentQualities = example
+            currentSeqInfo,read,currentQualities = example
 
-            id,chr,strand,genomicSeq_start,genomicSeq_stop =\
+            id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
             currentSeqInfo 
 
-            assert id == example_key
-
-            if not chr in range(1,6):
-               continue
-
-            self.plog('Loading example id: %d...\n'% int(id))
-
-            est = original_est
-            est = unbracket_est(est)
-
-            if run['mode'] == 'normal':
-               quality = [40]*len(est)
-
-            if run['mode'] == 'using_quality_scores':
-               quality = currentQualities[0]
-
-            if not run['enable_quality_scores']:
-               quality = [40]*len(est)
+            if not self.qpalma_debug_mode:
+               self.plog('Loading example id: %d...\n'% int(id))
 
-            current_example_predictions = []
+            if self.run['enable_quality_scores']:
+               quality = currentQualities[quality_index]
+            else:
+               quality = [40]*len(read)
 
             try:
-               currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+               currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
             except:
-               problem_ctr += 1
+               self.problem_ctr += 1
                continue
 
-            if not run['enable_splice_signals']:
+            if not self.run['enable_splice_signals']:
                for idx,elem in enumerate(currentDon):
                   if elem != -inf:
                      currentDon[idx] = 0.0
@@ -699,25 +579,34 @@ class QPalma:
                   if elem != -inf:
                      currentAcc[idx] = 0.0
 
-            current_prediction = self.calc_alignment(currentDNASeq, est,\
+            current_prediction = self.calc_alignment(currentDNASeq, read,\
             quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
 
-            current_prediction['id'] = id
-            #current_prediction['start_pos'] = up_cut
-            current_prediction['start_pos'] = genomicSeq_start
-            current_prediction['chr'] = chr
-            current_prediction['strand'] = strand
+            current_prediction['id']         = id
+            current_prediction['chr']        = chromo
+            current_prediction['strand']     = strand
+            current_prediction['start_pos']  = genomicSeq_start
 
             allPredictions.append(current_prediction)
 
-      # end of the prediction loop we save all predictions in a pickle file and exit
-      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(set_name),'w+'))
-      print 'Prediction completed'
-      print 'Problem ctr %d' % problem_ctr
+      if not self.qpalma_debug_mode:
+         self.FinalizePrediction(allPredictions)
+      else:
+         return allPredictions
+
+
+   def FinalizePrediction(self,allPredictions):
+      """ End of the prediction loop we save all predictions in a pickle file and exit """
+
+      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
+      self.plog('Prediction completed\n')
+      mes =  'Problem ctr %d' % self.problem_ctr
+      self.plog(mes+'\n')
       self.logfh.close()
+      sys.exit(0)
 
 
-   def calc_alignment(self, dna, est, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
       """
       Given two sequences and the parameters we calculate on alignment
       """
@@ -725,45 +614,37 @@ class QPalma:
       run = self.run
       donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
-      dna = str(dna)
-      est = str(est)
-
-      if '-' in est:
+      if '-' in read:
          self.plog('found gap\n')
-         est = est.replace('-','')
-         assert len(est) == 36
-
-      dna_len = len(dna)
-      est_len = len(est)
+         read = read.replace('-','')
+         assert len(read) == Conf.read_size
 
       ps = h.convert2SWIG()
 
       newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-      newQualityPlifsFeatures, dna_array, est_array =\
-      self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+      newQualityPlifsFeatures, dna_array, read_array =\
+      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
 
       mm_len = run['matchmatrixRows']*run['matchmatrixCols']
 
-      # old code removed
-      newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
-      newWeightMatch = newWeightMatch.reshape(1,mm_len)
-      true_map = [0]*2
+      true_map    = [0]*2
       true_map[0] = 1
-      pathNr = 0
+      pathNr      = 0
 
-      _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
-      _newEstAlign = newEstAlign.flatten().tolist()[0]
+      _newSpliceAlign   = array.array('B',newSpliceAlign)
+      _newEstAlign      = array.array('B',newEstAlign)
        
-      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
-      #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-      #self.plog(line1+'\n')
-      #self.plog(line2+'\n')
-      #self.plog(line3+'\n')
+      #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) 
+
+      dna_array   = array.array('B',dna_array)
+      read_array  = array.array('B',read_array)
 
       newExons = self.calculatePredictedExons(newSpliceAlign)
 
-      current_prediction = {'predExons':newExons, 'dna':dna, 'est':est, 'DPScores':newDPScores,\
-      'alignment':alignment}
+      current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
+      'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+      'dna_array':dna_array, 'read_array':read_array }
 
       return current_prediction
 
@@ -771,7 +652,6 @@ class QPalma:
    def calculatePredictedExons(self,SpliceAlign):
       newExons = []
       oldElem = -1
-      SpliceAlign = SpliceAlign.flatten().tolist()[0]
       SpliceAlign.append(-1)
       for pos,elem in enumerate(SpliceAlign):
          if pos == 0:
@@ -786,26 +666,3 @@ class QPalma:
             newExons.append(pos)
 
       return newExons
-
-###########################
-# A simple command line 
-# interface
-###########################
-
-if __name__ == '__main__':
-   assert len(sys.argv) == 4
-
-   run_fn      = sys.argv[1]
-   dataset_fn  = sys.argv[2]
-   param_fn    = sys.argv[3]
-
-   run_obj = cPickle.load(open(run_fn))
-   dataset_obj = cPickle.load(open(dataset_fn))
-
-   qpalma = QPalma()
-
-   if param_fn == 'train':
-      qpalma.train(run_obj,dataset_obj)
-   else:
-      param_obj = cPickle.load(open(param_fn))
-      qpalma.predict(run_obj,dataset_obj,param_obj)