+ extended evaluation function to return positionwise deviations
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jan 2008 16:32:41 +0000 (16:32 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jan 2008 16:32:41 +0000 (16:32 +0000)
+ changed data loading to use load_genomic for sequences fetching
+ added more assertions
+ changed weights in the regularizer to account for different number of features

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7654 e1793c9e-67f9-0310-80fc-b846ff1f7b36

dyn_prog/qpalma_dp.h
qpalma/Configuration.py
qpalma/DataProc.py
qpalma/SIQP.py
qpalma/SIQP_CPX.py
qpalma/set_param_palma.py
scripts/qpalma_main.py

index 4049221..085d6af 100644 (file)
@@ -130,4 +130,3 @@ class Alignment {
 };
 
 #endif  // _QPALMA_DP_H_
-
index e7d045d..d35c6c4 100644 (file)
@@ -300,6 +300,8 @@ fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
   [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
   [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
 
+#fixedParamQ = numpy.matlib.mat(range(1300))
+#fixedParamQ = fixedParamQ.reshape((1300,1))
 
 ###########################################################
 #
@@ -307,15 +309,24 @@ fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
 #
 #
 
-C = 10000
+C = 1
 
+###############################################################################
+# 
+# CHOOSING THE MODE 
+# 
 # 'normal' means work like Palma 'using_quality_scores' means work like Palma
 # plus using sequencing quality scores
+#
+###############################################################################
 
 #mode = 'normal'
-
 mode = 'using_quality_scores'
 
+
+
+###############################################################################
+# 
 # When using quality scores our scoring function is defined as
 #
 # f: S_e x R x S -> R
@@ -346,6 +357,7 @@ mode = 'using_quality_scores'
 #
 # At ests we do not have gaps with quality scores so we look up the matchmatrix
 # instead.
+###############################################################################
 
 numDonSuppPoints     = 30
 numAccSuppPoints     = 30
@@ -375,6 +387,15 @@ totalQualSuppPoints = numQualPlifs*numQualSuppPoints
 numFeatures = numDonSuppPoints + numAccSuppPoints\
 + numLengthSuppPoints + sizeMatchmatrix[0]*sizeMatchmatrix[1] + totalQualSuppPoints 
 
+
+###############################################################################
+#
+# GENERAL SETTINGS CONCERNING THE SOLVER
+#
+#
+#
+###############################################################################
+
 iter_steps = 40
 remove_duplicate_scores = False
 print_matrix            = False
@@ -387,22 +408,29 @@ elif mode == 'using_quality_scores':
 else:
    assert False, 'Wrong operation mode specified'
 
+###############################################################################
+#
+# DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
 #
 #
+#
+###############################################################################
+
 training_begin    =    0
-training_end      = 1000
-prediction_begin  =    0
-prediction_end    = 1000
+training_end      = 1500
 
+prediction_begin  = 1500
+prediction_end    = 2200
 
-#
-#
-#
 dna_filename         = '/fml/ag-raetsch/share/projects/qpalma/solexa/allGenes.pickle'
 est_filename         = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_recent'
 tair7_seq_filename   = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'
 
-# Check for valid parameters
+###############################################################################
+#
+# SANITY CHECKS
+#
+###############################################################################
 assert numQualPlifs       >= 0
 assert numDonSuppPoints    > 1
 assert numAccSuppPoints    > 1
@@ -412,3 +440,4 @@ assert numQualSuppPoints   > 1
 assert os.path.exists(dna_filename), 'DNA data does not exist!'
 assert os.path.exists(est_filename), 'EST/Reads data does not exist!'
 assert os.path.exists(tair7_seq_filename), 'Sequence data does not exist!'
+
index 33b2995..f123dd7 100644 (file)
@@ -8,7 +8,9 @@ import Configuration as Conf
 from PyGff import *
 import io_pickle
 import scipy.io
-import pdb
+import sys
+
+from genome_utils import load_genomic
 
 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    # expt can be 'training','validation' or 'test'
@@ -16,9 +18,10 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
 
    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename
-   tair7_seq_filename = Conf.tair7_seq_filename 
 
-   tair7_seq = cPickle.load(open(tair7_seq_filename))
+   #tair7_seq_filename = Conf.tair7_seq_filename 
+   #tair7_seq = cPickle.load(open(tair7_seq_filename))
+
    allGenes  = cPickle.load(open(dna_filename))
 
    Sequences = []
@@ -29,6 +32,7 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    Qualities = []
    SplitPositions = []
 
+   # Iterate over all reads
    for line in open(est_filename):
       line = line.strip()
       chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
@@ -43,38 +47,50 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
       currentGene = allGenes[gene_id]
       seq = seq.lower()
 
-      try:
-         currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-      except:
-         continue
+      #try:
+      #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
+      #except:
+      #   continue
+
+      genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      chrom    = 'chr1'
+
+      # use matlab-style functions to access dna sequence
+      dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
+      currentSeq = dna
 
       if len(currentSeq) < (currentGene.stop - currentGene.start):
          continue
-         
-      oldSeq = currentSeq
 
+      # compare both sequences
+      #assert dna == currentSeq, pdb.set_trace()
+   
       p_start = int(p_start)
       exon_stop = int(exon_stop)
       exon_start = int(exon_start)
       p_stop = int(p_stop)
 
-      assert p_start <= exon_stop <= exon_start <= p_stop, 'Invalid Indices'
+      assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
+      assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
       assert p_stop - p_start >= 36
-
+   
       currentExons = zeros((2,2))
 
-      currentExons[0,0] = p_start-currentGene.start
-      currentExons[0,1] = exon_stop-currentGene.start
+      currentExons[0,0] = p_start
+      currentExons[0,1] = exon_stop
 
-      currentExons[1,0] = exon_start-currentGene.start
-      currentExons[1,1] = p_stop-currentGene.start
+      currentExons[1,0] = exon_start
+      currentExons[1,1] = p_stop
 
-      import copy
-      oldExons = copy.deepcopy(currentExons)
-         
+      # make indices relative to start pos of the current gene
+      currentExons -= currentGene.start
+
+      # determine cutting positions for sequences
       up_cut   = int(currentExons[0,0])
       down_cut = int(currentExons[1,1])
 
+      # if we perform testing we cut a much wider region as we want to detect
+      # how good we perform on a region
       if test:
          up_cut = up_cut-500
          if up_cut < 0:
@@ -85,10 +101,11 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
          if down_cut > len(currentSeq):
             down_cut = len(currentSeq)
 
+      # cut out part of the sequence
       currentSeq = currentSeq[up_cut:down_cut]
 
+      # ensure that index is correct
       currentExons[0,1] += 1
-      #currentExons[1,0] += 1
 
       if test:
          currentExons -= up_cut
@@ -96,7 +113,8 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
          currentExons -= currentExons[0,0]
 
       try:
-         if not (currentSeq[int(currentExons[0,1])] == 'g' and currentSeq[int(currentExons[0,1])+1] == 't'):
+         if not (currentSeq[int(currentExons[0,1])] == 'g' and\
+         currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
             continue
 
          if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
@@ -107,9 +125,65 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
 
       Sequences.append(currentSeq)
 
-      currentSize = len(Sequences[-1])
-      Acceptors.append([0]*currentSize)
-      Donors.append([0]*currentSize)
+      # now we want to use interval_query to get the predicted splice scores
+      # trained on the TAIR sequence and annotation
+
+      from Genefinding import *
+      
+      interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+      num_fields = 1
+      num_intervals = 1
+
+      # fetch acceptor scores
+      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
+      gf = Genef()
+      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
+      assert num_hits <= len(currentSeq)
+      
+      #print 'Acceptor hits: %d' % num_hits
+
+      pos = createIntArrayFromList([0]*num_hits)
+      indices = createIntArrayFromList([0]*num_hits)
+      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+      gf.getResults(pos,indices,scores)
+
+      currentAcceptors  = [-inf]*(len(currentSeq)+1)
+      
+      for i in range(num_hits):
+         position = pos[i] 
+         position -= (currentGene.start+up_cut)
+         assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+         currentAcceptors[position] = scores[i]
+
+      currentAcceptors = currentAcceptors[1:] + [-inf]
+      Acceptors.append(currentAcceptors)
+
+      del gf
+
+      # fetch donor scores
+      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
+      gf = Genef()
+      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
+      assert num_hits <= len(currentSeq)
+
+      #print 'Donor hits: %d' % num_hits
+      pos = createIntArrayFromList([0]*num_hits)
+      indices = createIntArrayFromList([0]*num_hits)
+      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+      gf.getResults(pos,indices,scores)
+
+      currentDonors     = [-inf]*(len(currentSeq))
+
+      try:
+         for i in range(1,num_hits):
+            position = pos[i] 
+            position -= (currentGene.start+up_cut)
+            currentDonors[position-1] = scores[i]
+      except:
+         pdb.set_trace()
+
+      currentDonors = [-inf] + currentDonors[:-1]
+      Donors.append(currentDonors)
 
       Exons.append(currentExons)
       Ests.append(seq)
@@ -126,7 +200,7 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
       #   Qualities = Qualities[:-1]
       #   SplitPositions = SplitPositions[:-1]
    
-   print 'found %d examples' % len(Sequences)
+   #print 'found %d examples' % len(Sequences)
 
    return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
 
@@ -162,7 +236,7 @@ def paths_load_data_pickle(expt,genome_info,PAR):
          train_data = '%s/exons_train_local.pickle' % genome_info.basedir
          data = cPickle.load(open(train_data))
 
-      print 'train_data is %s' % train_data
+      #print 'train_data is %s' % train_data
 
       Sequences = data['Train']       # dna sequences
       Acceptors = data['TrainAcc']    # acceptor scores
@@ -225,7 +299,7 @@ def paths_load_data(expt,genome_info,PAR):
          #microexon_train_data = io_pickle.load(train_data_pickle)
          data = scipy.io.loadmat(train_data)
 
-      print 'train_data is %s' % train_data
+      #print 'train_data is %s' % train_data
 
       Sequences = data['Train']       # dna sequences
       Acceptors = data['TrainAcc']    # acceptor scores
index 3b015fc..08e51f6 100644 (file)
@@ -72,6 +72,8 @@ class SIQP:
       totalQualSP = Configuration.totalQualSuppPoints
       numQPlifs = Configuration.numQualPlifs 
 
+      regC = self.numFeatures / 1.0*self.numExamples
+
       for j in range(lengthSP-1):
          self.P[j,j+1] = -1.0 
          self.P[j+1,j] = -1.0
@@ -97,6 +99,30 @@ class SIQP:
             self.P[j+1,j] = -1.0
             self.P[j,j] += 1.0
 
+      # 0.25 for each was already good
+
+      lengthGroupParam  = 0.005
+      spliceGroupParam  = 0.005
+
+      matchGroupParam   = 0.495
+      qualityGroupParam = 0.495
+
+      self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
+
+      beg = lengthSP
+      end = lengthSP+donSP+accSP
+      self.P[beg:end,beg:end]       *= spliceGroupParam
+
+      beg = lengthSP+donSP+accSP
+      end = lengthSP+donSP+accSP+mmatrixSP
+      self.P[beg:end,beg:end]       *= matchGroupParam
+
+      beg = lengthSP+donSP+accSP+mmatrixSP
+      self.P[beg:-self.numExamples,beg:-self.numExamples]   *= qualityGroupParam
+
+      self.P[0:-self.numExamples,0:-self.numExamples]       *= regC*1e-3
+
+
    def createRegularizer(self):
       #self.createUnitRegularizer()
       self.createSmoothnessRegularizer()
index b6635f6..125c47a 100644 (file)
@@ -75,6 +75,7 @@ class SIQPSolver(SIQP):
 
       CPX.setintparam(self.env, CPX_PARAM_SCRIND, CPX_ON) # print >> self.protocol, info to screen
       CPX.setintparam(self.env, CPX_PARAM_DATACHECK, CPX_ON)
+      #CPX.setintparam(self.env, CPX_PARAM_QPMETHOD, 2)
 
       # create CPLEX problem, add objective and constraints to it
       self.lp = CPX.createprob(self.env, 'test1')
@@ -106,6 +107,11 @@ class SIQPSolver(SIQP):
                self.matbeg, self.matcnt, self.matind, self.matval, 
                self.lb, self.ub) 
 
+      import pdb
+      #pdb.set_trace()
+
+      assert sum(self.P[self.numFeatures:,self.numFeatures:]) == 0.0, 'slack variables are regularized'
+
       self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval = self.cpx_matrix(self.P)
       CPX.copyquad(self.env, self.lp, self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval)
 
index f760031..47462f3 100644 (file)
@@ -42,17 +42,21 @@ def set_param_palma(param, train_with_intronlengthinformation,\
 
    print 'Setting parameters ...'
    
-   if min_intron_len == None:
-      if train_with_intronlengthinformation:
-         min_intron_len=20
-         max_intron_len=1000
-      else:
-         min_intron_len = 1
-         max_intron_len = 2
-
-   if min_intron_len != None and max_intron_len != None:
-      min_svm_score=-5
-      max_svm_score=5
+   #if min_intron_len == None:
+   #   if train_with_intronlengthinformation:
+   #      min_intron_len=20
+   #      max_intron_len=1000
+   #   else:
+   #      min_intron_len = 1
+   #      max_intron_len = 2
+   min_intron_len=4
+   max_intron_len=10000
+
+   #if min_intron_len != None and max_intron_len != None:
+   #   min_svm_score=-5
+   #   max_svm_score=5
+   min_svm_score=-3
+   max_svm_score=1
 
    h = Plf()
    d = Plf()
@@ -70,42 +74,41 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Gapfunktion
    ####################
    h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
+   #h.limits    = linspace(min_intron_len,max_intron_len,30)
    h.penalties = param[0:lengthSP].flatten().tolist()[0]
    #h.transform = '+1' 
    h.transform = '' 
    h.name      = 'h' 
-   h.max_len   =  100000 
-   h.min_len   =  4 
+   h.max_len   = max_intron_len
+   h.min_len   = min_intron_len
    h.id        = 1 
    h.use_svm   = 0 
    h.next_id   = 0 
 
-
    ####################
    # Donorfunktion
    ####################
-   d.limits    = linspace(min_svm_score,max_svm_score,30) 
+   d.limits    = linspace(min_svm_score,max_svm_score,30)
    d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
    #d.transform = '+1' 
    d.transform = '' 
    d.name      = 'd' 
-   d.max_len   = 100 
-   d.min_len   = -100 
+   d.max_len   = max_svm_score
+   d.min_len   = min_svm_score
    d.id        = 1 
    d.use_svm   = 0 
    d.next_id   = 0 
 
-
    ####################
    # Acceptorfunktion
    ####################
-   a.limits    = linspace(min_svm_score,max_svm_score,30) 
+   a.limits    = linspace(min_svm_score,max_svm_score,30)
    a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
    #a.transform = '+1' 
    a.transform = '' 
    a.name      = 'a' 
-   a.max_len   = 100  
-   a.min_len   = -100  
+   a.max_len   = max_svm_score
+   a.min_len   = min_svm_score
    a.id        = 1 
    a.use_svm   = 0 
    a.next_id   = 0 
index 12c1475..ed94ab1 100644 (file)
@@ -38,7 +38,6 @@ from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf
 
 from qpalma.tools.splicesites import getDonAccScores
-
 from qpalma.Configuration import *
 
 class para:
@@ -56,7 +55,6 @@ class QPalma:
       #ginfo_filename = 'genome_info.pickle'
       #self.genome_info = fetch_genome_info(ginfo_filename)
       #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
       #self.ARGS.train_with_splicesitescoreinformation = False
 
       # Load the whole dataset 
@@ -91,7 +89,6 @@ class QPalma:
       else:
          assert(False)
 
-
    def plog(self,string):
       self.logfh.write(string)
       self.logfh.flush()
@@ -234,6 +231,15 @@ class QPalma:
             dna_len = len(dna)
             est_len = len(est)
 
+            # check that splice site scores are at dna positions as expected by
+            # the dynamic programming component
+            for d_pos in [pos for pos,elem in enumerate(donor) if elem != -inf]:
+               assert dna[d_pos] == 'g' and (dna[d_pos+1] == 'c'\
+               or dna[d_pos+1] == 't'), pdb.set_trace()
+                
+            for a_pos in [pos for pos,elem in enumerate(acceptor) if elem != -inf]:
+               assert dna[a_pos-1] == 'a' and dna[a_pos] == 'g', pdb.set_trace()
+
             ps = h.convert2SWIG()
 
             prb = QPalmaDP.createDoubleArrayFromList(quality)
@@ -451,13 +457,24 @@ class QPalma:
       #cPickle.dump(pa,open('elegans.param','w+'))
       self.logfh.close()
 
-
-   def predict(self):
-      self.logfh = open('_qpalma_predict.log','w+')
-
+   def evaluate(self,param_filename):
       beg = Conf.prediction_begin
       end = Conf.prediction_end
 
+      # predict on training set
+      print '##### Prediction on the training set #####'
+      self.predict(param_filename,0,beg)
+      
+      # predict on test set
+      print '##### Prediction on the test set #####'
+      self.predict(param_filename,beg,end)
+   
+      pdb.set_trace()
+
+   def predict(self,param_filename,beg,end):
+
+      self.logfh = open('_qpalma_predict.log','w+')
+
       Sequences   = self.Sequences[beg:end]
       Exons       = self.Exons[beg:end]
       Ests        = self.Ests[beg:end]
@@ -480,7 +497,7 @@ class QPalma:
       print_matrix            = Conf.print_matrix 
       anzpath                 = Conf.anzpath 
 
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_6.pickle'
+      #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
       param = load_param(param_filename)
 
       # Set the parameters such as limits penalties for the Plifs
@@ -502,8 +519,10 @@ class QPalma:
       currentPhi = zeros((Conf.numFeatures,1))
       totalQualityPenalties = zeros((totalQualSP,1))
 
-      total_up_off = []
-      total_down_off = []
+      exon1Begin = []
+      exon1End = []
+      exon2Begin = []
+      exon2End = []
       allWrongExons = []
 
       for exampleIdx in range(numExamples):
@@ -604,27 +623,20 @@ class QPalma:
          newWeightMatch = zeros((mm_len,1))
          newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
 
-         #print 'newSpliceAlign'
          for i in range(dna_len):
             newSpliceAlign[i] = c_SpliceAlign[i]
-         #   print '%f' % (spliceAlign[i])
 
-         #print 'newEstAlign'
          for i in range(est_len):
             newEstAlign[i] = c_EstAlign[i]
-         #   print '%f' % (spliceAlign[i])
 
-         #print 'weightMatch'
          for i in range(mm_len):
             newWeightMatch[i] = c_WeightMatch[i]
-         #   print '%f' % (weightMatch[i])
 
          newDPScores = c_DPScores[0]
 
          for i in range(Conf.totalQualSuppPoints):
             newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
 
-         #print "Calling destructors"
          del c_SpliceAlign
          del c_EstAlign
          del c_WeightMatch
@@ -634,13 +646,6 @@ class QPalma:
 
          newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
          newWeightMatch = newWeightMatch.reshape(1,mm_len)
-         # Calculate weights of the respective alignments Note that we are
-         # calculating n-best alignments without hamming loss, so we
-         # have to keep track which of the n-best alignments correspond to
-         # the true one in order not to incorporate a true alignment in the
-         # constraints. To keep track of the true and false alignments we
-         # define an array true_map with a boolean indicating the
-         # equivalence to the true alignment for each decoded alignment.
          true_map = [0]*2
          true_map[0] = 1
          pathNr = 0
@@ -665,35 +670,47 @@ class QPalma:
          if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
             true_map[pathNr+1] = 1
 
-         up_off,down_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
-         #print up_off,down_off 
+         e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
 
-         if up_off > -1:
-            total_up_off.append(up_off)
-            total_down_off.append(down_off)
+         if e1_b_off != None:
+            exon1Begin.append(e1_b_off)
+            exon1End.append(e1_e_off)
+            exon2Begin.append(e2_b_off)
+            exon2End.append(e2_e_off)
          else:
             allWrongExons.append((newExons,exons))
 
-      total_up = 0
-      total_down = 0 
-      for idx in range(len(total_up_off)):
-         total_up    += total_up_off[idx]
-         total_down  += total_down_off[idx]
-         
-      total_up /= len(total_up_off)
-      total_down /= len(total_down_off)
+      e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
+      e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
 
-      print 'Mean up_off is %f' % total_up
-      print 'Mean down_off is %f' % total_down
-      print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
-      print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+      print 'Total num. of Examples: %d' % numExamples
+      print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
+      print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
+      print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
 
       print 'Prediction completed'
+      self.logfh.close()
 
-      pdb.set_trace()
+   def evaluatePositions(self,eBegin,eEnd):
+      eBegin_pos = [elem for elem in eBegin if elem == 0]
+      eBegin_neg = [elem for elem in eBegin if elem != 0]
+      eEnd_pos = [elem for elem in eEnd if elem == 0]
+      eEnd_neg = [elem for elem in eEnd if elem != 0]
+
+      mean_eBegin_neg = 0
+      for idx in range(len(eBegin_neg)):
+         mean_eBegin_neg += eBegin_neg[idx]
+         
+      mean_eBegin_neg /= 1.0*len(eBegin_neg)
+
+      mean_eEnd_neg = 0
+      for idx in range(len(eEnd_neg)):
+         mean_eEnd_neg += eEnd_neg[idx]
+
+      mean_eEnd_neg /= 1.0*len(eEnd_neg)
+
+      return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
 
-      self.logfh.close()
 
 def fetch_genome_info(ginfo_filename):
    if not os.path.exists(ginfo_filename):
@@ -762,38 +779,92 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
          oldElem = SpliceAlign[pos-1]
 
       if oldElem != 0 and elem == 0: # start of exon
-         newExons.append(pos-1)
+         newExons.append(pos)
 
       if oldElem == 0 and elem != 0: # end of exon
          newExons.append(pos)
 
-   up_off   = -1
-   down_off = -1
+   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
 
-   if len(newExons) != 4:
-      acc = 0.0
-   else:
+   if len(newExons) == 4:
       e1_begin,e1_end = newExons[0],newExons[1]
       e2_begin,e2_end = newExons[2],newExons[3]
 
-      up_off   = int(math.fabs(e1_end - exons[0,1]))
-      down_off = int(math.fabs(e2_begin - exons[1,0]))
+   #elif len(newExons) > 4 :
+   #   e1_begin,e1_end = newExons[0],newExons[1]
+   #   e2_begin,e2_end = newExons[-2],newExons[-1]
+   else:
+      return None,None,None,None,newExons
+
+   e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
+   e1_e_off = int(math.fabs(e1_end - exons[0,1]))
+
+   e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
+   e2_e_off = int(math.fabs(e2_end - exons[1,1]))
+
+   return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+
+def calcStat(Acceptor,Donor,Exons):
+   maxAcc = -100
+   minAcc = 100
+   maxDon = -100
+   minDon = 100
+
+   acc_vec_pos = []
+   acc_vec_neg = []
+   don_vec_pos = []
+   don_vec_neg = []
+
+   for jdx in range(len(Acceptors)):
+      currentExons = Exons[jdx]
+      currentAcceptor = Acceptors[jdx]
+      currentAcceptor = currentAcceptor[1:]
+      currentAcceptor.append(-inf)
+      currentDonor = Donors[jdx]
+
+      for idx,elem in enumerate(currentAcceptor):
+         if idx == (int(currentExons[1,0])-1): # acceptor site
+            acc_vec_pos.append(elem)
+         else:
+            acc_vec_neg.append(elem)
+
+      for idx,elem in enumerate(currentDonor):
+         if idx == (int(currentExons[0,1])): # donor site
+            don_vec_pos.append(elem)
+         else:
+            don_vec_neg.append(elem)
 
-   return up_off,down_off,newExons
+   acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
+   acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
+   don_pos = [elem for elem in don_vec_pos if elem != -inf]
+   don_neg = [elem for elem in don_vec_neg if elem != -inf]
 
+   pdb.set_trace()
+
+   for idx in range(len(Sequences)):
+      acc = [elem for elem in Acceptors[idx] if elem != -inf]
+      maxAcc = max(max(acc),maxAcc)
+      minAcc = min(min(acc),minAcc)
+
+      don = [elem for elem in Donors[idx] if elem != -inf]
+      maxDon = max(max(don),maxDon)
+      minDon = min(min(don),minDon)
 
 if __name__ == '__main__':
 
    qpalma = QPalma()
-   if len(sys.argv) != 2:
-      print 'You have to choose between training or prediction mode:'
-      print 'python qpalma. py (train|predict)' 
 
-   mode = sys.argv[1]
-   assert mode in ['train','predict']
-
-   if mode == 'train':
+   if len(sys.argv) == 2:
+      mode = sys.argv[1]
+      assert mode == 'train'
       qpalma.train()
-      
-   if mode == 'predict':
-      qpalma.predict()
+
+   elif len(sys.argv) == 3:
+      mode = sys.argv[1]
+      param_filename = sys.argv[2]
+      assert mode == 'predict' 
+      assert os.path.exists(param_filename)
+      qpalma.evaluate(param_filename)
+   else:
+      print 'You have to choose between training or prediction mode:'
+      print 'python qpalma. py (train|predict) <param_file>'