merged training/prediction scripts
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 16:48:54 +0000 (16:48 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 16:48:54 +0000 (16:48 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7584 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/numpyUtils.py
scripts/qpalma_predict.py
scripts/qpalma_train.py

index cba6041..654eafd 100644 (file)
@@ -387,6 +387,14 @@ elif mode == 'using_quality_scores':
 else:
    assert False, 'Wrong operation mode specified'
 
+#
+#
+training_begin    =    0
+training_end      = 1000
+prediction_begin  =    0
+prediction_end    = 1000
+
+
 #
 #
 #
index 14ece77..175b385 100644 (file)
@@ -3,6 +3,3 @@
 
 import numpy as ny
 import numpy.matlib as nyml
-
-
-def 
index dd6ecf8..67bfd63 100644 (file)
@@ -51,8 +51,8 @@ class QPalma:
    
    def __init__(self):
       self.ARGS = Param()
-
       self.logfh = open('qpalma.log','w+')
+
       #gen_file= '%s/genome.config' % self.ARGS.basedir
       #ginfo_filename = 'genome_info.pickle'
       #self.genome_info = fetch_genome_info(ginfo_filename)
@@ -64,19 +64,17 @@ class QPalma:
       self.logfh.write(string)
       self.logfh.flush()
 
-   def run(self):
+   def predict(self):
       # Load the whole dataset 
-      if Conf.mode == 'normal':
+      if Configuration.mode == 'normal':
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          use_quality_scores = False
 
-      elif Conf.mode == 'using_quality_scores':
+      elif Confguration.mode == 'using_quality_scores':
          Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
          paths_load_data_solexa('training',None,self.ARGS,True)
 
-         #pdb.set_trace()
-
          begin = 0
          end = 1000
          Sequences   = Sequences[begin:end]
@@ -92,10 +90,7 @@ class QPalma:
 
 
          #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         #SplitPos = [1]*len(Qualities)
-
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #pdb.set_trace()
          #Qualities = []
          #for i in range(len(Ests)):
          #   Qualities.append([40]*len(Ests[i]))
index 1daf44c..9392639 100644 (file)
@@ -3,8 +3,15 @@
 
 ###########################################################
 #
+# The QPalma project aims at extending the Palma project 
+# to be able to use Solexa reads toegether with their 
+# quality scores.
+# 
+# This file represents the conversion of the main matlab 
+# training loop for Palma to python.
+# 
+# Author: Fabio De Bona
 # 
-#
 ###########################################################
 
 import sys
@@ -19,30 +26,24 @@ from numpy.linalg import norm
 import QPalmaDP
 
 import qpalma
+import qpalma.Configuration
+
 from qpalma.SIQP_CPX import SIQPSolver
 from qpalma.DataProc import *
-
 from qpalma.generateEvaluationData import *
 from qpalma.computeSpliceWeights import *
 from qpalma.set_param_palma import *
 from qpalma.computeSpliceAlignWithQuality import *
 from qpalma.penalty_lookup_new import *
 from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
 from qpalma.export_param import *
-
-import qpalma.Configuration
+from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf
-from qpalma.Helpers import *
 
 from qpalma.tools.splicesites import getDonAccScores
 
-def getQualityFeatureCounts(qualityPlifs):
-   weightQuality = qualityPlifs[0].penalties
-   for currentPlif in qualityPlifs[1:]:
-      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
-   return weightQuality 
+class para:
+   pass
 
 class QPalma:
    """
@@ -60,45 +61,28 @@ class QPalma:
 
       #self.ARGS.train_with_splicesitescoreinformation = False
 
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-   def run(self):
       # Load the whole dataset 
       if Configuration.mode == 'normal':
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         self.use_quality_scores = False
 
-         Donors, Acceptors = getDonAccScores(Sequences)
-
-         use_quality_scores = False
       elif Configuration.mode == 'using_quality_scores':
 
-
          filename = 'real_dset_%s'% 'recent'
          if True:# not os.path.exists(filename):
             Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
 
             end = 1000
-            Sequences   = Sequences[:end]
-            Exons       = Exons[:end]
-            Ests        = Ests[:end]
-            Qualities   = Qualities[:end]
-            Acceptors   = Acceptors[:end]
-            Donors      = Donors[:end]
-
-            pdb.set_trace()
-
-            Donors, Acceptors = getDonAccScores(Sequences)
+            self.Sequences   = Sequences[:end]
+            self.Exons       = Exons[:end]
+            self.Ests        = Ests[:end]
+            self.Qualities   = Qualities[:end]
+            self.Acceptors   = Acceptors[:end]
+            self.Donors      = Donors[:end]
+
+            self.Donors, self.Acceptors = getDonAccScores(Sequences)
             #dset = Dataset()
-            #dset.Sequences = Sequences
-            #dset.Acceptor  = Acceptors
-            #dset.Donors    = Donors
-            #dset.Exons     = Exons
-            #dset.Ests      = Ests
-            #dset.Qualities = Qualities
-            #cPickle.dump(dset,open(filename,'w+'))
          else:
             dset = cPickle.load(open(filename))
             Sequences = dset.Sequences
@@ -113,7 +97,7 @@ class QPalma:
          #Qualities = []
          #for i in range(len(Ests)):
          #   Qualities.append([40]*len(Ests[i]))
-         use_quality_scores = True
+         self.use_quality_scores = True
       else:
          assert(False)
 
@@ -125,6 +109,21 @@ class QPalma:
       self.plog('Number of training examples: %d\n'% N)
       print 'Number of features: %d\n'% Configuration.numFeatures
 
+   def plog(self,string):
+      self.logfh.write(string)
+      self.logfh.flush()
+
+   def train(self):
+      beg = Configuration.training_begin
+      end = Configuration.training_end
+
+      Sequences   = self.Sequences[beg:end]
+      Exons       = self.Exons[beg:end]
+      Ests        = self.Ests[beg:end]
+      Qualities   = self.Qualities[beg:end]
+      Acceptors   = self.Acceptors[beg:end]
+      Donors      = self.Donors[beg:end]
+
       iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
       remove_duplicate_scores = Configuration.remove_duplicate_scores 
       print_matrix            = Configuration.print_matrix 
@@ -150,9 +149,9 @@ class QPalma:
          solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*
+      num_path = [anzpath]*self.numExamples
       # stores the gap for each example
-      gap      = [0.0]*N
+      gap      = [0.0]*self.numExamples
 
       #############################################################################################
       # Training
@@ -195,10 +194,6 @@ class QPalma:
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
-            #if exons[-1,1] > len(dna):
-            #   continue
-
-            #pdb.set_trace()
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
             trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
             
@@ -207,7 +202,6 @@ class QPalma:
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
             trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
-
             currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
             currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
             currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
@@ -264,7 +258,7 @@ class QPalma:
             acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
             # Create the alignment object representing the interface to the C/C++ code.
-            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, use_quality_scores)
+            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
 
             c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
             #print 'Calling myalign...'
@@ -315,7 +309,7 @@ class QPalma:
                newDPScores[i] = c_DPScores[i]
 
 
-            if use_quality_scores:
+            if self.use_quality_scores:
                for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
                   newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
 
@@ -457,6 +451,235 @@ class QPalma:
       #cPickle.dump(pa,open('elegans.param','w+'))
       self.logfh.close()
 
+
+
+   def predict(self):
+      beg = Configuration.prediction_begin
+      end = Configuration.prediction_end
+
+      Sequences   = self.Sequences[beg:end]
+      Exons       = self.Exons[beg:end]
+      Ests        = self.Ests[beg:end]
+      Qualities   = self.Qualities[beg:end]
+      Acceptors   = self.Acceptors[beg:end]
+      Donors      = self.Donors[beg:end]
+
+      remove_duplicate_scores = Conf.remove_duplicate_scores 
+      print_matrix            = Conf.print_matrix 
+      anzpath                 = Conf.anzpath 
+
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
+      param = load_param(param_filename)
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      #############################################################################################
+      # Prediction
+      #############################################################################################
+      self.plog('Starting prediction...\n')
+
+      donSP       = Conf.numDonSuppPoints
+      accSP       = Conf.numAccSuppPoints
+      lengthSP    = Conf.numLengthSuppPoints
+      mmatrixSP   = Conf.sizeMatchmatrix[0]\
+      *Conf.sizeMatchmatrix[1]
+      numq        = Conf.numQualSuppPoints
+      totalQualSP = Conf.totalQualSuppPoints
+
+      currentPhi = zeros((Conf.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      total_up_off = []
+      total_down_off = []
+
+      for exampleIdx in range(self.numExamples):
+         dna = Sequences[exampleIdx] 
+         est = Ests[exampleIdx] 
+         currentSplitPos = SplitPos[exampleIdx]
+
+         if Conf.mode == 'normal':
+            quality = [40]*len(est)
+
+         if Conf.mode == 'using_quality_scores':
+            quality = Qualities[exampleIdx]
+
+         exons = Exons[exampleIdx] 
+         # NoiseMatrix = Noises[exampleIdx] 
+         don_supp = Donors[exampleIdx] 
+         acc_supp = Acceptors[exampleIdx] 
+
+         if exons[-1,1] > len(dna):
+            continue
+
+         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+         
+         # Calculate the weights
+         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+         if Conf.mode == 'using_quality_scores':
+            totalQualityPenalties = param[-totalQualSP:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+         # Calculate w'phi(x,y) the total score of the alignment
+         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+         # The allWeights vector is supposed to store the weight parameter
+         # of the true alignment as well as the weight parameters of the
+         # 1 other alignments
+         allWeights = zeros((Conf.numFeatures,1+1))
+         allWeights[:,0] = trueWeight[:,0]
+
+         AlignmentScores = [0.0]*(1+1)
+         AlignmentScores[0] = trueAlignmentScore
+
+         ################## Calculate wrong alignment(s) ######################
+
+         # Compute donor, acceptor with penalty_lookup_new
+         # returns two double lists
+         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+         #myalign wants the acceptor site on the g of the ag
+         acceptor = acceptor[1:]
+         acceptor.append(-inf)
+
+         dna = str(dna)
+         est = str(est)
+         dna_len = len(dna)
+         est_len = len(est)
+
+         ps = h.convert2SWIG()
+
+         prb = QPalmaDP.createDoubleArrayFromList(quality)
+         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+         d_len = len(donor)
+         donor = QPalmaDP.createDoubleArrayFromList(donor)
+         a_len = len(acceptor)
+         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+         # Create the alignment object representing the interface to the C/C++ code.
+         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+         currentAlignment.myalign(1, dna, dna_len,\
+          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+          print_matrix)
+
+         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+         newSpliceAlign = zeros((dna_len,1))
+         newEstAlign    = zeros((est_len,1))
+         newWeightMatch = zeros((mm_len,1))
+         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+         #print 'newSpliceAlign'
+         for i in range(dna_len):
+            newSpliceAlign[i] = c_SpliceAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'newEstAlign'
+         for i in range(est_len):
+            newEstAlign[i] = c_EstAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'weightMatch'
+         for i in range(mm_len):
+            newWeightMatch[i] = c_WeightMatch[i]
+         #   print '%f' % (weightMatch[i])
+
+         newDPScores = c_DPScores[0]
+
+         for i in range(Conf.totalQualSuppPoints):
+            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+         #print "Calling destructors"
+         del c_SpliceAlign
+         del c_EstAlign
+         del c_WeightMatch
+         del c_DPScores
+         del c_qualityPlifsFeatures
+         del currentAlignment
+
+         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+         newWeightMatch = newWeightMatch.reshape(1,mm_len)
+         # Calculate weights of the respective alignments Note that we are
+         # calculating n-best alignments without hamming loss, so we
+         # have to keep track which of the n-best alignments correspond to
+         # the true one in order not to incorporate a true alignment in the
+         # constraints. To keep track of the true and false alignments we
+         # define an array true_map with a boolean indicating the
+         # equivalence to the true alignment for each decoded alignment.
+         true_map = [0]*2
+         true_map[0] = 1
+         pathNr = 0
+
+         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+         for qidx in range(Conf.totalQualSuppPoints):
+            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+         # Gewichte in restliche Zeilen der Matrix speichern
+         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+         allWeights[:,pathNr+1] = wp
+
+         hpen = mat(h.penalties).reshape(len(h.penalties),1)
+         dpen = mat(d.penalties).reshape(len(d.penalties),1)
+         apen = mat(a.penalties).reshape(len(a.penalties),1)
+         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+         # if the pathNr-best alignment is very close to the true alignment consider it as true
+         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+            true_map[pathNr+1] = 1
+
+         up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+         #print up_off,down_off 
+
+         if up_off > -1:
+            total_up_off.append(up_off)
+            total_down_off.append(down_off)
+
+      total_up = 0
+      total_down = 0 
+      for idx in range(len(total_up_off)):
+         total_up    += total_up_off[idx]
+         total_down  += total_down_off[idx]
+         
+      total_up /= len(total_up_off)
+      total_down /= len(total_down_off)
+
+      print 'Mean up_off is %f' % total_up
+      print 'Mean down_off is %f' % total_down
+      print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
+      print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+
+      print 'Prediction completed'
+      self.logfh.close()
+
 def fetch_genome_info(ginfo_filename):
    if not os.path.exists(ginfo_filename):
       cmd = ['']*4
@@ -477,6 +700,87 @@ def fetch_genome_info(ginfo_filename):
    else:
       return cPickle.load(open(ginfo_filename))
 
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+   donSP       = Conf.numDonSuppPoints
+   accSP       = Conf.numAccSuppPoints
+   lengthSP    = Conf.numLengthSuppPoints
+   mmatrixSP   = Conf.sizeMatchmatrix[0]\
+   *Conf.sizeMatchmatrix[1]
+
+
+   param = zeros((Conf.numFeatures,1))
+   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+   for idx in range(len(qualityPlifs)):
+      currentPlif       = qualityPlifs[idx]
+      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+   return param
+
+def load_param(filename):
+   param = None 
+   #try:
+   #   para = cPickle.load(open(filename))
+   #except:
+   #   print 'Error: Could not open parameter file!'
+   #   sys.exit(1)
+   #
+   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+   param = cPickle.load(open(filename))
+   return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
+   newExons = []
+   oldElem = -1
+   SpliceAlign = SpliceAlign.flatten().tolist()[0]
+   SpliceAlign.append(-1)
+   for pos,elem in enumerate(SpliceAlign):
+      if pos == 0:
+         oldElem = -1
+      else:
+         oldElem = SpliceAlign[pos-1]
+
+      if oldElem != 0 and elem == 0: # start of exon
+         newExons.append(pos-1)
+
+      if oldElem == 0 and elem != 0: # end of exon
+         newExons.append(pos)
+
+   up_off   = -1
+   down_off = -1
+
+   if len(newExons) != 4:
+      acc = 0.0
+   else:
+      e1_begin,e1_end = newExons[0],newExons[1]
+      e2_begin,e2_end = newExons[2],newExons[3]
+
+      up_off   = int(math.fabs(e1_end - exons[0,1]))
+      down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+   #pdb.set_trace()
+
+   return up_off,down_off
+
 if __name__ == '__main__':
+
    qpalma = QPalma()
-   qpalma.run()
+   if len(sys.argv) != 2:
+      print 'You have to choose between training or prediction mode:'
+      print 'python qpalma. py (train|predict)' 
+
+   mode = sys.argv[1]
+   assert mode in ['train','predict']
+
+   if mode == 'train':
+      qpalma.train()
+      
+   if mode == 'predict':
+      qpalma.predict()
+