+ added feature calculation for the labels
[qpalma.git] / python / qpalma.py
index 4b5af6a..4efc708 100644 (file)
@@ -3,7 +3,7 @@
 
 ###########################################################
 #
-# This file contains the 
+# 
 #
 ###########################################################
 
@@ -11,6 +11,7 @@ import sys
 import subprocess
 import scipy.io
 import pdb
+import os.path
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
@@ -20,9 +21,12 @@ import QPalmaDP
 from SIQP_CPX import SIQPSolver
 
 from paths_load_data import *
+from paths_load_data_pickle import *
+
 from computeSpliceWeights import *
 from set_param_palma import *
 from computeSpliceAlign import *
+from computeSpliceAlignWithQuality import *
 from penalty_lookup_new import *
 from compute_donacc import *
 from TrainingParam import Param
@@ -30,6 +34,53 @@ from export_param import *
 
 import Configuration
 
+from Plif import Plf
+
+def getQualityFeatureCounts(qualityPlifs):
+   weightQuality = qualityPlifs[0].penalties
+   for currentPlif in qualityPlifs[1:]:
+      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
+
+   return weightQuality 
+
+
+def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
+
+   min_intron_len=20
+   max_intron_len=1000
+   min_svm_score=-5
+   max_svm_score=5
+
+   qualityPlifs = [None]*numPlifs
+
+   for idx in range(numPlifs):
+
+      curPlif = Plf()
+      curPlif.limits    = linspace(min_svm_score,max_svm_score,numSuppPoints) 
+      curPlif.penalties = [0]*numSuppPoints
+      curPlif.transform = '' 
+      curPlif.name      = '' 
+      curPlif.max_len   = 100 
+      curPlif.min_len   = -100 
+      curPlif.id        = 1 
+      curPlif.use_svm   = 0 
+      curPlif.next_id   = 0 
+
+      if idx == 0:
+         curPlif.penalties[0] = 11
+         curPlif.penalties[1] = 22
+         curPlif.penalties[2] = 33
+
+      if idx == 1:
+         curPlif.penalties[0] = 99
+         curPlif.penalties[1] = 100
+         curPlif.penalties[2] = 101
+
+      curPlif = curPlif.convert2SWIG()
+      qualityPlifs[idx] = curPlif
+
+   return qualityPlifs
+
 class QPalma:
    """
    A training method for the QPalma project
@@ -39,38 +90,13 @@ class QPalma:
       self.ARGS = Param()
 
       self.logfh = open('qpalma.log','w+')
-
       gen_file= '%s/genome.config' % self.ARGS.basedir
 
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      self.genome_info = ginfo['genome_info']
+      ginfo_filename = 'genome_info.pickle'
+      self.genome_info = fetch_genome_info(ginfo_filename)
 
       self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
 
-      self.C=1.0
-
-      self.numDonSuppPoints     = 30
-      self.numAccSuppPoints     = 30
-      self.numLengthSuppPoints  = 30 
-      self.sizeMMatrix          = 36
-      self.numQualSuppPoints    = 16*0
-
-      self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints + self.numLengthSuppPoints\
-                  + self.sizeMMatrix + self.numQualSuppPoints
-
-      self.plog('Initializing problem...\n')
-
 
    def plog(self,string):
       self.logfh.write(string)
@@ -78,10 +104,8 @@ class QPalma:
       
    def run(self):
       # Load the whole dataset 
-      Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
-
-      #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
-
+      #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
+      Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
 
       # number of training instances
       N = len(Sequences) 
@@ -90,7 +114,8 @@ class QPalma:
       and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
       self.plog('Number of training examples: %d\n'% N)
 
-      iteration_steps = 10 ; #upper bound on iteration steps
+      #iteration_steps = 200 ; #upper bound on iteration steps
+      iteration_steps = 2 ; #upper bound on iteration steps
 
       remove_duplicate_scores = False
       print_matrix = False
@@ -103,7 +128,6 @@ class QPalma:
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
-
       # delete splicesite-score-information
       if not self.ARGS.train_with_splicesitescoreinformation:
          for i in range(len(Acceptors)):
@@ -114,51 +138,65 @@ class QPalma:
 
       # Initialize solver 
       if not __debug__:
-         solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
+         self.plog('Initializing problem...\n')
+         solver = SIQPSolver(Configuration.numFeatures,Configuration.numExamples,Configuration.C,self.logfh)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*N 
       # stores the gap for each example
       gap      = [0.0]*N
 
-      qualityMatrix = zeros((self.numQualSuppPoints,1))
-
       #############################################################################################
       # Training
       #############################################################################################
       self.plog('Starting training...\n')
 
-      iteration_nr = 1
+      donSP       = Configuration.numDonSuppPoints
+      accSP       = Configuration.numAccSuppPoints
+      lengthSP    = Configuration.numLengthSuppPoints
+      mmatrixSP   = Configuration.sizeMMatrix
+      totalQualSP = Configuration.totalQualSuppPoints
+
+      currentPhi = zeros((Configuration.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      #qualityMatrix = zeros((Configuration.numPlifSuppPoints*Configuration.numQualPlifs,1))
+
+      iteration_nr = 0
 
       while True:
          if iteration_nr == iteration_steps:
             break
 
          for exampleIdx in range(self.numExamples):
-            if exampleIdx% 1000 == 0:
+            if (exampleIdx%10) == 0:
                print 'Current example nr %d' % exampleIdx
 
             dna = Sequences[exampleIdx] 
             est = Ests[exampleIdx] 
 
+            quality = [0.0]*len(est)
+
             exons = Exons[exampleIdx] 
             # NoiseMatrix = Noises[exampleIdx] 
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+            # trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+            trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
 
-            currentPhi = zeros((self.numFeatures,1))
-            currentPhi[0:30]     = mat(d.penalties[:]).reshape(30,1)
-            currentPhi[30:60]    = mat(a.penalties[:]).reshape(30,1)
-            currentPhi[60:90]    = mat(h.penalties[:]).reshape(30,1)
-            currentPhi[90:126]   = mmatrix[:]
-            currentPhi[126:]   = qualityMatrix[:]
+            trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                       = totalQualityPenalties[:]
 
             # Calculate w'phi(x,y) the total score of the alignment
             trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
@@ -166,7 +204,7 @@ class QPalma:
             # The allWeights vector is supposed to store the weight parameter
             # of the true alignment as well as the weight parameters of the
             # num_path[exampleIdx] other alignments
-            allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
+            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
             allWeights[:,0] = trueWeight[:,0]
 
             AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
@@ -188,6 +226,9 @@ class QPalma:
             est_len = len(est)
             ps = h.convert2SWIG()
 
+            prb = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
             matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
             mm_len = 36
 
@@ -197,45 +238,73 @@ class QPalma:
             acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
             currentAlignment = QPalmaDP.Alignment()
-            qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
-            currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
+            #qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
+            #currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
+
+            qualityPlifs = initializeQualityScoringFunctions(Configuration.numQualPlifs,Configuration.numQualSuppPoints)
+
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(qualityPlifs)
 
             #print 'PYTHON: Calling myalign...'
             # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
             currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
-            est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
-            acceptor, a_len, remove_duplicate_scores, print_matrix)
+             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores, print_matrix)
             #print 'PYTHON: After myalign call...'
 
-            newSpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
-            newEstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
-            newWeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
-            newAlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+            c_AlignmentScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+            emptyPlif = Plf(30)
+            emptyPlif = emptyPlif.convert2SWIG()
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
 
-            currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
+            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+            c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
 
-            spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
-            weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+            newEstAlign = zeros((est_len*num_path[exampleIdx],1))
+            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newQualityPlifs = [None]*num_path[exampleIdx]*Configuration.numQualPlifs
+
+            #print 'newSpliceAlign'
+            for i in range(dna_len*num_path[exampleIdx]):
+               newSpliceAlign[i] = c_SpliceAlign[i]
+            #   print '%f' % (spliceAlign[i])
 
-            #print 'spliceAlign'
-            #for i in range(dna_len*num_path[exampleIdx]):
-            #   spliceAlign[i] = newSpliceAlign[i]
+            #print 'newEstAlign'
+            for i in range(est_len*num_path[exampleIdx]):
+               newEstAlign[i] = c_EstAlign[i]
             #   print '%f' % (spliceAlign[i])
 
             #print 'weightMatch'
-            #for i in range(mm_len*num_path[exampleIdx]):
-            #   weightMatch[i] = newWeightMatch[i]
+            for i in range(mm_len*num_path[exampleIdx]):
+               newWeightMatch[i] = c_WeightMatch[i]
             #   print '%f' % (weightMatch[i])
 
+            #print 'AlignmentScores'
             for i in range(num_path[exampleIdx]):
-               AlignmentScores[i+1] = newAlignmentScores[i]
+               AlignmentScores[i+1] = c_AlignmentScores[i]
 
-            #print AlignmentScores
-               
-            spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
-            weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
+            #print 'newQualityPlifs'
+            for i in range(num_path[exampleIdx]*Configuration.numQualPlifs):
+               newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
+
+            #print "Calling destructors"
+
+            del c_SpliceAlign
+            del c_EstAlign
+            del c_WeightMatch
+            del c_AlignmentScores
+            del c_qualityPlifs
+            del currentAlignment
+
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
             # Calculate weights of the respective alignments Note that we are
-            # calculating n-best alignments without any hamming loss, so we
+            # calculating n-best alignments without hamming loss, so we
             # have to keep track which of the n-best alignments correspond to
             # the true one in order not to incorporate a true alignment in the
             # constraints. To keep track of the true and false alignments we
@@ -246,25 +315,36 @@ class QPalma:
             path_loss = [0]*(num_path[exampleIdx]+1)
 
             for pathNr in range(num_path[exampleIdx]):
-               #dna_numbers = dnaest{1,pathNr}
-               #est_numbers = dnaest{2,pathNr}
 
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, spliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+
+               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+               qidx = 0
+
+               for currentPlif in newQualityPlifs[Configuration.numQualPlifs*pathNr:Configuration.numQualPlifs*(pathNr+1)]:
+                  for tidx in range(currentPlif.len):
+                     #elem = currentPlif.penalties[tidx]
+                     elem = QPalmaDP.doubleFArray_getitem(currentPlif.penalties, tidx)
+                     #print '%f ' % elem, 
+                     print qidx
+                     decodedQualityFeatures[qidx] = elem
+                     qidx += 1
+                  #print
 
                # sum up positionwise loss between alignments
-               for alignPosIdx in range(len(spliceAlign[pathNr,:])):
-                  if spliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+               for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
+                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
                      path_loss[pathNr+1] += 1
 
                # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
+               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
                allWeights[:,pathNr+1] = wp
 
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
 
-               features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
+               features = numpy.vstack([hpen , dpen , apen , mmatrix[:], zeros((Configuration.totalQualSuppPoints,1))])
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                # Check wether scalar product + loss equals viterbi score
@@ -306,20 +386,51 @@ class QPalma:
                      for elem in self.slacks:
                         sum_xis +=  elem
 
-                  for i in range(len(param)):
-                     param[i] = w[i]
+                     for i in range(len(param)):
+                        param[i] = w[i]
+
+                     [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
 
-                  [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-                  
-            if exampleIdx==10:
-               break
+               #
+               # end of one example processing 
+               #
+            #if exampleIdx == 100:
+            #   break
+         
+         #break
 
+         #
+         # end of one iteration through all examples
+         #
          iteration_nr += 1
 
-      export_param('test_params',h,d,a,mmatrix)
+      #
+      # end of optimization 
+      #  
+      export_param('elegans.param',h,d,a,mmatrix)
       self.logfh.close()
       print 'Training completed'
 
+def fetch_genome_info(ginfo_filename):
+   if not os.path.exists(ginfo_filename):
+      cmd = ['']*4
+      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+      cmd[3] = 'save genome_info.mat genome_info'  
+      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      assert err == '', 'An error occured!\n%s'%err
+
+      ginfo = scipy.io.loadmat('genome_info.mat')
+      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+      return ginfo['genome_info']
+
+   else:
+      return cPickle.load(open(ginfo_filename))
+
 if __name__ == '__main__':
    qpalma = QPalma()
    qpalma.run()