+ added feature calculation for the labels
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 8 Jan 2008 11:23:31 +0000 (11:23 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 8 Jan 2008 11:23:31 +0000 (11:23 +0000)
+ moved all parameter to central config file
TODO
- clean up code, eliminate some redundancies
- add more assertions

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7254 e1793c9e-67f9-0310-80fc-b846ff1f7b36

python/Configuration.py
python/Plif.py
python/computeSpliceAlignWithQuality.py
python/qpalma.py
python/set_param_palma.py

index 187bc9e..8ccba2b 100644 (file)
@@ -30,3 +30,48 @@ fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
        [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
        [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
 
+###########################################################
+#
+# The parameters for the QPalma algorithm
+#
+#
+
+C = 1.0
+
+# 'normal' means work like Palma 'using_quality_scores' means work like Palma
+# plus using sequencing quality scores
+
+#mode = 'normal'
+mode = 'using_quality_scores'
+
+# Here we specify the total number of parameters.
+
+
+# When using quality scores our scoring function is defined as
+#
+# f: S x R x S -> R
+# 
+# as opposed to a usage without quality scores when we only have
+# 
+# f: S x S -> R 
+#
+numDonSuppPoints     = 30
+numAccSuppPoints     = 30
+numLengthSuppPoints  = 30 
+numQualSuppPoints    = 10
+
+if mode == 'normal':
+   sizeMMatrix          = 36
+   numQualPlifs = 0
+elif mode == 'using_quality_scores':
+   sizeMMatrix          = 36 
+   numQualPlifs = 5*5
+else:
+   assert False, 'Wrong operation mode specified'
+
+totalQualSuppPoints = numQualPlifs*numQualSuppPoints
+
+numFeatures = numDonSuppPoints + numAccSuppPoints\
++ numLengthSuppPoints + sizeMMatrix + totalQualSuppPoints 
+
+
index 433c3d4..5acf385 100644 (file)
@@ -42,3 +42,43 @@ class Plf:
 
       return ps
 
+
+def base_coord(b1,b2):
+   b1 = b1.lower()
+   b2 = b2.lower()
+   assert b1 in ['a','c','g','t','n']
+   assert b2 in ['a','c','g','t','n']
+   cb = {'a':0, 'c':1, 'g':2, 't':3, 'n':4}
+
+   b1 = cb[b1]
+   b2 = cb[b2]
+
+   return b1*5+b2
+
+def linspace(a,b,n):
+   intervalLength = b-a
+   stepSize = 1.0*intervalLength / (n-1)
+   
+   interval = [0]*n
+   interval[0] = a
+   interval[-1] = b
+   for i in range(1,n-1):
+      interval[i] = a+(i*stepSize)
+
+   return interval
+
+def logspace(a,b,n):
+   interval = [0]*n
+   begin = 10.0**a
+   end = 10.0**b
+   intervalSize = 1.0*(b-a)/(n-1)
+   interval[0] = begin
+   interval[-1] = end
+
+   for i in range(1,n-1):
+      interval[i] = 10**(a+i*intervalSize)
+
+   return interval
+
+def log10(x):
+   return math.log(x)/math.log(10)
index f53098a..d69b2eb 100644 (file)
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from numpy.matlib import zeros
+from numpy.matlib import mat,zeros,ones,inf
 import pdb
-from Plif import *
+from Plif import Plf,base_coord,linspace
+import Configuration
 
 def  computeSpliceAlignWithQuality(dna, exons, quality):
    """
@@ -61,9 +62,13 @@ def  computeSpliceAlignWithQuality(dna, exons, quality):
    # counts the occurences of a,c,g,t,n in this order
    numChar = [0]*sizeMatchmatrix
 
-   trueQualityPlifs =  [Plf()] * 24
+   # counts the occurrnces of a,c,g,t,n with their quality scores
+   trueQualityPlifs =  [Plf(Configuration.numQualSuppPoints)] * Configuration.numQualPlifs
+   for currentPlif in trueQualityPlifs:
+      currentPlif.limits = mat(linspace(0,40,10)).reshape(10,1)
+      currentPlif.penalties = zeros((10,1))
 
-   for elem in est:
+   for pos, elem in enumerate(est):
       if elem == 'a':
          numChar[0] += 1
       if elem == 'c':
@@ -75,6 +80,20 @@ def  computeSpliceAlignWithQuality(dna, exons, quality):
       if elem == 'n':
          numChar[4] += 1
 
+      currentPlif = trueQualityPlifs[base_coord(elem,elem)]
+      currentQualityScore = quality[pos]
+
+      if currentQualityScore < currentPlif.limits[0]:
+            currentPlif.penalties[0] += 1
+
+      elif currentQualityScore >= currentPlif.limits[-1]:
+            currentPlif.penalties[-1] += 1
+
+      else:
+         for idx in range(1,currentPlif.len):
+            if currentPlif.limits[idx-1] <= currentQualityScore < currentPlif.limits[idx]:
+               currentPlif.penalties[idx] += 1
+
    totalNumChar = 0
    for idx in range(sizeMatchmatrix):
       totalNumChar += numChar[idx]
index 47f0dd0..4efc708 100644 (file)
@@ -36,6 +36,13 @@ import Configuration
 
 from Plif import Plf
 
+def getQualityFeatureCounts(qualityPlifs):
+   weightQuality = qualityPlifs[0].penalties
+   for currentPlif in qualityPlifs[1:]:
+      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
+
+   return weightQuality 
+
 
 def initializeQualityScoringFunctions(numPlifs,numSuppPoints):
 
@@ -90,43 +97,6 @@ class QPalma:
 
       self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
 
-      self.C=1.0
-
-      # 'normal' means work like Palma
-      # 'using_quality_scores' means work like Palma plus using sequencing
-      # quality scores
-      self.mode = 'normal'
-      #self.mode = 'using_quality_scores'
-
-      # Here we specify the total number of parameters.
-      # When using quality scores our scoring function is defined as
-      #
-      # f: S x R x S -> R
-      # 
-      # as opposed to a usage without quality scores when we only have
-      # 
-      # f: S x S -> R 
-      #
-      self.numDonSuppPoints     = 30
-      self.numAccSuppPoints     = 30
-      self.numLengthSuppPoints  = 30 
-      if self.mode == 'normal':
-         self.sizeMMatrix          = 36
-      elif self.mode == 'using_quality_scores':
-         self.sizeMMatrix          = 728
-      else:
-         assert False, 'Wrong operation mode specified'
-
-      # this number defines the number of support points for one tuple (a,b)
-      # where 'a' comes with a quality score
-      self.numQualSuppPoints    = 10
-      self.numQualSuppPoints    = 0
-
-      self.numFeatures = self.numDonSuppPoints + self.numAccSuppPoints\
-      + self.numLengthSuppPoints + self.sizeMMatrix 
-
-      self.plog('Initializing problem...\n')
-
 
    def plog(self,string):
       self.logfh.write(string)
@@ -168,25 +138,30 @@ class QPalma:
 
       # Initialize solver 
       if not __debug__:
-         solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
+         self.plog('Initializing problem...\n')
+         solver = SIQPSolver(Configuration.numFeatures,Configuration.numExamples,Configuration.C,self.logfh)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
       num_path = [anzpath]*N 
       # stores the gap for each example
       gap      = [0.0]*N
 
-      qualityMatrix = zeros((self.numQualSuppPoints,1))
-
-      numPlifs = 24
-      numSuppPoints = 30
-
-      qualityPlifs = initializeQualityScoringFunctions(numPlifs,numSuppPoints)
-
       #############################################################################################
       # Training
       #############################################################################################
       self.plog('Starting training...\n')
 
+      donSP       = Configuration.numDonSuppPoints
+      accSP       = Configuration.numAccSuppPoints
+      lengthSP    = Configuration.numLengthSuppPoints
+      mmatrixSP   = Configuration.sizeMMatrix
+      totalQualSP = Configuration.totalQualSuppPoints
+
+      currentPhi = zeros((Configuration.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      #qualityMatrix = zeros((Configuration.numPlifSuppPoints*Configuration.numQualPlifs,1))
+
       iteration_nr = 0
 
       while True:
@@ -208,19 +183,20 @@ class QPalma:
             acc_supp = Acceptors[exampleIdx] 
 
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+            trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
             trueSpliceAlign, trueWeightMatch, trueQualityPlifs = computeSpliceAlignWithQuality(dna, exons, quality)
             
             # Calculate the weights
             trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
 
-            currentPhi = zeros((self.numFeatures,1))
-            currentPhi[0:30]     = mat(d.penalties[:]).reshape(30,1)
-            currentPhi[30:60]    = mat(a.penalties[:]).reshape(30,1)
-            currentPhi[60:90]    = mat(h.penalties[:]).reshape(30,1)
-            currentPhi[90:126]   = mmatrix[:]
-            currentPhi[126:]   = qualityMatrix[:]
+            trueWeightQuality = getQualityFeatureCounts(trueQualityPlifs)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                       = totalQualityPenalties[:]
 
             # Calculate w'phi(x,y) the total score of the alignment
             trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
@@ -228,7 +204,7 @@ class QPalma:
             # The allWeights vector is supposed to store the weight parameter
             # of the true alignment as well as the weight parameters of the
             # num_path[exampleIdx] other alignments
-            allWeights = zeros((self.numFeatures,num_path[exampleIdx]+1))
+            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
             allWeights[:,0] = trueWeight[:,0]
 
             AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
@@ -262,8 +238,10 @@ class QPalma:
             acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
             currentAlignment = QPalmaDP.Alignment()
-            qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
-            currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
+            #qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
+            #currentAlignment.setQualityMatrix(qualityMat,self.numQualSuppPoints)
+
+            qualityPlifs = initializeQualityScoringFunctions(Configuration.numQualPlifs,Configuration.numQualSuppPoints)
 
             c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList(qualityPlifs)
 
@@ -281,46 +259,40 @@ class QPalma:
 
             emptyPlif = Plf(30)
             emptyPlif = emptyPlif.convert2SWIG()
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(24*num_path[exampleIdx]))
-
-            print "Trying to fetch results from c-code"
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([emptyPlif]*(Configuration.numQualPlifs*num_path[exampleIdx]))
 
             currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
             c_WeightMatch, c_AlignmentScores, c_qualityPlifs)
 
-            print "Fetched results from c-code"
-
-            #print '%d %d %d' % (dna_len*num_path[exampleIdx],est_len*num_path[exampleIdx],mm_len*num_path[exampleIdx])
-
             newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
             newEstAlign = zeros((est_len*num_path[exampleIdx],1))
             newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
-            newQualityPlifs = [None]*num_path[exampleIdx]*24
+            newQualityPlifs = [None]*num_path[exampleIdx]*Configuration.numQualPlifs
 
-            print 'newSpliceAlign'
+            #print 'newSpliceAlign'
             for i in range(dna_len*num_path[exampleIdx]):
                newSpliceAlign[i] = c_SpliceAlign[i]
             #   print '%f' % (spliceAlign[i])
 
-            print 'newEstAlign'
+            #print 'newEstAlign'
             for i in range(est_len*num_path[exampleIdx]):
                newEstAlign[i] = c_EstAlign[i]
             #   print '%f' % (spliceAlign[i])
 
-            print 'weightMatch'
+            #print 'weightMatch'
             for i in range(mm_len*num_path[exampleIdx]):
                newWeightMatch[i] = c_WeightMatch[i]
             #   print '%f' % (weightMatch[i])
 
-            print 'AlignmentScores'
+            #print 'AlignmentScores'
             for i in range(num_path[exampleIdx]):
                AlignmentScores[i+1] = c_AlignmentScores[i]
 
-            print 'newQualityPlifs'
-            for i in range(num_path[exampleIdx]*24):
+            #print 'newQualityPlifs'
+            for i in range(num_path[exampleIdx]*Configuration.numQualPlifs):
                newQualityPlifs[i] = QPalmaDP.penaltyArray_getitem(c_qualityPlifs, i)
 
-            print "Calling destructors"
+            #print "Calling destructors"
 
             del c_SpliceAlign
             del c_EstAlign
@@ -343,19 +315,21 @@ class QPalma:
             path_loss = [0]*(num_path[exampleIdx]+1)
 
             for pathNr in range(num_path[exampleIdx]):
-               #dna_numbers = dnaest{1,pathNr}
-               #est_numbers = dnaest{2,pathNr}
 
                weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
 
-               testPlif = newQualityPlifs[24*pathNr]
-               print "testPlif.penalties"
-               print type(testPlif)
-               for tidx in range(testPlif.len):
-                  #elem = testPlif.penalties[tidx]
-                  elem = QPalmaDP.doubleFArray_getitem(testPlif.penalties, tidx)
-                  print '%f ' % elem, 
-               print
+               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+               qidx = 0
+
+               for currentPlif in newQualityPlifs[Configuration.numQualPlifs*pathNr:Configuration.numQualPlifs*(pathNr+1)]:
+                  for tidx in range(currentPlif.len):
+                     #elem = currentPlif.penalties[tidx]
+                     elem = QPalmaDP.doubleFArray_getitem(currentPlif.penalties, tidx)
+                     #print '%f ' % elem, 
+                     print qidx
+                     decodedQualityFeatures[qidx] = elem
+                     qidx += 1
+                  #print
 
                # sum up positionwise loss between alignments
                for alignPosIdx in range(len(newSpliceAlign[pathNr,:])):
@@ -363,14 +337,14 @@ class QPalma:
                      path_loss[pathNr+1] += 1
 
                # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, qualityMatrix ])
+               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
                allWeights[:,pathNr+1] = wp
 
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
 
-               features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
+               features = numpy.vstack([hpen , dpen , apen , mmatrix[:], zeros((Configuration.totalQualSuppPoints,1))])
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
                # Check wether scalar product + loss equals viterbi score
index c98e270..69d0be4 100644 (file)
@@ -7,33 +7,6 @@ import QPalmaDP
 import pdb
 from Plif import *
 
-def linspace(a,b,n):
-   intervalLength = b-a
-   stepSize = 1.0*intervalLength / (n-1)
-   
-   interval = [0]*n
-   interval[0] = a
-   interval[-1] = b
-   for i in range(1,n-1):
-      interval[i] = a+(i*stepSize)
-
-   return interval
-
-def logspace(a,b,n):
-   interval = [0]*n
-   begin = 10.0**a
-   end = 10.0**b
-   intervalSize = 1.0*(b-a)/(n-1)
-   interval[0] = begin
-   interval[-1] = end
-
-   for i in range(1,n-1):
-      interval[i] = 10**(a+i*intervalSize)
-
-   return interval
-
-def log10(x):
-   return math.log(x)/math.log(10)