+ added a toy data random generator
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Jan 2008 13:03:04 +0000 (13:03 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Jan 2008 13:03:04 +0000 (13:03 +0000)
+ removed num exons == 3 restriction from computeSpliceAlignWithQuality
TODO
- check difference in the DP recurrences of the ../standalone vs. ../python
  versions, choose one to make predictions with.

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7504 e1793c9e-67f9-0310-80fc-b846ff1f7b36

QPalmaDP/qpalma_dp.cpp
QPalmaDP/result_align.cpp
python/computeSpliceAlignWithQuality.py
python/generateEvaluationData.py [new file with mode: 0644]
python/qpalma.py

index 931b46c..0466d6c 100644 (file)
@@ -31,8 +31,8 @@ Alignment::Alignment(int numQPlifs, int numq, bool use_qscores) {
       numPlifs = numQPlifs;
       use_quality_scores = use_qscores;
 
-      printf("number of support points: %d\n",numQualSuppPoints);
-      printf("number of plifs: %d\n",numPlifs );
+      //printf("number of support points: %d\n",numQualSuppPoints);
+      //printf("number of plifs: %d\n",numPlifs );
       FA( numQualSuppPoints >= 0 );
       FA( numPlifs >= 0 );
 }
index 7994419..d0c6944 100644 (file)
@@ -13,7 +13,7 @@ void increaseFeatureCount(penalty_struct* qparam, int dnaChar, int estChar, doub
 
    penalty_struct currentStruct = qparam[currentPos];
 
-   printf("Current index %d est/dna: %d %d score %f\n",currentPos,estChar,dnaChar,estprb);
+   //printf("Current index %d est/dna: %d %d score %f\n",currentPos,estChar,dnaChar,estprb);
 
    //printf("before\n");
    //int p_idx;
index 5108adb..fb88ca2 100644 (file)
@@ -21,7 +21,7 @@ def  computeSpliceAlignWithQuality(dna, exons):
    numberOfExons = (exons.shape)[0] # how many rows ?
    exonSizes = [-1]*numberOfExons
 
-   assert numberOfExons == 3
+   #assert numberOfExons == 3
 
    for idx in range(numberOfExons):
       exonSizes[idx] = exons[idx,1] - exons[idx,0]
@@ -41,8 +41,8 @@ def  computeSpliceAlignWithQuality(dna, exons):
          intronLength = exons[idx+1,0] - exons[idx,1]
          SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
 
-   if len(dna) > exons[2,1]:
-      SpliceAlign.extend([4]*(len(dna)-exons[2,1]))
+   if len(dna) > exons[-1,1]:
+      SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
 
    assert len(SpliceAlign) == len(dna), pdb.set_trace()
 
diff --git a/python/generateEvaluationData.py b/python/generateEvaluationData.py
new file mode 100644 (file)
index 0000000..6caf172
--- /dev/null
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from numpy.matlib import mat,zeros,ones,inf
+import random
+
+def sample(population, k):
+   """Chooses k random elements from a population sequence. """
+   n = len(population)
+   result = [None] * k
+   for i in xrange(k):
+      j = int(random.random() * n)
+      result[i] = population[j]
+   return result
+
+def generateData(numExamples):
+   dna_len = 216
+   est_len = 36
+   random.seed(14)
+
+   letters = ['a','c','g','t']
+
+   Sequences   = []
+   Acceptors   = []
+   Donors      = []
+   Exons       = []
+   Ests        = []
+
+   for i in range(numExamples):
+      dna = ''.join(sample(letters, dna_len))
+      Sequences.append(dna)
+
+      Acceptors.append([0.0]*dna_len)
+      Donors.append([0.0]*dna_len)
+
+      currentExon       = zeros((2,2))
+      currentExon[0,0]  = 0
+      currentExon[0,1]  = 72
+      currentExon[1,0]  = 144
+      currentExon[1,1]  = 216
+
+      Exons.append(currentExon)
+
+      est = ''.join(sample(letters, est_len))
+      Ests.append(est)
+
+   
+   preNr    = 15
+   middleNr = 6
+   sufNr    = 15
+   Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
+
+   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
+
+
+if __name__ == '__main__':
+   Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
+   print Acceptors
+   print Donors
+   print Exons
+   print Ests
+   print Qualities
index 22c623c..96040af 100644 (file)
@@ -22,6 +22,8 @@ from SIQP_CPX import SIQPSolver
 from paths_load_data_pickle import *
 from paths_load_data_solexa import *
 
+from generateEvaluationData import *
+
 from computeSpliceWeights import *
 from set_param_palma import *
 from computeSpliceAlignWithQuality import *
@@ -68,7 +70,9 @@ class QPalma:
          Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          use_quality_scores = False
       elif Configuration.mode == 'using_quality_scores':
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(100)
+         #Sequences_, Acceptors_, Donors_, Exons_, Ests_, Qualities_ = generateData(100)
 
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          #Qualities = []
@@ -78,6 +82,8 @@ class QPalma:
       else:
          assert(False)
 
+      pdb.set_trace()
+
       # number of training instances
       N = len(Sequences) 
       self.numExamples = N
@@ -227,10 +233,6 @@ class QPalma:
             # Create the alignment object representing the interface to the C/C++ code.
             currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, use_quality_scores)
 
-            #for pos,elem in enumerate(qualityPlifs):
-            #   elem.penalties = [1.0]*len(elem.penalties)
-            #   qualityPlifs[pos] = elem
-
             c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
 
             # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
@@ -323,14 +325,10 @@ class QPalma:
                hpen = mat(h.penalties).reshape(len(h.penalties),1)
                dpen = mat(d.penalties).reshape(len(d.penalties),1)
                apen = mat(a.penalties).reshape(len(a.penalties),1)
-
-               #totalQualityPenalties = ones((Configuration.totalQualSuppPoints,1))
                features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
 
                AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
 
-               #pdb.set_trace()
-
                # Check wether scalar product + loss equals viterbi score
                #assert math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) < 1e-6,\
                #'Scalar prod + loss is not equal Viterbi score. Respective values are %f, %f' % \