+ renamed main dir in order to create python module hierarchy
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 09:37:33 +0000 (09:37 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 09:37:33 +0000 (09:37 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7537 e1793c9e-67f9-0310-80fc-b846ff1f7b36

48 files changed:
python/Configuration.py [deleted file]
python/Plif.py [deleted file]
python/SIQP.py [deleted file]
python/SIQP_CPX.py [deleted file]
python/TrainingParam.py [deleted file]
python/computeSpliceAlignWithQuality.py [deleted file]
python/computeSpliceWeights.py [deleted file]
python/compute_donacc.py [deleted file]
python/export_param.py [deleted file]
python/generateEvaluationData.py [deleted file]
python/genome_info.pickle [deleted file]
python/numpyUtils.py [deleted file]
python/paths_load_data.py [deleted file]
python/paths_load_data_pickle.py [deleted file]
python/paths_load_data_solexa.py [deleted file]
python/penalty_lookup_new.py [deleted file]
python/qpalma.py [deleted file]
python/qpalma_predict.py [deleted file]
python/set_param_palma.py [deleted file]
python/tools/PyGff.py [deleted file]
python/tools/Solexa.py [deleted file]
python/tools/createTestSet.py [deleted file]
python/tools/parseGff.py [deleted file]
python/tools/parseSolexa.py [deleted file]
qpalma/Configuration.py [new file with mode: 0644]
qpalma/Plif.py [new file with mode: 0644]
qpalma/SIQP.py [new file with mode: 0644]
qpalma/SIQP_CPX.py [new file with mode: 0644]
qpalma/TrainingParam.py [new file with mode: 0644]
qpalma/computeSpliceAlignWithQuality.py [new file with mode: 0644]
qpalma/computeSpliceWeights.py [new file with mode: 0644]
qpalma/compute_donacc.py [new file with mode: 0644]
qpalma/export_param.py [new file with mode: 0644]
qpalma/generateEvaluationData.py [new file with mode: 0644]
qpalma/genome_info.pickle [new file with mode: 0644]
qpalma/numpyUtils.py [new file with mode: 0644]
qpalma/paths_load_data.py [new file with mode: 0644]
qpalma/paths_load_data_pickle.py [new file with mode: 0644]
qpalma/paths_load_data_solexa.py [new file with mode: 0644]
qpalma/penalty_lookup_new.py [new file with mode: 0644]
qpalma/qpalma.py [new file with mode: 0644]
qpalma/qpalma_predict.py [new file with mode: 0644]
qpalma/set_param_palma.py [new file with mode: 0644]
qpalma/tools/PyGff.py [new file with mode: 0644]
qpalma/tools/Solexa.py [new file with mode: 0644]
qpalma/tools/createTestSet.py [new file with mode: 0644]
qpalma/tools/parseGff.py [new file with mode: 0644]
qpalma/tools/parseSolexa.py [new file with mode: 0644]

diff --git a/python/Configuration.py b/python/Configuration.py
deleted file mode 100644 (file)
index 7d82c0c..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import numpy.matlib
-
-fixedParamQ = numpy.matlib.mat(
-[[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
-       [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
-       [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
-       [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
-       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-       [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
-       [ 0.4177651 ], [ 0.01360547], [ 0.29069319]
-       ])
-
-fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
-  [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
-  [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
-  [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
-  [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
-  [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
-  [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
-  [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
-  [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
-  [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
-  [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
-  [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
-  [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
-  [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
-  [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
-  [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
-  [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
-  [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
-  [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
-  [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
-  [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
-  [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
-  [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
-  [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
-  [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
-  [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
-
-
-###########################################################
-#
-# The parameters for the QPalma algorithm
-#
-#
-
-C = 100000.0
-
-# 'normal' means work like Palma 'using_quality_scores' means work like Palma
-# plus using sequencing quality scores
-
-#mode = 'normal'
-
-mode = 'using_quality_scores'
-
-# When using quality scores our scoring function is defined as
-#
-# f: S_e x R x S -> R
-#
-# where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
-# 
-# as opposed to a usage without quality scores when we only have
-# 
-# f: S x S -> R 
-#
-# The matrix of plifs is defined as follows:
-#
-#  elem |   -  a  c  g  t  n
-#  -------------------------  
-#  idx  |   0  1  2  3  4  5
-#   
-# 
-#              dna
-#
-#           -  a  c  g  t  n
-#        a  
-# est    c
-#        g
-#        t
-#        n
-#  
-# so the index can be calculated as (estnum-1)*6 + dnanum.
-#
-# At ests we do not have gaps with quality scores so we look up the matchmatrix
-# instead.
-
-numDonSuppPoints     = 30
-numAccSuppPoints     = 30
-numLengthSuppPoints  = 30 
-numQualSuppPoints    = 2
-
-min_qual = -1
-max_qual = 40
-
-USE_OPT = True
-
-if mode == 'normal':
-   sizeMatchmatrix   = (6,6)
-   estPlifs = 0
-   dnaPlifs = 0
-   numQualPlifs = estPlifs*dnaPlifs
-elif mode == 'using_quality_scores':
-   sizeMatchmatrix   = (6,1)
-   estPlifs = 5
-   dnaPlifs = 6
-   numQualPlifs = estPlifs*dnaPlifs
-else:
-   assert False, 'Wrong operation mode specified'
-
-totalQualSuppPoints = numQualPlifs*numQualSuppPoints
-
-numFeatures = numDonSuppPoints + numAccSuppPoints\
-+ numLengthSuppPoints + sizeMatchmatrix[0]*sizeMatchmatrix[1] + totalQualSuppPoints 
-
-iter_steps = 40
-remove_duplicate_scores = False
-print_matrix            = False
-anzpath                 = 2
-
-if mode == 'normal':
-   fixedParam = fixedParam[:numFeatures]
-elif mode == 'using_quality_scores':
-   fixedParam = fixedParamQ[:numFeatures]
-else:
-   assert False, 'Wrong operation mode specified'
-
-# Check for valid parameters
-assert numQualPlifs       >= 0
-assert numDonSuppPoints    > 1
-assert numAccSuppPoints    > 1
-assert numLengthSuppPoints > 1 
-assert numQualSuppPoints   > 1
diff --git a/python/Plif.py b/python/Plif.py
deleted file mode 100644 (file)
index a52594f..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import QPalmaDP
-
-class Plf: 
-   """
-   means piecewise linear function
-   """
-
-   def __init__(self,num=None):
-      if num == None:
-         self.len = 0
-         self.limits = []
-         self.penalties = []
-         self.transform = ''
-         self.name = ''
-         self.max_len = 0
-         self.min_len = 0
-      else:
-         self.len = num
-         self.limits = linspace(0,40,num) 
-         self.penalties = [0]*num
-         self.transform = ''
-         self.name = ''
-         self.max_len = 0
-         self.min_len = 0
-         
-
-   def convert2SWIG(self):
-      ps = QPalmaDP.penalty_struct()
-
-      ps.len = len(self.limits)
-   
-      ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
-      ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
-
-      ps.max_len = self.max_len
-      ps.min_len = self.min_len
-      ps.transform = 0
-      ps.name = self.name
-
-      return ps
-
-
-def base_coord(b1,b2):
-   b1 = b1.lower()
-   b2 = b2.lower()
-   assert b1 in ['a','c','g','t','n']
-   assert b2 in ['a','c','g','t','n']
-   cb = {'a':0, 'c':1, 'g':2, 't':3, 'n':4}
-
-   b1 = cb[b1]
-   b2 = cb[b2]
-
-   return b1*5+b2
-
-def linspace(a,b,n):
-   intervalLength = b-a
-   stepSize = 1.0*intervalLength / (n-1)
-   
-   interval = [0]*n
-   interval[0] = a
-   interval[-1] = b
-   for i in range(1,n-1):
-      interval[i] = a+(i*stepSize)
-
-   return interval
-
-def logspace(a,b,n):
-   interval = [0]*n
-   begin = 10.0**a
-   end = 10.0**b
-   intervalSize = 1.0*(b-a)/(n-1)
-   interval[0] = begin
-   interval[-1] = end
-
-   for i in range(1,n-1):
-      interval[i] = 10**(a+i*intervalSize)
-
-   return interval
-
-def log10(x):
-   return math.log(x)/math.log(10)
diff --git a/python/SIQP.py b/python/SIQP.py
deleted file mode 100644 (file)
index f5008d9..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-import sys
-import os
-import os.path
-import logging
-import math
-import pdb
-
-import cvxopt.base as cb
-
-import logging
-logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
-
-def asymCoords(t):
-   size_upstream = t[0]
-   size_downstream = t[1]
-   assert size_upstream >= 1 and size_upstream <= 39
-   assert size_downstream >= 1 and size_downstream <= 39
-   
-   if size_downstream > size_upstream:
-      asym_small = size_upstream
-      asym_big   = size_downstream
-   else:
-      asym_small = size_downstream
-      asym_big   = size_upstream
-
-   asym_offset = 0
-   for r in range(asym_small):
-      asym_offset += (40-r)-r;
-
-   return (asym_offset+asym_big)-1
-
-def calcTuples(i,j):
-   return filter(lambda t: t[0]<=t[1] and t[0]>0 and t[0]<40 and t[1]>0 and t[1]<40,[(i+1,j),(i-1,j),(i,j+1),(i,j-1)])
-
-class SIQP:
-   """
-   This class is a wrapper for the cvxopt qp solver.
-   It has the purpose to support an iterative addition of
-   constraints to the original problem.
-
-   We want to solve a problem of the form
-
-   min  1/2 * C * x' * P * x + q' * x
-    x
-
-    s.t. 
-      < w,f+ > - < w,f- > >= 1 - xi_i    for all i, for all f-
-
-   where x is a vector storing information on the parameters w_i and the slacks xi_i
-   so x = [w;xi] 
-
-   """
-
-   def __init__(self,fSize,numExamples,c):
-      assert fSize > 0, 'You have to have at least one feature!'
-      assert numExamples > 0, 'You have to have at least one example!'
-      self.numFeatures = fSize
-      self.numExamples = numExamples
-      self.C = c
-      self.EPSILON = 10e-15
-
-      logging.debug("Starting SIQP solver with %d examples. A feature space dim. of %d.", numExamples,fSize)
-      logging.debug("Regularization parameters are C=%f."%(self.C))
-
-      self.old_w = None
-
-      self.dimP = self.numFeatures + self.numExamples
-      self.createRegularizer()
-
-   def createUnitRegularizer(self):
-      # set regularizer to zero
-      self.P = cb.matrix(0.0,(self.dimP,self.dimP))
-      for i in range(self.numFeatures):
-         self.P[i,i] = 1.0
-      # end of zeroing regularizer
-
-   def createRegularizer(self):
-      self.createUnitRegularizer()
-
-      q_array = [0.0]*self.numFeatures + [1.0]*self.numExamples
-      self.q = cb.matrix(q_array,(self.numFeatures+self.numExamples,1),'d')
-      self.q = self.C * self.q
-
-      self.G = cb.matrix(0.0,(self.numExamples,self.numFeatures+self.numExamples))
-      for i in range(self.numExamples):
-         self.G[i,self.numFeatures+i] = -1
-
-      self.h = cb.matrix(0,(self.numExamples,1),'d')
-
-if __name__ == '__main__':
-   sq = SIQP(3,2,100.0)
-
-   """
-   function [C_qp_penalties,C_qp_smoothness,C_lp_smoothness,C_qp_smallness,column_eps,Q,f,LB,UB] = paths_create_qp(N,C)
-   % function [C_qp_penalties,C_qp_smoothness,C_lp_smoothness,C_qp_smallness,column_eps,Q,f,LB,UB] = paths_create_qp(N,C)
-
-   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-   % qp_solve: min ( <res, f'> + 1/2 res' Q res)
-   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-   C_qp_penalties  = C ;    %was 0.01
-   C_qp_smoothness = C ;    %was 0.01
-   C_lp_smoothness = 0.00 ; %linear problem parameters
-   C_qp_smallness  = 1e-6 ; %was 1e-6 
-   column_eps      = 1e-3 ;
-
-   Q=zeros(2*126+N) ; % 1/2 * res' * Q * res
-   Q(1:90,1:90)           = C_qp_smallness*diag(ones(1,90)) ;
-   Q(91:126,91:126)       = C_qp_penalties*diag(ones(1,36)) ;
-   Q(127:2*126,127:2*126) = C_qp_smoothness*diag(ones(1,126)) ;
-   f = [zeros(1,126) C_lp_smoothness*ones(1,126) ones(1,N)/N] ; % <res, f'>
-
-
-
-   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-   % qp_solve: LB <= res <= UB
-   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-   INF = 1e20 ;
-   LB = [-INF*ones(1,126) zeros(1,126) zeros(1,N)] ; % lower bound for res
-   UB = [ INF*ones(1,2*126) INF*ones(1,N)] ; % upper bound for res
-
-   """
diff --git a/python/SIQP_CPX.py b/python/SIQP_CPX.py
deleted file mode 100644 (file)
index b6635f6..0000000
+++ /dev/null
@@ -1,318 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-import sys
-import os
-import os.path
-import logging
-import math
-
-import cvxopt.base as cb
-
-import logging
-logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
-
-import CPX
-import numpy as ny
-import numpy.matlib as nyml
-from cplex_const import *
-
-from SIQP import SIQP
-
-OPTIMAL = [CPX_STAT_OPTIMAL, CPXMIP_OPTIMAL, CPXMIP_OPTIMAL_TOL]
-UNBOUNDED = [CPX_STAT_UNBOUNDED, CPXMIP_UNBOUNDED]
-INFEASIBLE = [CPX_STAT_INFEASIBLE, CPXMIP_INFEASIBLE, CPXMIP_INForUNBD]
-
-class FloatMatrix:
-   
-   TYPE = nyml.float64
-
-   def __init__(self,list):
-      self.data = nyml.mat(list, dtype=self.TYPE).T
-
-class IntMatrix:
-   
-   TYPE = nyml.int32
-
-   def __init__(self,list):
-      self.data = nyml.mat(list, dtype=self.TYPE).T
-
-class CharMatrix:
-   
-   TYPE = '|S1'
-
-   def __init__(self,list):
-      self.data = nyml.mat(list, dtype=self.TYPE).T
-
-class SIQPSolver(SIQP):
-   """
-   This algorithm solves the problem 
-
-   min 1/2 x'Px + q'x
-   s.t. 
-         Gx >= h
-   """
-
-   # Define types and constants
-   Inf   = CPX_INFBOUND
-
-   def __init__(self,fSize,numExamples,c,proto):
-      SIQP.__init__(self,fSize,numExamples,c)
-      self.numFeatures = fSize
-      self.numExamples = numExamples
-      self.numVariables = self.numFeatures + self.numExamples
-      self.old_w = None
-      self.old_objective_value = -(sys.maxint-1)
-
-      self.addedConstraints = 0
-
-      self.protocol = proto
-
-      self.objsen = CPX_MIN
-
-      # open CPLEX environment, set some parameters
-      self.env = CPX.openCPLEX()
-
-      CPX.setintparam(self.env, CPX_PARAM_SCRIND, CPX_ON) # print >> self.protocol, info to screen
-      CPX.setintparam(self.env, CPX_PARAM_DATACHECK, CPX_ON)
-
-      # create CPLEX problem, add objective and constraints to it
-      self.lp = CPX.createprob(self.env, 'test1')
-
-      self.obj = FloatMatrix([0.0]*self.numFeatures + [1.0]*self.numExamples).data
-      cFactor = 1.0*self.C/self.numExamples
-      self.obj =  cFactor * self.obj
-      assert self.obj.shape == (self.numVariables,1)
-      
-      self.ub = FloatMatrix([self.Inf] * self.numVariables).data
-      self.lb = FloatMatrix([-self.Inf] * self.numFeatures + [0.0]*self.numExamples).data
-
-      self.matbeg = IntMatrix([1]*self.numVariables).data
-      self.matbeg[0] = 0
-
-      self.matcnt = IntMatrix([0]*self.numVariables).data
-      self.matcnt[0] = 1
-
-      self.matind = IntMatrix([0]).data
-      self.matval = FloatMatrix([0.0]).data
-
-      self.rhs = FloatMatrix([0.0]*self.numVariables).data
-      self.sense = CharMatrix(['G']*self.numVariables).data
-
-      self.numConstraints = 1
-
-      CPX.copylp(self.env, self.lp, self.numVariables, self.numConstraints, self.objsen, 
-               self.obj, self.rhs, self.sense,
-               self.matbeg, self.matcnt, self.matind, self.matval, 
-               self.lb, self.ub) 
-
-      self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval = self.cpx_matrix(self.P)
-      CPX.copyquad(self.env, self.lp, self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval)
-
-   def addConstraint(self, energy_deltas, example_idx):
-   #def addConstraint(self, energy_deltas, loss, example_idx, useMarginRescaling):
-      """
-      This method adds one contraint to the optimization problem.
-      """
-      self.lastDelta = energy_deltas
-      self.lastIndex = example_idx
-
-      assert -1 < example_idx and example_idx < self.numExamples
-
-      if self.old_w != None:
-         print >> self.protocol, 'Scalar prod. contrib is %f'%((self.old_w[0:self.numFeatures].T*energy_deltas)[0])
-
-      numNewRows = 1
-      numNewCols = 0
-   
-      row_rhs = FloatMatrix([1.0]).data
-      row_sense = CharMatrix(['G']).data
-
-      row_matbeg = IntMatrix([0]).data
-
-      row_entries = [0.0]*self.numVariables
-      indices = [0]*self.numVariables
-
-      currentVal = 0.0
-      numNonZeroElems = 0
-      for idx in range(self.numFeatures):
-         currentVal = energy_deltas[idx]
-         if math.fabs(currentVal) > self.EPSILON:
-            row_entries[numNonZeroElems] = currentVal
-            indices[numNonZeroElems] = idx
-            numNonZeroElems += 1
-
-      row_matind = IntMatrix([0]*(numNonZeroElems+1)).data
-      row_matval = FloatMatrix([0.0]*(numNonZeroElems+1)).data
-
-      for pos in range(numNonZeroElems):
-         row_matind[pos] = indices[pos]
-         row_matval[pos] = row_entries[pos]
-
-      # coefficient 1.0 of xi
-      row_matind[numNonZeroElems] = self.numFeatures+example_idx
-      row_matval[numNonZeroElems] = 1.0
-
-      status = CPX.addrows (self.env, self.lp, numNewCols, numNewRows, numNonZeroElems+1, row_rhs,
-                                  row_sense, row_matbeg, row_matind, row_matval)
-      
-      assert status == 0, 'Error ocurred while adding constraint.'
-
-      #CPX.writeprob(self.env,self.lp,"problem_%d.SAV"%self.addedConstraints)
-      #CPX.writeprob(self.env,self.lp,"problem_%d.LP"%self.addedConstraints)
-      self.addedConstraints += 1
-
-      return True
-               
-   def solve(self):
-      # solve the problem
-      CPX.qpopt(self.env,self.lp)
-
-      # get the solution
-      lpstat = CPX.getstat(self.env,self.lp)
-      if lpstat in OPTIMAL or lpstat == 6:
-         x, objval = CPX.getx(self.env,self.lp), CPX.getobjval(self.env,self.lp)
-         print >> self.protocol, 'SIQP_CPX: Objective is: %f'%objval
-
-         # Check that objective value is monotonically increasing
-         if math.fabs(objval - self.old_objective_value) >= 10e-5:
-            assert objval >= self.old_objective_value
-
-         self.old_objective_value = objval
-
-         new_w = cb.matrix([0.0]*self.numVariables,(self.numVariables,1))
-         for i in range(self.numVariables):
-            new_w[i,0] = x[i]
-
-         self.old_w = new_w
-
-         # Check for non-negative slacks    
-         sumOfSlacks = 0.0
-         print >> self.protocol, 'Slacks are:'
-         for i in range(self.numExamples):
-            print >> self.protocol, '%f ' % new_w[self.numFeatures+i,0]
-            assert new_w[self.numFeatures+i,0] >= 0.0
-            sumOfSlacks += new_w[self.numFeatures+i,0]
-
-         scalarProdDiff = (new_w[0:self.numFeatures,0].T * self.lastDelta)[0]
-         # print >> self.protocol, 'Constraint: %f >= %d - %f'%(scalarProdDiff,self.lastLoss,new_w[self.numFeatures+self.lastIndex])
-
-         # evalute parts of the objective
-         regularizationPart = (0.5 *new_w.T*self.P*new_w)[0]
-         lossPart = 1.0*self.C/self.numExamples * sumOfSlacks
-         assert lossPart >= 0.0, 'Error we have a negative loss contribution'
-         print >> self.protocol, 'Parts of objective: %e %e'%(regularizationPart,lossPart)
-
-         return objval,new_w[0:self.numFeatures,0],new_w[self.numFeatures:,0]
-
-      elif lpstat in UNBOUNDED: 
-         print >> self.protocol, "Solution unbounded"
-         assert False
-      elif lpstat in INFEASIBLE: 
-         print >> self.protocol, "Solution infeasible"
-         assert False
-      else: 
-         print >> self.protocol, "Solution code: ", lpstat
-         assert False
-
-   def cpx_matrix(self,M):
-      """
-      This method creates a sparse CPLEX representation of a given
-      cvxopt Matrix M.
-      """
-
-      nRows,nCols = M.size
-      numEntries = nRows*nCols
-
-      matbeg = IntMatrix([0]*nCols).data
-      matcnt = IntMatrix([0]*nCols).data
-
-      nonZeros = 0
-      elemCounter = 0
-      entry = 0.0
-
-      valVec = [0.0]*numEntries
-      indVec = [0.0]*numEntries
-
-      for currentCol in range(nCols):
-         nonZeros = 0
-         matbeg[currentCol] = elemCounter
-
-         for currentRow in range(nRows):
-            entry = M[currentRow,currentCol]
-            if math.fabs(entry) > self.EPSILON:
-               nonZeros += 1
-               indVec[elemCounter] = currentRow
-               valVec[elemCounter] = entry
-               elemCounter += 1
-
-         matcnt[currentCol] = nonZeros
-
-      totalNonZeros = elemCounter
-
-      matind = IntMatrix([0]*totalNonZeros).data
-      matval = FloatMatrix([0.0]*totalNonZeros).data
-
-      for idx in range(totalNonZeros):
-         matind[idx] = indVec[idx];
-         matval[idx] = valVec[idx];
-    
-      print >> self.protocol, 'nRows,nCols: %d,%d'%(nRows,nCols)
-      print >> self.protocol, 'totalNonZeros ' + str(totalNonZeros)
-
-      return matbeg,matcnt,matind,matval
-
-   def cpx_matrix_id(self):
-      """
-      This method creates a sparse CPLEX representation of a given
-      cvxopt Matrix M.
-      """
-
-      nRows,nCols = self.numVariables,self.numVariables
-      matbeg = IntMatrix([0]*nCols).data
-      matcnt = IntMatrix([0]*nCols).data
-
-      matind = IntMatrix([0]*nCols).data
-      matval = FloatMatrix([0.0]*nCols).data
-
-      for i in range(nCols):
-         matbeg[i] = i
-         matcnt[i] = 1
-         matind[i] = i
-         matval[i] = 1.0
-
-      return matbeg,matcnt,matind,matval
-
-   def delete(self):
-      # free the problem
-      CPX.freeprob(self.env,self.lp)
-      # close CPLEX environment
-      CPX.closeCPLEX(self.env)
-
-def test():
-   fh = open('siqp_cpx.log','w+')
-
-   numFeatures = 3
-   numExamples = 2
-   s = SIQPSolver(numFeatures,numExamples,10.0,fh)
-
-   d1 = cb.matrix([1,0,2],(numFeatures,1))
-   d2 = cb.matrix([1,1,0],(numFeatures,1))
-
-   s.addConstraint(d1,200.0,0,True)
-   s.addConstraint(d2,105.0,1,True)
-
-   w,slacks = s.solve()
-   print w
-   print slacks
-
-   print w.T * d1 + slacks[0]
-   print w.T * d2 + slacks[1]
-
-
-   s.delete()
-   del s
-   fh.close()
-
-if __name__ == '__main__':
-   test()
diff --git a/python/TrainingParam.py b/python/TrainingParam.py
deleted file mode 100644 (file)
index 2d5b28b..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-class Param:
-   
-   def __init__(self):
-      """ default parameters """
-
-      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma2'
-      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
-      self.MAX_MEM = 31000;
-      self.LOCAL_ALIGN = 1;
-      self.init_param = 1;
-      self.train_with_splicesitescoreinformation = 1;
-      self.train_with_intronlengthinformation = 1;
-      self.C = 0.001;
-      self.microexon = 0;
-      self.prob = 0;
-      self.organism = 'elegans'
-      self.expt = 'training'
-         
-      self.insertion_prob  = self.prob/3 ;
-      self.deletion_prob   = self.prob/3 ;
-      self.mutation_prob   = self.prob/3 ;
diff --git a/python/computeSpliceAlignWithQuality.py b/python/computeSpliceAlignWithQuality.py
deleted file mode 100644 (file)
index bb72d92..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-import pdb
-from Plif import Plf,base_coord,linspace
-import Configuration
-
-def  computeSpliceAlignWithQuality(dna, exons):
-   """
-   Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
-   Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
-   (ausser dem letzten und ersten)
-    und Ende Splicestelle ist gleich Anfang Exon vom naechsten, da Splicestelleende notiert 1 rechts neben Splciestellenende
-   bsp: 
-   cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
-   cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
-   """
-
-   numberOfExons = (exons.shape)[0] # how many rows ?
-   exonSizes = [-1]*numberOfExons
-
-   #assert numberOfExons == 3
-
-   for idx in range(numberOfExons):
-      exonSizes[idx] = exons[idx,1] - exons[idx,0]
-
-   # SpliceAlign vector describes alignment: 
-   # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
-   SpliceAlign = []
-
-   if exons[0,0] > 0:
-      SpliceAlign.extend([4]*(exons[0,0]))
-
-   for idx in range(numberOfExons):
-      exonLength = exonSizes[idx]
-      SpliceAlign.extend([0]*exonLength)
-      
-      if idx < numberOfExons-1:
-         intronLength = exons[idx+1,0] - exons[idx,1]
-         SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
-
-   if len(dna) > exons[-1,1]:
-      SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
-
-   assert len(SpliceAlign) == len(dna), pdb.set_trace()
-
-   # number of matches: just have to look at the underlying est
-   # est = dna(find(SpliceAlign==0)) # exon nucleotides
-   exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
-   est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
-
-   #length_est = sum(exon_sizes) ;
-   totalESTLength = 0
-   for elem in exonSizes:
-      totalESTLength += elem
-
-   assert totalESTLength == len(est) 
-
-   sizeMatchmatrix = Configuration.sizeMatchmatrix
-   # counts the occurences of a,c,g,t,n in this order
-   numChar = [0]*sizeMatchmatrix[0]
-
-   # counts the occurrences of a,c,g,t,n with their quality scores
-   #trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
-   trueWeightQuality = numpy.zeros(Configuration.numQualPlifs*Configuration.numQualSuppPoints)
-   trueWeightQuality = trueWeightQuality.reshape(Configuration.estPlifs,Configuration.dnaPlifs,Configuration.numQualSuppPoints)
-
-   for elem in est:
-      if elem == 'a':
-         numChar[0] += 1
-      if elem == 'c':
-         numChar[1] += 1
-      if elem == 'g':
-         numChar[2] += 1
-      if elem == 't':
-         numChar[3] += 1
-      if elem == 'n':
-         numChar[4] += 1
-
-   if Configuration.mode == 'using_quality_scores':
-      for pos,elem in enumerate(numChar):
-         if pos < Configuration.estPlifs:
-            trueWeightQuality[pos][pos+1][-1] = elem
-
-   trueWeightQuality = mat(trueWeightQuality.flatten())
-   trueWeightQuality = trueWeightQuality.T
-
-   totalNumChar = 0
-   for idx in range(sizeMatchmatrix[0]):
-      totalNumChar += numChar[idx]
-
-   assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
-
-   # writing in weight match matrix
-   # matrix is saved columnwise
-   if Configuration.mode == 'normal':
-      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
-      for idx in range(1,sizeMatchmatrix[0]):
-         trueWeightMatch[idx,idx] = numChar[idx-1]
-
-   if Configuration.mode == 'using_quality_scores':
-      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
-      #for idx in range(1,sizeMatchmatrix[0]):
-      #   trueWeightMatch[idx] = numChar[idx-1]
-
-   trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)
-
-   return SpliceAlign, trueWeightMatch, trueWeightQuality
diff --git a/python/computeSpliceWeights.py b/python/computeSpliceWeights.py
deleted file mode 100644 (file)
index 7fc8667..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy import inf
-from numpy.matlib import zeros
-import pdb
-
-# 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30
-# piecewise linear function:
-# take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
-# these are our values x
-# 
-#        | y_1                                          if x <= x_1
-#        |
-#        |        x_i+1 - x              x - x_i
-# f(x) = | y_i * -----------  +  y_i+1 * -----------    if x_i <= x <= x_i+1
-#        |       x_i+1 - x_i             x_i+1 - x_i
-#        |
-#        | y_30                                         if x_n <= x 
-# 
-#  y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
-
-def calculateWeights(plf, scores):
-   currentWeight = zeros((30,1))
-   
-   for k in range(len(scores)):
-      value = scores[k]
-      Lower = len([elem for elem in plf.limits if elem <= value])
-
-      if Lower == 0:
-         currentWeight[0] += 1
-      elif Lower == len(plf.limits):
-         currentWeight[-1] += 1
-      else:
-         # because we count from 0 in python
-         Lower -= 1
-         Upper = Lower+1 ; # x-werte bleiben fest
-         #print value,Lower,Upper
-         weightup  = 1.0*(value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower])
-         weightlow = 1.0*(plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower])
-         currentWeight[Upper] = currentWeight[Upper] + weightup
-         currentWeight[Lower] = currentWeight[Lower] + weightlow
-
-         #print plf.limits[Lower],plf.limits[Upper]
-         #print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
-
-   return currentWeight
-
-def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
-   ####################################################################################
-   # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
-   ####################################################################################
-
-   # Picke die Positionen raus, an denen eine Donorstelle ist
-   DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
-   assert not ( -inf in DonorScores )
-
-   #print 'donor'
-   weightDon = calculateWeights(d,DonorScores)
-
-   ####################################################################################
-   # 2. Acceptor: In acc_supp stehen Werte der SVM., in SpliceAlign die Einsen
-   ####################################################################################
-
-   #Den Vektor Acceptorstellen durchgehen und die Gewichtsvektoren belasten:
-   AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
-   assert not ( -inf in AcceptorScores )
-
-   #print 'acceptor'
-   weightAcc = calculateWeights(a,AcceptorScores)
-
-   ####################################################################################
-   # 3. Intron length: SpliceAlign: Gaps zaehlen und auf Gapgewichte addieren
-   ####################################################################################
-
-   intron_starts = []
-   intron_ends   = []
-   for pos,elem in enumerate(SpliceAlign):
-      if elem == 1:
-         intron_starts.append(pos)
-
-      if elem == 2:
-         intron_ends.append(pos)
-
-   assert len(intron_starts) == len(intron_ends)
-
-   for i in range(len(intron_starts)):
-      assert intron_starts[i] < intron_ends[i]
-
-   values = [0.0]*len(intron_starts)
-   for pos in range(len(intron_starts)):
-      values[pos] = intron_ends[pos] - intron_starts[pos] + 1
-
-   weightIntron = calculateWeights(h,values)
-
-   return weightDon, weightAcc, weightIntron
diff --git a/python/compute_donacc.py b/python/compute_donacc.py
deleted file mode 100644 (file)
index bad0872..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-from numpy.matlib import zeros,isinf
-from penalty_lookup_new import *
-
-def compute_donacc(donor_supp, acceptor_supp, d, a):
-
-   assert(len(donor_supp)==len(acceptor_supp))
-   size = len(donor_supp)
-  
-   donor    = [0.0]*size
-   acceptor = [0.0]*size
-
-   for idx in range(size):
-     if isinf(donor_supp[idx]):
-       donor[idx] = donor_supp[idx]
-     else:
-       donor[idx] = penalty_lookup_new(d, donor_supp[idx])
-
-     if isinf(acceptor_supp[idx]):
-       acceptor[idx] = acceptor_supp[idx]
-     else:
-       acceptor[idx] = penalty_lookup_new(a,acceptor_supp[idx])
-
-   return donor,acceptor
diff --git a/python/export_param.py b/python/export_param.py
deleted file mode 100644 (file)
index 57d2c94..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-###########################################################
-
-import bz2
-
-def writeStruct(fid,plif):
-   fid.write('%s_limits=%s\n'%(plif.name,str(plif.limits)))
-   fid.write('%s_penalties=%s\n'%(plif.name,str(plif.penalties)))
-   fid.write('%s_bins=%d\n'%(plif.name,len(plif.limits)))
-
-   if plif.name == 'intron':
-      fid.write('%s_len_limits=%s\n'%(plif.name,str(plif.limits)))
-      fid.write('%s_len_penalties=%s\n'%(plif.name,str(plif.penalties)))
-      fid.write('%s_len_bins=%d\n'%(plif.name,len(plif.limits)))
-      fid.write('%s_len_min=%d\n'%(plif.name,plif.min_len))
-      fid.write('%s_len_max=%d\n'%(plif.name,plif.max_len))
-      fid.write('%s_len_transform=%s\n'%(plif.name,plif.transform))
-
-def export_param(filename,h,d,a,mmatrix,qualityPlifs):
-
-   # Exports a bz2 file with the trained PALMA. Assumes splice sites and intron length used.
-   h.name = 'intron'
-   d.name = 'donor'
-   a.name = 'acceptor'
-
-   fid = bz2.BZ2File(filename+'.bz2','w')
-
-   fid.write('%palma definition file version: 1.0\n\n')
-   fid.write('%penalties\n');
-  
-   writeStruct(fid, h);
-   writeStruct(fid, a);
-   writeStruct(fid, d);
-
-   # substitution matrix
-   mmatrix = mmatrix.reshape(6,6)
-   fid.write('substitution_matrix=[')
-   for row in range(6):
-      if row == 5:
-         fid.write('%f, %f, %f, %f, %f, %f]\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
-      else:
-         fid.write('%f, %f, %f, %f, %f, %f;\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
-
-   
-   for elem in qualityPlifs:
-      writeStruct(fid, elem);
-
-   fid.close()
diff --git a/python/generateEvaluationData.py b/python/generateEvaluationData.py
deleted file mode 100644 (file)
index 4b5b7a0..0000000
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import random
-import pdb
-import os.path
-import cPickle
-
-class Dataset:
-  pass
-
-def sample(population, k):
-   """Chooses k random elements from a population sequence. """
-   n = len(population)
-   result = [None] * k
-   for i in xrange(k):
-      j = int(random.random() * n)
-      result[i] = population[j]
-   return result
-
-def generateData(numExamples):
-   dna_len = 216
-   est_len = 36
-   random.seed(14)
-
-   letters = ['a','c','g','t']
-
-   Sequences   = []
-   Acceptors   = []
-   Donors      = []
-   Exons       = []
-   Ests        = []
-
-   for i in range(numExamples):
-      dna = ''.join(sample(letters, dna_len))
-      Sequences.append(dna)
-
-      Acceptors.append([0.0]*dna_len)
-      Donors.append([0.0]*dna_len)
-
-      currentExon       = zeros((2,2))
-      currentExon[0,0]  = 0
-      currentExon[0,1]  = 72
-      currentExon[1,0]  = 144
-      currentExon[1,1]  = 216
-
-      Exons.append(currentExon)
-
-      est = ''.join(sample(letters, est_len))
-      Ests.append(est)
-
-   preNr    = 15
-   middleNr = 6
-   sufNr    = 15
-   Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def generateData2(numExamples):
-   est_len = 36
-   random.seed(14)
-
-   letters = ['a','c','g','t']
-
-   Sequences   = []
-   Acceptors   = []
-   Donors      = []
-   Exons       = []
-   Ests        = []
-
-   for i in range(numExamples):
-      dna_len = random.randint(200,400)
-      dna = ''.join(sample(letters, dna_len))
-      #Sequences.append(dna)
-      begin = random.randint(70,dna_len-70)
-      end   = begin+60
-      dna = dna[0:begin] + 't'*18 + 'gt' + 'a'*20 + 'ag' + 't'*18 + dna[end:]
-      dna_len = len(dna)
-      Sequences.append(dna)
-      
-      currentDon  = [-3.0]*dna_len
-      currentAcc  = [-3.0]*dna_len
-
-      currentDon[begin+18+1] = 3.0
-      currentDon[begin+18+2] = 3.0
-
-      currentAcc[end-18-1] = 3.0
-      currentAcc[end-18-2] = 3.0
-
-      #pdb.set_trace()
-
-      Donors.append(currentDon)
-      Acceptors.append(currentAcc)
-
-      currentExon       = zeros((2,2))
-      currentExon[0,0]  = 0
-      currentExon[0,1]  = begin+18
-      currentExon[1,0]  = end-18
-      currentExon[1,1]  = dna_len-1
-
-      Exons.append(currentExon)
-
-      #est = ''.join(sample(letters, est_len))
-      est = dna[begin-18:begin] + dna[end+1:end+19]
-      est = 't'*est_len
-      Ests.append(est)
-
-   preNr    = 15
-   middleNr = 6
-   sufNr    = 15
-   #Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
-   Qualities  = [[40]*est_len]*numExamples
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def loadArtificialData(numExamples):
-   filename = 'artificial_dset_%d'%numExamples
-   if not os.path.exists(filename):
-      s,a,d,e,est,q = generateData2(numExamples)
-      dset = Dataset()
-      dset.Sequences = s
-      dset.Acceptor  = a
-      dset.Donors    = d
-      dset.Exons     = e
-      dset.Ests      = est
-      dset.Qualities = q
-      cPickle.dump(dset,open(filename,'w+'))
-   else:
-      dset = cPickle.load(open(filename))
-      s = dset.Sequences
-      a = dset.Acceptor
-      d = dset.Donors
-      e =  dset.Exons
-      est = dset.Ests
-      q = dset.Qualities
-
-   return s,a,d,e,est,q
-
-
-
-if __name__ == '__main__':
-   Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
-   print Acceptors
-   print Donors
-   print Exons
-   print Ests
-   print Qualities
diff --git a/python/genome_info.pickle b/python/genome_info.pickle
deleted file mode 100644 (file)
index 5eb5916..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-ccopy_reg
-_reconstructor
-p1
-(cscipy.io.mio5
-mat_struct
-p2
-c__builtin__
-object
-p3
-NtRp4
-(dp5
-S'cdna_fnames'
-p6
-V/fml/ag-raetsch/share/databases/elegans_WS150/confirmed_genes.WS150
-p7
-sS'contig_names'
-p8
-cnumpy.core.multiarray
-_reconstruct
-p9
-(cnumpy
-ndarray
-p10
-(I0
-tS'b'
-tRp11
-(I1
-(I6
-tcnumpy
-dtype
-p12
-(S'O8'
-I0
-I1
-tRp13
-(I3
-S'|'
-NNNI-1
-I-1
-I63
-tbI00
-(lp14
-VCHROMOSOME_I
-p15
-aVCHROMOSOME_II
-p16
-aVCHROMOSOME_III
-p17
-aVCHROMOSOME_IV
-p18
-aVCHROMOSOME_V
-p19
-aVCHROMOSOME_X
-p20
-atbsS'alphabet'
-p21
-Vacgt
-p22
-sS'basedir'
-p23
-V/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/
-p24
-sS'flcdna_fnames'
-p25
-g9
-(g10
-(I0
-tS'b'
-tRp26
-(I1
-(I0
-tg12
-(S'f8'
-I0
-I1
-tRp27
-(I3
-S'<'
-NNNI-1
-I-1
-I0
-tbI00
-S''
-tbsS'flat_fnames'
-p28
-g9
-(g10
-(I0
-tS'b'
-tRp29
-(I1
-(I6
-tg13
-I00
-(lp30
-V/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/I.dna.flat
-p31
-aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/II.dna.flat
-p32
-aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/III.dna.flat
-p33
-aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/IV.dna.flat
-p34
-aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/V.dna.flat
-p35
-aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/X.dna.flat
-p36
-atbsS'est_fnames'
-p37
-g9
-(g10
-(I0
-tS'b'
-tRp38
-(I1
-(I3
-tg13
-I00
-(lp39
-V/fml/ag-raetsch/share/databases/dbEST.111005/dbEST_elegans.fa
-p40
-aV/fml/ag-raetsch/share/databases/elegans_WS150/EST_Elegans.dna
-p41
-aV/fml/ag-raetsch/share/databases/elegans_WS150/confirmed_genes.WS150
-p42
-atbsS'_fieldnames'
-p43
-(lp44
-g37
-ag6
-ag25
-ag8
-ag28
-aS'fasta_fnames'
-p45
-aS'annotation_fnames'
-p46
-ag21
-ag23
-aS'max_intron_len'
-p47
-aS'min_intron_len'
-p48
-aS'merge_est_transcripts'
-p49
-asg47
-I20000
-sg49
-I1
-sg46
-V/fml/ag-raetsch/share/databases/elegans_WS150/elegansWS150_CHROMOSOME.gff
-p50
-sg45
-g9
-(g10
-(I0
-tS'b'
-tRp51
-(I1
-(I6
-tg13
-I00
-(lp52
-V/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_I.dna
-p53
-aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_II.dna
-p54
-aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_III.dna
-p55
-aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_IV.dna
-p56
-aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_V.dna
-p57
-aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_X.dna
-p58
-atbsg48
-I20
-sb.
\ No newline at end of file
diff --git a/python/numpyUtils.py b/python/numpyUtils.py
deleted file mode 100644 (file)
index 14ece77..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import numpy as ny
-import numpy.matlib as nyml
-
-
-def 
diff --git a/python/paths_load_data.py b/python/paths_load_data.py
deleted file mode 100644 (file)
index fef599c..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import io_pickle
-import scipy.io
-import pdb
-
-def paths_load_data(expt,genome_info,PAR):
-   """
-      
-   """
-
-   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
-   # Load the relevant file and return the alignment data
-
-   # expt can be 'training','validation' or 'test'
-
-   assert expt in ['training','validation','test']
-
-   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
-
-   Noises = [];
-
-   if expt == 'training':
-      if PAR.microexon:
-         if PAR.LOCAL_ALIGN: # local version
-
-            train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
-            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
-            #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
-            #io_pickle.convert_v6(train_data,train_data_pickle)
-            #train_data = io_pickle.load(train_data_pickle)
-            data = scipy.io.loadmat(train_data)
-
-         else: # global version
-
-            train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
-               (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
-
-            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
-            #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
-            #   (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
-
-            #io_pickle.convert_v6(train_data,train_data_pickle)
-            #train_data = io_pickle.load(train_data_pickle)
-            data = scipy.io.loadmat(train_data)
-            Noises = data['TrainNoise'] # substitution matrix
-
-      else:
-         train_data = '%s/exons_train_local.mat' % genome_info.basedir
-         #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
-         #io_pickle.convert_v6(train_data,train_data_pickle)
-         #microexon_train_data = io_pickle.load(train_data_pickle)
-         data = scipy.io.loadmat(train_data)
-
-      print 'train_data is %s' % train_data
-
-      Sequences = data['Train']       # dna sequences
-      Acceptors = data['TrainAcc']    # acceptor scores
-      Donors    = data['TrainDon']    # donor scores
-      Exons     = data['TrainExon']   # exon boundaries
-      Ests      = data['TrainEsts']   # est sequences
-     
-   #elif expt == 'validation':
-   #  print('Loading validation data\n') ;
-   #  if PAR.microexon,
-   #    if PAR.LOCAL_ALIGN
-   #      %local version
-   #      load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #    else
-   #      %global version
-   #      load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #    end
-   #  else
-   #    load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #    'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
-   #  end
-   #  
-   #  Sequences = Val ;      % dna sequences
-   #  Acceptors = ValAcc ;   % acceptor scores
-   #  Donors    = ValDon ;   % donor scores
-   #  Exons     = ValExon ;  % exon boundaries
-   #  Ests      = ValEsts ;  % est sequences
-
-
-
-   #elif expt == 'test':
-   #  fprintf('Loading test data\n') ;
-   #  if PAR.microexon,
-   #    if PAR.LOCAL_ALIGN
-   #      %local version
-   #      load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
-   #      'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
-   #    else
-   #      %global version
-   #      load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
-   #      'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
-   #      Noises    = TestNoise ; % substitution matrix
-   #      end
-   #      else
-   #    load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
-   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
-   #    'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
-   #  end
-   #  
-   #  Sequences = Test ;      % dna sequences
-   #  Acceptors = TestAcc ;   % acceptor scores
-   #  Donors    = TestDon ;   % donor scores
-   #  Exons     = TestExon ;  % exon boundaries
-   #  Ests      = TestEsts ;  % est sequences
-
-   # Lower all indices by one to convert matlab 
-   # to python indices
-
-   Exons -= 1
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/python/paths_load_data_pickle.py b/python/paths_load_data_pickle.py
deleted file mode 100644 (file)
index 112ec5f..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-
-def paths_load_data_pickle(expt,genome_info,PAR):
-   """
-      
-   """
-
-   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
-   # Load the relevant file and return the alignment data
-
-   # expt can be 'training','validation' or 'test'
-
-   assert expt in ['training','validation','test']
-
-   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
-
-   Noises = [];
-
-   if expt == 'training':
-      if PAR.microexon:
-         if PAR.LOCAL_ALIGN: # local version
-
-            train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
-            data = cPickle.load(open(train_data))
-
-         else: # global version
-            pass
-
-
-      else:
-         train_data = '%s/exons_train_local.pickle' % genome_info.basedir
-         data = cPickle.load(open(train_data))
-
-      print 'train_data is %s' % train_data
-
-      Sequences = data['Train']       # dna sequences
-      Acceptors = data['TrainAcc']    # acceptor scores
-      Donors    = data['TrainDon']    # donor scores
-      Exons     = data['TrainExon']   # exon boundaries
-      Ests      = data['TrainEsts']   # est sequences
-     
-   # Lower all indices by one to convert matlab 
-   # to python indices
-
-   Exons -= 1
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/python/paths_load_data_solexa.py b/python/paths_load_data_solexa.py
deleted file mode 100644 (file)
index e9dd387..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import cPickle
-import pdb
-
-def paths_load_data_solexa(expt,genome_info,PAR):
-   # expt can be 'training','validation' or 'test'
-   assert expt in ['training','validation','test']
-
-   dna_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/allGenes.pickle'
-   est_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/filtered_reads'
-   tair7_seq_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'
-
-   tair7_seq = cPickle.load(open(tair7_seq_filename))
-   allGenes  = cPickle.load(open(dna_filename))
-
-   Sequences = []
-   Acceptors = []
-   Donors = []
-   Exons = []
-   Ests = []
-   Qualities = []
-
-   for line in open(est_filename):
-      line = line.strip()
-      chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id = line.split()
-      splitpos = int(splitpos)
-      length = int(length)
-      prb = [ord(elem)-50 for elem in prb]
-      cal = [ord(elem)-64 for elem in cal]
-      chastity = [ord(elem)+10 for elem in chastity]
-
-      assert len(prb) == len(seq)
-
-      currentGene = allGenes[gene_id]
-      seq = seq.lower()
-
-      try:
-         currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-         Sequences.append(currentSeq)
-      except:
-         continue
-         
-      currentSize = len(Sequences[-1])
-      #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
-      #Acceptors.append([-inf]*currentSize)
-      #Donors.append([-inf]*currentSize)
-      Acceptors.append([0]*currentSize)
-      Donors.append([0]*currentSize)
-
-      currentExons = zeros((len(currentGene.exons),2))
-      for idx in range(len(currentGene.exons)):
-         currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
-         currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
-      Exons.append(currentExons)
-      Ests.append(seq)
-      Qualities.append(prb)
-
-      if len(Sequences[-1]) == 2755:
-         Sequences = Sequences[:-1]
-         Acceptors = Acceptors[:-1]
-         Donors    = Donors[:-1]
-         Exons     = Exons[:-1]
-         Ests      = Ests[:-1]
-         Qualities = Qualities[:-1]
-   
-   print 'found %d examples' % len(Sequences)
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
diff --git a/python/penalty_lookup_new.py b/python/penalty_lookup_new.py
deleted file mode 100644 (file)
index 9837dd5..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-import pdb
-
-def penalty_lookup_new(penalty_struct, value):
-
-   limits = penalty_struct.limits
-   penalties = penalty_struct.penalties
-
-   if penalty_struct.transform == 'log':
-      value = math.log(value)
-
-   elif penalty_struct.transform == 'log(+3)':
-      value = math.log(value+3)
-
-   elif penalty_struct.transform == 'log(+1)':
-      value = math.log(value+1)
-
-   elif penalty_struct.transform == '':
-      pass
-
-   elif penalty_struct.transform == '(+3)':
-      value += 3
-         
-   elif penalty_struct.transform == 'mod3':
-     value = value%3
-
-   else:
-      assert False,'unknown penalty transform'
-
-   count = len([elem for elem in limits if elem <= value])
-
-   if count == 0:
-      pen = penalties[0]
-   elif count == len(limits):
-      pen=penalties[-1]
-   else:
-      pen = (penalties[count]*(value-limits[count-1]) + penalties[count-1]\
-         *(limits[count]-value)) / (limits[count]-limits[count-1])
-
-   return pen
diff --git a/python/qpalma.py b/python/qpalma.py
deleted file mode 100644 (file)
index 412a67b..0000000
+++ /dev/null
@@ -1,444 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# 
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-from SIQP_CPX import SIQPSolver
-
-from paths_load_data_pickle import *
-from paths_load_data_solexa import *
-
-from generateEvaluationData import *
-
-from computeSpliceWeights import *
-from set_param_palma import *
-from computeSpliceAlignWithQuality import *
-from penalty_lookup_new import *
-from compute_donacc import *
-from TrainingParam import Param
-from export_param import *
-
-import Configuration
-from Plif import Plf
-from Helpers import *
-
-def getQualityFeatureCounts(qualityPlifs):
-   weightQuality = qualityPlifs[0].penalties
-   for currentPlif in qualityPlifs[1:]:
-      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
-   return weightQuality 
-
-class QPalma:
-   """
-   A training method for the QPalma project
-   """
-   
-   def __init__(self):
-      self.ARGS = Param()
-
-      self.logfh = open('qpalma.log','w+')
-      gen_file= '%s/genome.config' % self.ARGS.basedir
-
-      ginfo_filename = 'genome_info.pickle'
-      self.genome_info = fetch_genome_info(ginfo_filename)
-
-      self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
-      #self.ARGS.train_with_splicesitescoreinformation = False
-
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-   def run(self):
-      # Load the whole dataset 
-      if Configuration.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         use_quality_scores = False
-      elif Configuration.mode == 'using_quality_scores':
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-
-         #Sequences_, Acceptors_, Donors_, Exons_, Ests_, Qualities_ = generateData(100)
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
-         use_quality_scores = True
-      else:
-         assert(False)
-
-      # number of training instances
-      N = len(Sequences) 
-      self.numExamples = N
-      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
-      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
-      self.plog('Number of training examples: %d\n'% N)
-      print 'Number of features: %d\n'% Configuration.numFeatures
-
-      iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
-      remove_duplicate_scores = Configuration.remove_duplicate_scores 
-      print_matrix            = Configuration.print_matrix 
-      anzpath                 = Configuration.anzpath 
-
-      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      param = Configuration.fixedParam 
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      # delete splicesite-score-information
-      if not self.ARGS.train_with_splicesitescoreinformation:
-         for i in range(len(Acceptors)):
-            if Acceptors[i] > -20:
-               Acceptors[i] = 1
-            if Donors[i] >-20:
-               Donors[i] = 1
-
-      # Initialize solver 
-      if Configuration.USE_OPT:
-         self.plog('Initializing problem...\n')
-         solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
-
-      # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*N 
-      # stores the gap for each example
-      gap      = [0.0]*N
-
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
-
-      donSP       = Configuration.numDonSuppPoints
-      accSP       = Configuration.numAccSuppPoints
-      lengthSP    = Configuration.numLengthSuppPoints
-      mmatrixSP   = Configuration.sizeMatchmatrix[0]\
-      *Configuration.sizeMatchmatrix[1]
-      numq        = Configuration.numQualSuppPoints
-      totalQualSP = Configuration.totalQualSuppPoints
-
-      currentPhi = zeros((Configuration.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      iteration_nr = 0
-      param_idx = 0
-      const_added_ctr = 0
-      while True:
-         if iteration_nr == iteration_steps:
-            break
-
-         for exampleIdx in range(self.numExamples):
-
-            if (exampleIdx%10) == 0:
-               print 'Current example nr %d' % exampleIdx
-
-            dna = Sequences[exampleIdx] 
-            est = Ests[exampleIdx] 
-
-            if Configuration.mode == 'normal':
-               quality = [40]*len(est)
-
-            if Configuration.mode == 'using_quality_scores':
-               quality = Qualities[exampleIdx]
-
-            exons = Exons[exampleIdx] 
-            # NoiseMatrix = Noises[exampleIdx] 
-            don_supp = Donors[exampleIdx] 
-            acc_supp = Acceptors[exampleIdx] 
-
-            if exons[-1,1] > len(dna):
-               continue
-
-            # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-            
-            # Calculate the weights
-            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-            if Configuration.mode == 'using_quality_scores':
-               totalQualityPenalties = param[-totalQualSP:]
-               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-            # Calculate w'phi(x,y) the total score of the alignment
-            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-            # The allWeights vector is supposed to store the weight parameter
-            # of the true alignment as well as the weight parameters of the
-            # num_path[exampleIdx] other alignments
-            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
-            allWeights[:,0] = trueWeight[:,0]
-
-            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
-            AlignmentScores[0] = trueAlignmentScore
-
-            ################## Calculate wrong alignment(s) ######################
-
-            # Compute donor, acceptor with penalty_lookup_new
-            # returns two double lists
-            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-            #myalign wants the acceptor site on the g of the ag
-            acceptor = acceptor[1:]
-            acceptor.append(-inf)
-
-            # for now we don't use donor/acceptor scores
-
-            #donor = [-inf] * len(donor)
-            #acceptor = [-inf] * len(donor)
-
-            dna = str(dna)
-            est = str(est)
-            dna_len = len(dna)
-            est_len = len(est)
-
-            ps = h.convert2SWIG()
-
-            prb = QPalmaDP.createDoubleArrayFromList(quality)
-            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-            mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
-
-            d_len = len(donor)
-            donor = QPalmaDP.createDoubleArrayFromList(donor)
-            a_len = len(acceptor)
-            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-            # Create the alignment object representing the interface to the C/C++ code.
-            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, use_quality_scores)
-
-            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
-            #print 'Calling myalign...'
-            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
-             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-             print_matrix)
-
-            #print 'After calling myalign...'
-            #print 'Calling getAlignmentResults...'
-
-            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
-            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
-            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
-            c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
-
-            c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
-
-            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-            c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-            #print 'After calling getAlignmentResults...'
-
-            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
-            newEstAlign    = zeros((est_len*num_path[exampleIdx],1))
-            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
-            newDPScores    = zeros((num_path[exampleIdx],1))
-            newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
-
-            #print 'newSpliceAlign'
-            for i in range(dna_len*num_path[exampleIdx]):
-               newSpliceAlign[i] = c_SpliceAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'newEstAlign'
-            for i in range(est_len*num_path[exampleIdx]):
-               newEstAlign[i] = c_EstAlign[i]
-            #   print '%f' % (spliceAlign[i])
-
-            #print 'weightMatch'
-            for i in range(mm_len*num_path[exampleIdx]):
-               newWeightMatch[i] = c_WeightMatch[i]
-            #   print '%f' % (weightMatch[i])
-
-            #print 'ViterbiScores'
-            for i in range(num_path[exampleIdx]):
-               newDPScores[i] = c_DPScores[i]
-
-
-            if use_quality_scores:
-               for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
-                  newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-            #  equals palma up to here
-
-            #print "Calling destructors"
-            del c_SpliceAlign
-            del c_EstAlign
-            del c_WeightMatch
-            del c_DPScores
-            del c_qualityPlifsFeatures
-            del currentAlignment
-
-            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
-            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
-            # Calculate weights of the respective alignments Note that we are
-            # calculating n-best alignments without hamming loss, so we
-            # have to keep track which of the n-best alignments correspond to
-            # the true one in order not to incorporate a true alignment in the
-            # constraints. To keep track of the true and false alignments we
-            # define an array true_map with a boolean indicating the
-            # equivalence to the true alignment for each decoded alignment.
-            true_map = [0]*(num_path[exampleIdx]+1)
-            true_map[0] = 1
-            path_loss   = [0]*(num_path[exampleIdx])
-
-            for pathNr in range(num_path[exampleIdx]):
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
-
-               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
-               for qidx in range(Configuration.totalQualSuppPoints):
-                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
-
-               #pdb.set_trace()
-
-               path_loss[pathNr] = 0
-               # sum up positionwise loss between alignments
-               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
-                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
-                     path_loss[pathNr] += 1
-
-               # Gewichte in restliche Zeilen der Matrix speichern
-               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
-               allWeights[:,pathNr+1] = wp
-
-               hpen = mat(h.penalties).reshape(len(h.penalties),1)
-               dpen = mat(d.penalties).reshape(len(d.penalties),1)
-               apen = mat(a.penalties).reshape(len(a.penalties),1)
-               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
-               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-               # Check wether scalar product + loss equals viterbi score
-               print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
-
-               distinct_scores = False
-               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
-                  distinct_scores = True
-               
-               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
-               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
-                  pdb.set_trace()
-
-               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
-               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-                  true_map[pathNr+1] = 1
-
-               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
-
-               # the true label sequence should not have a larger score than the maximal one WHYYYYY?
-               # this means that all n-best paths are to close to each other 
-               # we have to extend the n-best search to a (n+1)-best
-               if len([elem for elem in true_map if elem == 1]) == len(true_map):
-                  num_path[exampleIdx] = num_path[exampleIdx]+1
-
-               # Choose true and first false alignment for extending A
-               firstFalseIdx = -1
-               for map_idx,elem in enumerate(true_map):
-                  if elem == 0:
-                     firstFalseIdx = map_idx
-                     break
-
-               # if there is at least one useful false alignment add the
-               # corresponding constraints to the optimization problem
-               if firstFalseIdx != -1:
-                  trueWeights       = allWeights[:,0]
-                  firstFalseWeights = allWeights[:,firstFalseIdx]
-                  differenceVector  = trueWeights - firstFalseWeights
-                  #pdb.set_trace()
-
-                  if Configuration.USE_OPT:
-                     const_added = solver.addConstraint(differenceVector, exampleIdx)
-                     const_added_ctr += 1
-               #
-               # end of one example processing 
-               #
-
-            # call solver every nth example //added constraint
-            if exampleIdx != 0 and exampleIdx % 20 == 0 and Configuration.USE_OPT:
-               objValue,w,self.slacks = solver.solve()
-      
-               print "objValue is %f" % objValue
-
-               sum_xis = 0
-               for elem in self.slacks:
-                  sum_xis +=  elem
-   
-               for i in range(len(param)):
-                  param[i] = w[i]
-
-               #pdb.set_trace()
-               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-               param_idx += 1
-               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-         #
-         # end of one iteration through all examples
-         #
-         iteration_nr += 1
-
-      #
-      # end of optimization 
-      #  
-      print 'Training completed'
-
-      pa = para()
-      pa.h = h
-      pa.d = d
-      pa.a = a
-      pa.mmatrix = mmatrix
-      pa.qualityPlifs = qualityPlifs
-
-      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-      #cPickle.dump(pa,open('elegans.param','w+'))
-      self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
-   if not os.path.exists(ginfo_filename):
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
-      return ginfo['genome_info']
-
-   else:
-      return cPickle.load(open(ginfo_filename))
-
-if __name__ == '__main__':
-   qpalma = QPalma()
-   qpalma.run()
diff --git a/python/qpalma_predict.py b/python/qpalma_predict.py
deleted file mode 100644 (file)
index db3c1cc..0000000
+++ /dev/null
@@ -1,382 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# 
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-from SIQP_CPX import SIQPSolver
-
-from paths_load_data_pickle import *
-from paths_load_data_solexa import *
-
-from generateEvaluationData import *
-
-from computeSpliceWeights import *
-from set_param_palma import *
-from computeSpliceAlignWithQuality import *
-from penalty_lookup_new import *
-from compute_donacc import *
-from TrainingParam import Param
-from export_param import *
-
-import Configuration as Conf
-from Plif import Plf
-from Helpers import *
-
-def getQualityFeatureCounts(qualityPlifs):
-   weightQuality = qualityPlifs[0].penalties
-   for currentPlif in qualityPlifs[1:]:
-      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
-   return weightQuality 
-
-class QPalma:
-   """
-   A training method for the QPalma project
-   """
-   
-   def __init__(self):
-      self.ARGS = Param()
-
-      self.logfh = open('qpalma.log','w+')
-      gen_file= '%s/genome.config' % self.ARGS.basedir
-
-      ginfo_filename = 'genome_info.pickle'
-      self.genome_info = fetch_genome_info(ginfo_filename)
-
-      self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
-      #self.ARGS.train_with_splicesitescoreinformation = False
-
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-   def run(self):
-      # Load the whole dataset 
-      if Conf.mode == 'normal':
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         use_quality_scores = False
-
-      elif Conf.mode == 'using_quality_scores':
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         pdb.set_trace()
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
-         use_quality_scores = True
-      else:
-         assert(False)
-
-      # number of training instances
-      N = len(Sequences) 
-      self.numExamples = N
-      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
-      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
-      self.plog('Number of training examples: %d\n'% N)
-      print 'Number of features: %d\n'% Conf.numFeatures
-
-      remove_duplicate_scores = Conf.remove_duplicate_scores 
-      print_matrix            = Conf.print_matrix 
-      anzpath                 = Conf.anzpath 
-
-      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
-      #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/param_16.pickle'
-      param = load_param(param_filename)
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
-      # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*N 
-
-      #############################################################################################
-      # Training
-      #############################################################################################
-      self.plog('Starting training...\n')
-
-      donSP       = Conf.numDonSuppPoints
-      accSP       = Conf.numAccSuppPoints
-      lengthSP    = Conf.numLengthSuppPoints
-      mmatrixSP   = Conf.sizeMatchmatrix[0]\
-      *Conf.sizeMatchmatrix[1]
-      numq        = Conf.numQualSuppPoints
-      totalQualSP = Conf.totalQualSuppPoints
-
-      currentPhi = zeros((Conf.numFeatures,1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      for exampleIdx in range(self.numExamples):
-
-         dna = Sequences[exampleIdx] 
-         est = Ests[exampleIdx] 
-
-         if Conf.mode == 'normal':
-            quality = [40]*len(est)
-
-         if Conf.mode == 'using_quality_scores':
-            quality = Qualities[exampleIdx]
-
-         exons = Exons[exampleIdx] 
-         # NoiseMatrix = Noises[exampleIdx] 
-         don_supp = Donors[exampleIdx] 
-         acc_supp = Acceptors[exampleIdx] 
-
-         if exons[-1,1] > len(dna):
-            continue
-
-         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-         
-         # Calculate the weights
-         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-         if Conf.mode == 'using_quality_scores':
-            totalQualityPenalties = param[-totalQualSP:]
-            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-         # Calculate w'phi(x,y) the total score of the alignment
-         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-         # The allWeights vector is supposed to store the weight parameter
-         # of the true alignment as well as the weight parameters of the
-         # 1 other alignments
-         allWeights = zeros((Conf.numFeatures,1+1))
-         allWeights[:,0] = trueWeight[:,0]
-
-         AlignmentScores = [0.0]*(1+1)
-         AlignmentScores[0] = trueAlignmentScore
-
-         ################## Calculate wrong alignment(s) ######################
-
-         # Compute donor, acceptor with penalty_lookup_new
-         # returns two double lists
-         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-         #myalign wants the acceptor site on the g of the ag
-         acceptor = acceptor[1:]
-         acceptor.append(-inf)
-
-         # for now we don't use donor/acceptor scores
-         #donor = [-inf] * len(donor)
-         #acceptor = [-inf] * len(donor)
-
-         dna = str(dna)
-         est = str(est)
-         dna_len = len(dna)
-         est_len = len(est)
-
-         ps = h.convert2SWIG()
-
-         prb = QPalmaDP.createDoubleArrayFromList(quality)
-         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
-         d_len = len(donor)
-         donor = QPalmaDP.createDoubleArrayFromList(donor)
-         a_len = len(acceptor)
-         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-         # Create the alignment object representing the interface to the C/C++ code.
-         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
-
-         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
-         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-         currentAlignment.myalign(1, dna, dna_len,\
-          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
-          print_matrix)
-
-         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
-         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
-         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
-         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
-         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
-         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
-         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
-         newSpliceAlign = zeros((dna_len,1))
-         newEstAlign    = zeros((est_len,1))
-         newWeightMatch = zeros((mm_len,1))
-         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
-         #print 'newSpliceAlign'
-         for i in range(dna_len):
-            newSpliceAlign[i] = c_SpliceAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'newEstAlign'
-         for i in range(est_len):
-            newEstAlign[i] = c_EstAlign[i]
-         #   print '%f' % (spliceAlign[i])
-
-         #print 'weightMatch'
-         for i in range(mm_len):
-            newWeightMatch[i] = c_WeightMatch[i]
-         #   print '%f' % (weightMatch[i])
-
-         newDPScores = c_DPScores[0]
-
-         for i in range(Conf.totalQualSuppPoints):
-            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
-         #  equals palma up to here
-
-         #print "Calling destructors"
-         del c_SpliceAlign
-         del c_EstAlign
-         del c_WeightMatch
-         del c_DPScores
-         del c_qualityPlifsFeatures
-         del currentAlignment
-
-         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
-         newWeightMatch = newWeightMatch.reshape(1,mm_len)
-         # Calculate weights of the respective alignments Note that we are
-         # calculating n-best alignments without hamming loss, so we
-         # have to keep track which of the n-best alignments correspond to
-         # the true one in order not to incorporate a true alignment in the
-         # constraints. To keep track of the true and false alignments we
-         # define an array true_map with a boolean indicating the
-         # equivalence to the true alignment for each decoded alignment.
-         true_map = [0]*2
-         true_map[0] = 1
-         pathNr = 0
-
-         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
-         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
-         for qidx in range(Conf.totalQualSuppPoints):
-            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
-         # Gewichte in restliche Zeilen der Matrix speichern
-         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
-         allWeights[:,pathNr+1] = wp
-
-         hpen = mat(h.penalties).reshape(len(h.penalties),1)
-         dpen = mat(d.penalties).reshape(len(d.penalties),1)
-         apen = mat(a.penalties).reshape(len(a.penalties),1)
-         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
-         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-         # Check wether scalar product + loss equals viterbi score
-         print '%f vs %f' % (newDPScores, AlignmentScores[pathNr+1])
-
-         # if the pathNr-best alignment is very close to the true alignment consider it as true
-         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-            true_map[pathNr+1] = 1
-
-         pdb.set_trace()
-      
-         evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
-
-      print 'Prediction completed'
-      self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
-   if not os.path.exists(ginfo_filename):
-      cmd = ['']*4
-      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
-      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
-      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
-      cmd[3] = 'save genome_info.mat genome_info'  
-      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
-      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      assert err == '', 'An error occured!\n%s'%err
-
-      ginfo = scipy.io.loadmat('genome_info.mat')
-      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
-      return ginfo['genome_info']
-
-   else:
-      return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-
-
-   param = zeros((Conf.numFeatures,1))
-   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-   for idx in range(len(qualityPlifs)):
-      currentPlif       = qualityPlifs[idx]
-      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
-      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
-   return param
-
-def load_param(filename):
-   param = None 
-   #try:
-   #   para = cPickle.load(open(filename))
-   #except:
-   #   print 'Error: Could not open parameter file!'
-   #   sys.exit(1)
-   #
-   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
-   param = cPickle.load(open(filename))
-   return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
-   newExons = []
-   oldElem = -1
-   SpliceAlign = SpliceAlign.flatten().tolist()[0]
-   SpliceAlign.append(-1)
-   for pos,elem in enumerate(SpliceAlign):
-      if pos == 0:
-         oldElem = -1
-      else:
-         oldElem = SpliceAlign[pos-1]
-
-      if oldElem != 0 and elem == 0:
-         newExons.append(pos)
-
-      if oldElem == 0 and elem != 0:
-         newExons.append(pos-1)
-
-   pdb.set_trace()
-
-if __name__ == '__main__':
-   qpalma = QPalma()
-   qpalma.run()
diff --git a/python/set_param_palma.py b/python/set_param_palma.py
deleted file mode 100644 (file)
index f760031..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-import numpy.matlib
-import QPalmaDP
-import pdb
-import Configuration
-from Plif import *
-
-def set_params_pa():
-   h = plf()
-   h.len = int(model.intron_len_bins) 
-   h.limits = model.intron_len_limits 
-   h.penalties = model.intron_len_penalties 
-   h.name = 'h'
-   h.max_len   = int(max_intron_len)
-   h.min_len   = int(min_intron_len)
-   h.transform = model.intron_len_transform
-
-   d = plf()
-   d.len = int(model.donor_bins)
-   d.limits = model.donor_limits
-   d.penalties = model.donor_penalties 
-   d.name = 'd'
-   d.max_len   = 100
-   d.min_len   = -100
-
-   a = plf()
-   a.len = int(model.acceptor_bins)
-   a.limits = model.acceptor_limits 
-   a.penalties = model.acceptor_penalties 
-   a.name = 'a'
-   a.max_len   = 100
-   a.min_len   = -100
-
-   mmatrix = model.substitution_matrix 
-
-
-def set_param_palma(param, train_with_intronlengthinformation,\
-                                               min_intron_len=None, max_intron_len=None, min_svm_score=None, max_svm_score=None):
-
-   print 'Setting parameters ...'
-   
-   if min_intron_len == None:
-      if train_with_intronlengthinformation:
-         min_intron_len=20
-         max_intron_len=1000
-      else:
-         min_intron_len = 1
-         max_intron_len = 2
-
-   if min_intron_len != None and max_intron_len != None:
-      min_svm_score=-5
-      max_svm_score=5
-
-   h = Plf()
-   d = Plf()
-   a = Plf()
-   qualityPlifs = [None]*Configuration.numQualPlifs
-
-   donSP       = Configuration.numDonSuppPoints
-   accSP       = Configuration.numAccSuppPoints
-   lengthSP    = Configuration.numLengthSuppPoints
-   mmatrixSP   = Configuration.sizeMatchmatrix[0]\
-   *Configuration.sizeMatchmatrix[1]
-   totalQualSP = Configuration.totalQualSuppPoints
-
-   ####################
-   # Gapfunktion
-   ####################
-   h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
-   h.penalties = param[0:lengthSP].flatten().tolist()[0]
-   #h.transform = '+1' 
-   h.transform = '' 
-   h.name      = 'h' 
-   h.max_len   =  100000 
-   h.min_len   =  4 
-   h.id        = 1 
-   h.use_svm   = 0 
-   h.next_id   = 0 
-
-
-   ####################
-   # Donorfunktion
-   ####################
-   d.limits    = linspace(min_svm_score,max_svm_score,30) 
-   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
-   #d.transform = '+1' 
-   d.transform = '' 
-   d.name      = 'd' 
-   d.max_len   = 100 
-   d.min_len   = -100 
-   d.id        = 1 
-   d.use_svm   = 0 
-   d.next_id   = 0 
-
-
-   ####################
-   # Acceptorfunktion
-   ####################
-   a.limits    = linspace(min_svm_score,max_svm_score,30) 
-   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
-   #a.transform = '+1' 
-   a.transform = '' 
-   a.name      = 'a' 
-   a.max_len   = 100  
-   a.min_len   = -100  
-   a.id        = 1 
-   a.use_svm   = 0 
-   a.next_id   = 0 
-
-   ####################
-   # Matchmatrix
-   ####################
-   mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
-   mmatrix.reshape(Configuration.sizeMatchmatrix[0],Configuration.sizeMatchmatrix[1]) 
-
-   ####################
-   # Quality Plifs
-   ####################
-   for idx in range(Configuration.numQualPlifs):
-      currentPlif = Plf()
-      currentPlif.limits    = linspace(Configuration.min_qual,Configuration.max_qual,Configuration.numQualSuppPoints) 
-      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Configuration.numQualSuppPoints)
-      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Configuration.numQualSuppPoints)
-      currentPlif.penalties = param[begin:end].flatten().tolist()[0]
-      currentPlif.transform = '' 
-      currentPlif.name      = 'q' 
-      currentPlif.max_len   = Configuration.max_qual 
-      currentPlif.min_len   = Configuration.min_qual
-      qualityPlifs[idx] = currentPlif
-
-   return h,d,a,mmatrix,qualityPlifs
-
-if __name__ == '__main__':
-   min_svm_score=-5
-   max_svm_score=5
-   print linspace(min_svm_score,max_svm_score,30) 
diff --git a/python/tools/PyGff.py b/python/tools/PyGff.py
deleted file mode 100644 (file)
index 2d26c8a..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-class Gene:
-   
-   def __init__(self,chr,begin,end,strand,id):
-      assert chr != ''
-      assert begin >= 0 and begin <= end and end >= 0
-      assert strand in ['+','-']
-
-      self.chromosome = chr
-      self.start = begin
-      self.stop = end
-      self.strand = strand
-      self.exons = []
-      self.fiveUTR = []
-      self.threeUTR = []
-      self.id = id
-
-   def addExon(self,start,stop):
-      self.exons.append((start,stop))
-
-   def load(self,filename):
-      pass
-
-   def save(self,filename):
-      pass
diff --git a/python/tools/Solexa.py b/python/tools/Solexa.py
deleted file mode 100644 (file)
index ad48439..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*-
-
-class Read:
-   
-   chromosome = ''
-   position = 0
-   sequence = ''
-   strand = ''
-
-   # mismatch = 0
-   # repeats = 0
-   # length = 0
-   # deletion = 0
-
-   prob = None
-   calibratedProb = None
-   chastity = None
-
-   def __init__(self,chr,pos,seq,strand,prob,calibrated,chastity):
-      self.chromosome = chr
-      self.position = pos
-      self.sequence = seq
-      self.strand = strand
-      self.prob = prob
-      self.calibratedProb = calibrated
-      self.chastity = chastity
diff --git a/python/tools/createTestSet.py b/python/tools/createTestSet.py
deleted file mode 100644 (file)
index 67f6bf9..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os.path
-import csv
-from PyGff import *
-import cPickle
-
-info=\
-"""
-You have to supply two files. One containing the gff information and the other
-one containing the information of the Solexa(R) reads.
-
-Usage: ./createTestSet.py gff.pickle reads.txt
-"""
-
-doc=\
-"""
-Make sure that you split the data into chromosome files beforehand because
-this method does not check the chromosome info
-"""
-
-def printRow(row):
-   print "".join([elem+' ' for elem in row]).strip()
-
-def check(annotation,reads):
-   skippedReadsCounter = 0
-
-   reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
-   for pos,row in enumerate(reader):
-      if len(row) != 12:
-         skippedReadsCounter += 1
-         continue
-
-      if pos % 10000 == 0:
-         print >> sys.stderr, pos
-
-      chr = row[0]
-      pos = int(row[1])
-      readStart = pos
-      readStop = pos+35
-
-      for gene in annotation:
-         # read inside current gene positions
-
-         if gene.start <= readStart and pos+35 <= gene.stop:
-            overlapping = True
-            
-            for exonStart,exonStop in gene.exons:
-               if exonStart <= readStart and readStop <= exonStop:
-                  overlapping = False
-                  break
-         
-            if overlapping:
-               printRow(row)
-               continue
-
-         # read is overlapping with gene boundaries
-         if pos < gene.start and pos+35 > gene.start:
-            printRow(row)
-            continue
-
-         if pos < gene.stop and pos+35 > gene.stop:
-            printRow(row)
-            continue
-
-   print skippedReadsCounter     
-      
-if __name__ == '__main__':
-   assert len(sys.argv) >= 3, info
-   annotFile = sys.argv[1]
-   readsFile = sys.argv[2]
-   assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
-   assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
-   print >> sys.stderr, doc
-
-   annotation = cPickle.load(open(annotFile))
-   reads = open(readsFile)
-
-   check(annotation,reads)
diff --git a/python/tools/parseGff.py b/python/tools/parseGff.py
deleted file mode 100644 (file)
index b01bbdc..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os.path
-import csv
-import re
-from PyGff import *
-import cPickle
-import copy
-
-def parse(gff_fid):
-   reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
-
-   allGenes = {}
-   currentGene = None
-
-   rx = re.compile('ID=[^;]*;',re.DOTALL)
-
-   for row in reader:
-      assert len(row) == 9
-      chr = row[0]
-      id = row[2]
-      start = int(row[3])
-      stop = int(row[4])
-      strand = row[6]
-      desc = row[8]
-
-      if id == 'chromosome':
-         continue
-
-      if id == 'gene':
-         if currentGene != None:
-            allGenes[currentGene.id] = currentGene
-         
-         desc = rx.search(desc).group()[3:-1]
-         print desc
-         currentGene = Gene(chr,start,stop,strand,desc)
-
-      elif id == 'five_prime_UTR':
-         pass
-
-      elif id == 'three_prime_UTR':
-         pass
-
-      elif id == 'mRNA':
-         pass
-
-      elif id == 'exon':
-         assert currentGene != None
-         currentGene.addExon(start,stop)
-
-      elif id == 'CDS':
-         pass
-
-      elif id == 'ncRNA':
-         pass
-
-      elif id == 'pseudogenic_exon':
-         pass
-
-      elif id == 'pseudogenic_transcript':
-         pass
-
-      elif id == 'miRNA':
-         pass
-
-      elif id == 'rRNA':
-         pass
-
-      elif id == 'snoRNA':
-         pass
-
-      elif id == 'snRNA':
-         pass
-
-      elif id == 'tRNA':
-         pass
-
-      elif id == 'pseudogene':
-         if currentGene != None:
-            allGenes[currentGene.id] = currentGene
-            currentGene = None
-      else:
-         assert False, 'Error: Unknown identifier \'%s\'' % id
-
-   if currentGene != None:
-      allGenes[currentGene.id] = currentGene
-
-   return allGenes
-      
-if __name__ == '__main__':
-   assert len(sys.argv) >= 3
-   annotFile = sys.argv[1]
-   pickleFile = sys.argv[2]
-   assert os.path.exists(annotFile)
-   assert not os.path.exists(pickleFile)
-
-   gff_fid = open(annotFile)
-   pickle_fid = open(pickleFile,'w+')
-   allGenes = parse(gff_fid)
-   for key,val in allGenes.iteritems():
-      print key
-   cPickle.dump(allGenes,pickle_fid)
diff --git a/python/tools/parseSolexa.py b/python/tools/parseSolexa.py
deleted file mode 100644 (file)
index dd62b37..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*-
-
-import sys
-import os.path
-import cPickle
-import csv
-from Solexa import Read
-
-def parse(solexa_fid,pickle_fid):
-   
-   reader = csv.reader(solexa_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
-
-   allReads = []
-   currentRead = None
-
-   stmap = {'P':'-','D':'+'}
-
-   for row in reader:
-      assert len(row) == 12
-      chr = row[0]
-      pos = row[1]
-      seq = row[2]
-      id = int(row[3])
-      strand = stmap[row[4]]
-      mismatch = row[5]
-      repeat = row[6]
-      length = row[7]
-      deletion = row[8]
-
-      prob = row[9]
-      calibratedProb = row[10]
-      chastity = row[11]
-
-      prob = [ord(elem)-50 for elem in prob]
-      calibratedProb = [ord(elem)-64 for elem in calibratedProb]
-      chastity = [ord(elem)+10 for elem in chastity]
-
-      currentRead = Read(chr,pos,seq,strand,prob,calibratedProb,chastity)
-      allReads.append(currentRead)
-
-   cPickle.dump(allReads,pickle_fid)
-
-if __name__ == '__main__':
-   assert len(sys.argv) >= 3
-   solexaFile = sys.argv[1]
-   pickleFile = sys.argv[2]
-   assert os.path.exists(solexaFile)
-   assert not os.path.exists(pickleFile)
-
-   solexa_fid = open(solexaFile)
-   pickle_fid = open(pickleFile,'w+')
-   parse(solexa_fid,pickle_fid)
diff --git a/qpalma/Configuration.py b/qpalma/Configuration.py
new file mode 100644 (file)
index 0000000..7d82c0c
--- /dev/null
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy.matlib
+
+fixedParamQ = numpy.matlib.mat(
+[[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
+       [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
+       [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
+       [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
+       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+       [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+       [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+       [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+       [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+       [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+       [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+       [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+       [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+       [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+       [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+       [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+       [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+       [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+       [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+       [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+       [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+       [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+       [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+       [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
+       [ 0.4177651 ], [ 0.01360547], [ 0.29069319]
+       ])
+
+fixedParam = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
+  [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
+  [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
+  [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
+  [ 0.69677236], [ 0.41673747], [ 0.564245  ], [ 0.37613432], [ 0.88805642],
+  [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+  [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+  [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+  [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+  [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+  [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+  [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+  [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+  [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+  [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+  [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+  [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+  [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+  [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+  [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+  [ 0.72706009], [ 0.087196  ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+  [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+  [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+  [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+  [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
+  [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
+
+
+###########################################################
+#
+# The parameters for the QPalma algorithm
+#
+#
+
+C = 100000.0
+
+# 'normal' means work like Palma 'using_quality_scores' means work like Palma
+# plus using sequencing quality scores
+
+#mode = 'normal'
+
+mode = 'using_quality_scores'
+
+# When using quality scores our scoring function is defined as
+#
+# f: S_e x R x S -> R
+#
+# where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
+# 
+# as opposed to a usage without quality scores when we only have
+# 
+# f: S x S -> R 
+#
+# The matrix of plifs is defined as follows:
+#
+#  elem |   -  a  c  g  t  n
+#  -------------------------  
+#  idx  |   0  1  2  3  4  5
+#   
+# 
+#              dna
+#
+#           -  a  c  g  t  n
+#        a  
+# est    c
+#        g
+#        t
+#        n
+#  
+# so the index can be calculated as (estnum-1)*6 + dnanum.
+#
+# At ests we do not have gaps with quality scores so we look up the matchmatrix
+# instead.
+
+numDonSuppPoints     = 30
+numAccSuppPoints     = 30
+numLengthSuppPoints  = 30 
+numQualSuppPoints    = 2
+
+min_qual = -1
+max_qual = 40
+
+USE_OPT = True
+
+if mode == 'normal':
+   sizeMatchmatrix   = (6,6)
+   estPlifs = 0
+   dnaPlifs = 0
+   numQualPlifs = estPlifs*dnaPlifs
+elif mode == 'using_quality_scores':
+   sizeMatchmatrix   = (6,1)
+   estPlifs = 5
+   dnaPlifs = 6
+   numQualPlifs = estPlifs*dnaPlifs
+else:
+   assert False, 'Wrong operation mode specified'
+
+totalQualSuppPoints = numQualPlifs*numQualSuppPoints
+
+numFeatures = numDonSuppPoints + numAccSuppPoints\
++ numLengthSuppPoints + sizeMatchmatrix[0]*sizeMatchmatrix[1] + totalQualSuppPoints 
+
+iter_steps = 40
+remove_duplicate_scores = False
+print_matrix            = False
+anzpath                 = 2
+
+if mode == 'normal':
+   fixedParam = fixedParam[:numFeatures]
+elif mode == 'using_quality_scores':
+   fixedParam = fixedParamQ[:numFeatures]
+else:
+   assert False, 'Wrong operation mode specified'
+
+# Check for valid parameters
+assert numQualPlifs       >= 0
+assert numDonSuppPoints    > 1
+assert numAccSuppPoints    > 1
+assert numLengthSuppPoints > 1 
+assert numQualSuppPoints   > 1
diff --git a/qpalma/Plif.py b/qpalma/Plif.py
new file mode 100644 (file)
index 0000000..a52594f
--- /dev/null
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import QPalmaDP
+
+class Plf: 
+   """
+   means piecewise linear function
+   """
+
+   def __init__(self,num=None):
+      if num == None:
+         self.len = 0
+         self.limits = []
+         self.penalties = []
+         self.transform = ''
+         self.name = ''
+         self.max_len = 0
+         self.min_len = 0
+      else:
+         self.len = num
+         self.limits = linspace(0,40,num) 
+         self.penalties = [0]*num
+         self.transform = ''
+         self.name = ''
+         self.max_len = 0
+         self.min_len = 0
+         
+
+   def convert2SWIG(self):
+      ps = QPalmaDP.penalty_struct()
+
+      ps.len = len(self.limits)
+   
+      ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
+      ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
+
+      ps.max_len = self.max_len
+      ps.min_len = self.min_len
+      ps.transform = 0
+      ps.name = self.name
+
+      return ps
+
+
+def base_coord(b1,b2):
+   b1 = b1.lower()
+   b2 = b2.lower()
+   assert b1 in ['a','c','g','t','n']
+   assert b2 in ['a','c','g','t','n']
+   cb = {'a':0, 'c':1, 'g':2, 't':3, 'n':4}
+
+   b1 = cb[b1]
+   b2 = cb[b2]
+
+   return b1*5+b2
+
+def linspace(a,b,n):
+   intervalLength = b-a
+   stepSize = 1.0*intervalLength / (n-1)
+   
+   interval = [0]*n
+   interval[0] = a
+   interval[-1] = b
+   for i in range(1,n-1):
+      interval[i] = a+(i*stepSize)
+
+   return interval
+
+def logspace(a,b,n):
+   interval = [0]*n
+   begin = 10.0**a
+   end = 10.0**b
+   intervalSize = 1.0*(b-a)/(n-1)
+   interval[0] = begin
+   interval[-1] = end
+
+   for i in range(1,n-1):
+      interval[i] = 10**(a+i*intervalSize)
+
+   return interval
+
+def log10(x):
+   return math.log(x)/math.log(10)
diff --git a/qpalma/SIQP.py b/qpalma/SIQP.py
new file mode 100644 (file)
index 0000000..f5008d9
--- /dev/null
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import sys
+import os
+import os.path
+import logging
+import math
+import pdb
+
+import cvxopt.base as cb
+
+import logging
+logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
+
+def asymCoords(t):
+   size_upstream = t[0]
+   size_downstream = t[1]
+   assert size_upstream >= 1 and size_upstream <= 39
+   assert size_downstream >= 1 and size_downstream <= 39
+   
+   if size_downstream > size_upstream:
+      asym_small = size_upstream
+      asym_big   = size_downstream
+   else:
+      asym_small = size_downstream
+      asym_big   = size_upstream
+
+   asym_offset = 0
+   for r in range(asym_small):
+      asym_offset += (40-r)-r;
+
+   return (asym_offset+asym_big)-1
+
+def calcTuples(i,j):
+   return filter(lambda t: t[0]<=t[1] and t[0]>0 and t[0]<40 and t[1]>0 and t[1]<40,[(i+1,j),(i-1,j),(i,j+1),(i,j-1)])
+
+class SIQP:
+   """
+   This class is a wrapper for the cvxopt qp solver.
+   It has the purpose to support an iterative addition of
+   constraints to the original problem.
+
+   We want to solve a problem of the form
+
+   min  1/2 * C * x' * P * x + q' * x
+    x
+
+    s.t. 
+      < w,f+ > - < w,f- > >= 1 - xi_i    for all i, for all f-
+
+   where x is a vector storing information on the parameters w_i and the slacks xi_i
+   so x = [w;xi] 
+
+   """
+
+   def __init__(self,fSize,numExamples,c):
+      assert fSize > 0, 'You have to have at least one feature!'
+      assert numExamples > 0, 'You have to have at least one example!'
+      self.numFeatures = fSize
+      self.numExamples = numExamples
+      self.C = c
+      self.EPSILON = 10e-15
+
+      logging.debug("Starting SIQP solver with %d examples. A feature space dim. of %d.", numExamples,fSize)
+      logging.debug("Regularization parameters are C=%f."%(self.C))
+
+      self.old_w = None
+
+      self.dimP = self.numFeatures + self.numExamples
+      self.createRegularizer()
+
+   def createUnitRegularizer(self):
+      # set regularizer to zero
+      self.P = cb.matrix(0.0,(self.dimP,self.dimP))
+      for i in range(self.numFeatures):
+         self.P[i,i] = 1.0
+      # end of zeroing regularizer
+
+   def createRegularizer(self):
+      self.createUnitRegularizer()
+
+      q_array = [0.0]*self.numFeatures + [1.0]*self.numExamples
+      self.q = cb.matrix(q_array,(self.numFeatures+self.numExamples,1),'d')
+      self.q = self.C * self.q
+
+      self.G = cb.matrix(0.0,(self.numExamples,self.numFeatures+self.numExamples))
+      for i in range(self.numExamples):
+         self.G[i,self.numFeatures+i] = -1
+
+      self.h = cb.matrix(0,(self.numExamples,1),'d')
+
+if __name__ == '__main__':
+   sq = SIQP(3,2,100.0)
+
+   """
+   function [C_qp_penalties,C_qp_smoothness,C_lp_smoothness,C_qp_smallness,column_eps,Q,f,LB,UB] = paths_create_qp(N,C)
+   % function [C_qp_penalties,C_qp_smoothness,C_lp_smoothness,C_qp_smallness,column_eps,Q,f,LB,UB] = paths_create_qp(N,C)
+
+   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+   % qp_solve: min ( <res, f'> + 1/2 res' Q res)
+   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+   C_qp_penalties  = C ;    %was 0.01
+   C_qp_smoothness = C ;    %was 0.01
+   C_lp_smoothness = 0.00 ; %linear problem parameters
+   C_qp_smallness  = 1e-6 ; %was 1e-6 
+   column_eps      = 1e-3 ;
+
+   Q=zeros(2*126+N) ; % 1/2 * res' * Q * res
+   Q(1:90,1:90)           = C_qp_smallness*diag(ones(1,90)) ;
+   Q(91:126,91:126)       = C_qp_penalties*diag(ones(1,36)) ;
+   Q(127:2*126,127:2*126) = C_qp_smoothness*diag(ones(1,126)) ;
+   f = [zeros(1,126) C_lp_smoothness*ones(1,126) ones(1,N)/N] ; % <res, f'>
+
+
+
+   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+   % qp_solve: LB <= res <= UB
+   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+   INF = 1e20 ;
+   LB = [-INF*ones(1,126) zeros(1,126) zeros(1,N)] ; % lower bound for res
+   UB = [ INF*ones(1,2*126) INF*ones(1,N)] ; % upper bound for res
+
+   """
diff --git a/qpalma/SIQP_CPX.py b/qpalma/SIQP_CPX.py
new file mode 100644 (file)
index 0000000..b6635f6
--- /dev/null
@@ -0,0 +1,318 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import sys
+import os
+import os.path
+import logging
+import math
+
+import cvxopt.base as cb
+
+import logging
+logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
+
+import CPX
+import numpy as ny
+import numpy.matlib as nyml
+from cplex_const import *
+
+from SIQP import SIQP
+
+OPTIMAL = [CPX_STAT_OPTIMAL, CPXMIP_OPTIMAL, CPXMIP_OPTIMAL_TOL]
+UNBOUNDED = [CPX_STAT_UNBOUNDED, CPXMIP_UNBOUNDED]
+INFEASIBLE = [CPX_STAT_INFEASIBLE, CPXMIP_INFEASIBLE, CPXMIP_INForUNBD]
+
+class FloatMatrix:
+   
+   TYPE = nyml.float64
+
+   def __init__(self,list):
+      self.data = nyml.mat(list, dtype=self.TYPE).T
+
+class IntMatrix:
+   
+   TYPE = nyml.int32
+
+   def __init__(self,list):
+      self.data = nyml.mat(list, dtype=self.TYPE).T
+
+class CharMatrix:
+   
+   TYPE = '|S1'
+
+   def __init__(self,list):
+      self.data = nyml.mat(list, dtype=self.TYPE).T
+
+class SIQPSolver(SIQP):
+   """
+   This algorithm solves the problem 
+
+   min 1/2 x'Px + q'x
+   s.t. 
+         Gx >= h
+   """
+
+   # Define types and constants
+   Inf   = CPX_INFBOUND
+
+   def __init__(self,fSize,numExamples,c,proto):
+      SIQP.__init__(self,fSize,numExamples,c)
+      self.numFeatures = fSize
+      self.numExamples = numExamples
+      self.numVariables = self.numFeatures + self.numExamples
+      self.old_w = None
+      self.old_objective_value = -(sys.maxint-1)
+
+      self.addedConstraints = 0
+
+      self.protocol = proto
+
+      self.objsen = CPX_MIN
+
+      # open CPLEX environment, set some parameters
+      self.env = CPX.openCPLEX()
+
+      CPX.setintparam(self.env, CPX_PARAM_SCRIND, CPX_ON) # print >> self.protocol, info to screen
+      CPX.setintparam(self.env, CPX_PARAM_DATACHECK, CPX_ON)
+
+      # create CPLEX problem, add objective and constraints to it
+      self.lp = CPX.createprob(self.env, 'test1')
+
+      self.obj = FloatMatrix([0.0]*self.numFeatures + [1.0]*self.numExamples).data
+      cFactor = 1.0*self.C/self.numExamples
+      self.obj =  cFactor * self.obj
+      assert self.obj.shape == (self.numVariables,1)
+      
+      self.ub = FloatMatrix([self.Inf] * self.numVariables).data
+      self.lb = FloatMatrix([-self.Inf] * self.numFeatures + [0.0]*self.numExamples).data
+
+      self.matbeg = IntMatrix([1]*self.numVariables).data
+      self.matbeg[0] = 0
+
+      self.matcnt = IntMatrix([0]*self.numVariables).data
+      self.matcnt[0] = 1
+
+      self.matind = IntMatrix([0]).data
+      self.matval = FloatMatrix([0.0]).data
+
+      self.rhs = FloatMatrix([0.0]*self.numVariables).data
+      self.sense = CharMatrix(['G']*self.numVariables).data
+
+      self.numConstraints = 1
+
+      CPX.copylp(self.env, self.lp, self.numVariables, self.numConstraints, self.objsen, 
+               self.obj, self.rhs, self.sense,
+               self.matbeg, self.matcnt, self.matind, self.matval, 
+               self.lb, self.ub) 
+
+      self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval = self.cpx_matrix(self.P)
+      CPX.copyquad(self.env, self.lp, self.qmatbeg, self.qmatcnt, self.qmatind, self.qmatval)
+
+   def addConstraint(self, energy_deltas, example_idx):
+   #def addConstraint(self, energy_deltas, loss, example_idx, useMarginRescaling):
+      """
+      This method adds one contraint to the optimization problem.
+      """
+      self.lastDelta = energy_deltas
+      self.lastIndex = example_idx
+
+      assert -1 < example_idx and example_idx < self.numExamples
+
+      if self.old_w != None:
+         print >> self.protocol, 'Scalar prod. contrib is %f'%((self.old_w[0:self.numFeatures].T*energy_deltas)[0])
+
+      numNewRows = 1
+      numNewCols = 0
+   
+      row_rhs = FloatMatrix([1.0]).data
+      row_sense = CharMatrix(['G']).data
+
+      row_matbeg = IntMatrix([0]).data
+
+      row_entries = [0.0]*self.numVariables
+      indices = [0]*self.numVariables
+
+      currentVal = 0.0
+      numNonZeroElems = 0
+      for idx in range(self.numFeatures):
+         currentVal = energy_deltas[idx]
+         if math.fabs(currentVal) > self.EPSILON:
+            row_entries[numNonZeroElems] = currentVal
+            indices[numNonZeroElems] = idx
+            numNonZeroElems += 1
+
+      row_matind = IntMatrix([0]*(numNonZeroElems+1)).data
+      row_matval = FloatMatrix([0.0]*(numNonZeroElems+1)).data
+
+      for pos in range(numNonZeroElems):
+         row_matind[pos] = indices[pos]
+         row_matval[pos] = row_entries[pos]
+
+      # coefficient 1.0 of xi
+      row_matind[numNonZeroElems] = self.numFeatures+example_idx
+      row_matval[numNonZeroElems] = 1.0
+
+      status = CPX.addrows (self.env, self.lp, numNewCols, numNewRows, numNonZeroElems+1, row_rhs,
+                                  row_sense, row_matbeg, row_matind, row_matval)
+      
+      assert status == 0, 'Error ocurred while adding constraint.'
+
+      #CPX.writeprob(self.env,self.lp,"problem_%d.SAV"%self.addedConstraints)
+      #CPX.writeprob(self.env,self.lp,"problem_%d.LP"%self.addedConstraints)
+      self.addedConstraints += 1
+
+      return True
+               
+   def solve(self):
+      # solve the problem
+      CPX.qpopt(self.env,self.lp)
+
+      # get the solution
+      lpstat = CPX.getstat(self.env,self.lp)
+      if lpstat in OPTIMAL or lpstat == 6:
+         x, objval = CPX.getx(self.env,self.lp), CPX.getobjval(self.env,self.lp)
+         print >> self.protocol, 'SIQP_CPX: Objective is: %f'%objval
+
+         # Check that objective value is monotonically increasing
+         if math.fabs(objval - self.old_objective_value) >= 10e-5:
+            assert objval >= self.old_objective_value
+
+         self.old_objective_value = objval
+
+         new_w = cb.matrix([0.0]*self.numVariables,(self.numVariables,1))
+         for i in range(self.numVariables):
+            new_w[i,0] = x[i]
+
+         self.old_w = new_w
+
+         # Check for non-negative slacks    
+         sumOfSlacks = 0.0
+         print >> self.protocol, 'Slacks are:'
+         for i in range(self.numExamples):
+            print >> self.protocol, '%f ' % new_w[self.numFeatures+i,0]
+            assert new_w[self.numFeatures+i,0] >= 0.0
+            sumOfSlacks += new_w[self.numFeatures+i,0]
+
+         scalarProdDiff = (new_w[0:self.numFeatures,0].T * self.lastDelta)[0]
+         # print >> self.protocol, 'Constraint: %f >= %d - %f'%(scalarProdDiff,self.lastLoss,new_w[self.numFeatures+self.lastIndex])
+
+         # evalute parts of the objective
+         regularizationPart = (0.5 *new_w.T*self.P*new_w)[0]
+         lossPart = 1.0*self.C/self.numExamples * sumOfSlacks
+         assert lossPart >= 0.0, 'Error we have a negative loss contribution'
+         print >> self.protocol, 'Parts of objective: %e %e'%(regularizationPart,lossPart)
+
+         return objval,new_w[0:self.numFeatures,0],new_w[self.numFeatures:,0]
+
+      elif lpstat in UNBOUNDED: 
+         print >> self.protocol, "Solution unbounded"
+         assert False
+      elif lpstat in INFEASIBLE: 
+         print >> self.protocol, "Solution infeasible"
+         assert False
+      else: 
+         print >> self.protocol, "Solution code: ", lpstat
+         assert False
+
+   def cpx_matrix(self,M):
+      """
+      This method creates a sparse CPLEX representation of a given
+      cvxopt Matrix M.
+      """
+
+      nRows,nCols = M.size
+      numEntries = nRows*nCols
+
+      matbeg = IntMatrix([0]*nCols).data
+      matcnt = IntMatrix([0]*nCols).data
+
+      nonZeros = 0
+      elemCounter = 0
+      entry = 0.0
+
+      valVec = [0.0]*numEntries
+      indVec = [0.0]*numEntries
+
+      for currentCol in range(nCols):
+         nonZeros = 0
+         matbeg[currentCol] = elemCounter
+
+         for currentRow in range(nRows):
+            entry = M[currentRow,currentCol]
+            if math.fabs(entry) > self.EPSILON:
+               nonZeros += 1
+               indVec[elemCounter] = currentRow
+               valVec[elemCounter] = entry
+               elemCounter += 1
+
+         matcnt[currentCol] = nonZeros
+
+      totalNonZeros = elemCounter
+
+      matind = IntMatrix([0]*totalNonZeros).data
+      matval = FloatMatrix([0.0]*totalNonZeros).data
+
+      for idx in range(totalNonZeros):
+         matind[idx] = indVec[idx];
+         matval[idx] = valVec[idx];
+    
+      print >> self.protocol, 'nRows,nCols: %d,%d'%(nRows,nCols)
+      print >> self.protocol, 'totalNonZeros ' + str(totalNonZeros)
+
+      return matbeg,matcnt,matind,matval
+
+   def cpx_matrix_id(self):
+      """
+      This method creates a sparse CPLEX representation of a given
+      cvxopt Matrix M.
+      """
+
+      nRows,nCols = self.numVariables,self.numVariables
+      matbeg = IntMatrix([0]*nCols).data
+      matcnt = IntMatrix([0]*nCols).data
+
+      matind = IntMatrix([0]*nCols).data
+      matval = FloatMatrix([0.0]*nCols).data
+
+      for i in range(nCols):
+         matbeg[i] = i
+         matcnt[i] = 1
+         matind[i] = i
+         matval[i] = 1.0
+
+      return matbeg,matcnt,matind,matval
+
+   def delete(self):
+      # free the problem
+      CPX.freeprob(self.env,self.lp)
+      # close CPLEX environment
+      CPX.closeCPLEX(self.env)
+
+def test():
+   fh = open('siqp_cpx.log','w+')
+
+   numFeatures = 3
+   numExamples = 2
+   s = SIQPSolver(numFeatures,numExamples,10.0,fh)
+
+   d1 = cb.matrix([1,0,2],(numFeatures,1))
+   d2 = cb.matrix([1,1,0],(numFeatures,1))
+
+   s.addConstraint(d1,200.0,0,True)
+   s.addConstraint(d2,105.0,1,True)
+
+   w,slacks = s.solve()
+   print w
+   print slacks
+
+   print w.T * d1 + slacks[0]
+   print w.T * d2 + slacks[1]
+
+
+   s.delete()
+   del s
+   fh.close()
+
+if __name__ == '__main__':
+   test()
diff --git a/qpalma/TrainingParam.py b/qpalma/TrainingParam.py
new file mode 100644 (file)
index 0000000..2d5b28b
--- /dev/null
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
class Param:

   def __init__(self):
      """Initialize all training parameters with their default settings."""

      # base directory of the training data
      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma2'
      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'

      self.MAX_MEM = 31000
      self.LOCAL_ALIGN = 1
      self.init_param = 1
      self.train_with_splicesitescoreinformation = 1
      self.train_with_intronlengthinformation = 1
      self.C = 0.001
      self.microexon = 0
      self.prob = 0
      self.organism = 'elegans'
      self.expt = 'training'

      # error model: split the overall error probability evenly between
      # insertions, deletions and mutations
      self.insertion_prob = self.prob/3
      self.deletion_prob = self.prob/3
      self.mutation_prob = self.prob/3
diff --git a/qpalma/computeSpliceAlignWithQuality.py b/qpalma/computeSpliceAlignWithQuality.py
new file mode 100644 (file)
index 0000000..bb72d92
--- /dev/null
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+import pdb
+from Plif import Plf,base_coord,linspace
+import Configuration
+
def  computeSpliceAlignWithQuality(dna, exons):
   """
   Build the true alignment labelling and the true feature counts for one
   training example.

   (Translated from the original German notes:) Exon positions run from
   the exon start to one position right of the exon end.  Therefore each
   splice-site stretch starts at the end position of the previous exon,
   and the end of the splice stretch equals the start of the next exon,
   since the stored end is one position past the actual end.
   Example:
   cccccA1cccccE1cccccA2 ... AmcccccEmcccccccc
   cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc

   dna   -- genomic sequence (lowercase a,c,g,t,n expected by the counts)
   exons -- matrix, one row per exon, columns [start, end)

   Returns (SpliceAlign, trueWeightMatch, trueWeightQuality).
   """

   numberOfExons = (exons.shape)[0] # how many rows ?
   exonSizes = [-1]*numberOfExons

   #assert numberOfExons == 3

   for idx in range(numberOfExons):
      exonSizes[idx] = exons[idx,1] - exons[idx,0]

   # SpliceAlign vector describes alignment: 
   # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
   SpliceAlign = []

   # nucleotides before the first exon are dangling ends (label 4)
   if exons[0,0] > 0:
      SpliceAlign.extend([4]*(exons[0,0]))

   for idx in range(numberOfExons):
      exonLength = exonSizes[idx]
      SpliceAlign.extend([0]*exonLength)
      
      # between two exons: one donor (1), intron interior (3), one acceptor (2)
      if idx < numberOfExons-1:
         intronLength = exons[idx+1,0] - exons[idx,1]
         SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])

   # nucleotides after the last exon are dangling ends (label 4)
   if len(dna) > exons[-1,1]:
      SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))

   assert len(SpliceAlign) == len(dna), pdb.set_trace()

   # number of matches: just have to look at the underlying est
   # est = dna(find(SpliceAlign==0)) # exon nucleotides
   # NOTE(review): 'pos in exonPos' is a linear scan per nucleotide, so
   # this pair of comprehensions is quadratic in the sequence length
   exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
   est = [elem for pos,elem in enumerate(dna) if pos in exonPos]

   #length_est = sum(exon_sizes) ;
   totalESTLength = 0
   for elem in exonSizes:
      totalESTLength += elem

   assert totalESTLength == len(est) 

   sizeMatchmatrix = Configuration.sizeMatchmatrix
   # counts the occurences of a,c,g,t,n in this order
   numChar = [0]*sizeMatchmatrix[0]

   # counts the occurrences of a,c,g,t,n with their quality scores
   #trueWeightQuality = zeros((Configuration.numQualPlifs*Configuration.numQualSuppPoints,1))
   trueWeightQuality = numpy.zeros(Configuration.numQualPlifs*Configuration.numQualSuppPoints)
   trueWeightQuality = trueWeightQuality.reshape(Configuration.estPlifs,Configuration.dnaPlifs,Configuration.numQualSuppPoints)

   for elem in est:
      if elem == 'a':
         numChar[0] += 1
      if elem == 'c':
         numChar[1] += 1
      if elem == 'g':
         numChar[2] += 1
      if elem == 't':
         numChar[3] += 1
      if elem == 'n':
         numChar[4] += 1

   # fill the last supporting point of the (est-char, dna-char+1) plif with
   # the character count; only the first estPlifs characters are used
   if Configuration.mode == 'using_quality_scores':
      for pos,elem in enumerate(numChar):
         if pos < Configuration.estPlifs:
            trueWeightQuality[pos][pos+1][-1] = elem

   # flatten the 3d count tensor into a column vector
   trueWeightQuality = mat(trueWeightQuality.flatten())
   trueWeightQuality = trueWeightQuality.T

   totalNumChar = 0
   for idx in range(sizeMatchmatrix[0]):
      totalNumChar += numChar[idx]

   assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))

   # writing in weight match matrix
   # matrix is saved columnwise
   if Configuration.mode == 'normal':
      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # score matrix for the ground truth
      for idx in range(1,sizeMatchmatrix[0]):
         trueWeightMatch[idx,idx] = numChar[idx-1]

   if Configuration.mode == 'using_quality_scores':
      # match counts live in the quality plifs instead, so the matrix stays zero
      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # score matrix for the ground truth
      #for idx in range(1,sizeMatchmatrix[0]):
      #   trueWeightMatch[idx] = numChar[idx-1]

   # NOTE(review): if Configuration.mode is neither 'normal' nor
   # 'using_quality_scores', trueWeightMatch is undefined here
   trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)

   return SpliceAlign, trueWeightMatch, trueWeightQuality
diff --git a/qpalma/computeSpliceWeights.py b/qpalma/computeSpliceWeights.py
new file mode 100644 (file)
index 0000000..7fc8667
--- /dev/null
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from numpy import inf
+from numpy.matlib import zeros
+import pdb
+
+# 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30
+# piecewise linear function:
+# take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
+# these are our values x
+# 
+#        | y_1                                          if x <= x_1
+#        |
+#        |        x_i+1 - x              x - x_i
+# f(x) = | y_i * -----------  +  y_i+1 * -----------    if x_i <= x <= x_i+1
+#        |       x_i+1 - x_i             x_i+1 - x_i
+#        |
+#        | y_30                                         if x_n <= x 
+# 
+#  y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
+
def calculateWeights(plf, scores):
   """
   Distribute each score onto the supporting points of a piecewise linear
   function and return the resulting 30x1 weight vector.

   A score below the first support point counts fully towards the first
   weight, a score above the last support point towards the last weight;
   anything in between is split linearly between its two neighbouring
   support points.
   """
   weights = zeros((30,1))

   for score in scores:
      below = 0
      for limit in plf.limits:
         if limit <= score:
            below += 1

      if below == 0:
         weights[0] += 1
      elif below == len(plf.limits):
         weights[-1] += 1
      else:
         lo = below - 1       # python indices start at 0
         hi = lo + 1          # the x values stay fixed
         span = plf.limits[hi] - plf.limits[lo]
         fracHigh = 1.0*(score - plf.limits[lo]) / span
         fracLow  = 1.0*(plf.limits[hi] - score) / span
         weights[hi] = weights[hi] + fracHigh
         weights[lo] = weights[lo] + fracLow

   return weights
+
def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
   """
   Compute the plif weight vectors for donor sites, acceptor sites and
   intron lengths from a true alignment labelling.

   d, a, h   -- donor / acceptor / intron-length plifs
   SpliceAlign -- per-nucleotide labels (1: donor, 2: acceptor)
   don_supp, acc_supp -- per-nucleotide SVM scores

   Returns (weightDon, weightAcc, weightIntron).
   """

   # 1. Donors: don_supp holds the SVM scores, SpliceAlign marks donor
   #    positions with a 1
   DonorScores = []
   for pos,score in enumerate(don_supp):
      if SpliceAlign[pos] == 1:
         DonorScores.append(score)
   assert -inf not in DonorScores

   #print 'donor'
   weightDon = calculateWeights(d,DonorScores)

   # 2. Acceptors: a 2 at position pos-1 in SpliceAlign marks the acceptor
   #    whose score sits at position pos in acc_supp
   AcceptorScores = []
   for pos,score in enumerate(acc_supp):
      if pos > 0 and SpliceAlign[pos-1] == 2:
         AcceptorScores.append(score)
   assert -inf not in AcceptorScores

   #print 'acceptor'
   weightAcc = calculateWeights(a,AcceptorScores)

   # 3. Intron lengths: pair up donor (1) and acceptor (2) labels and
   #    weight the length plif with the resulting gap sizes
   intron_starts = [pos for pos,label in enumerate(SpliceAlign) if label == 1]
   intron_ends   = [pos for pos,label in enumerate(SpliceAlign) if label == 2]

   assert len(intron_starts) == len(intron_ends)
   for start,end in zip(intron_starts,intron_ends):
      assert start < end

   lengths = [end - start + 1 for start,end in zip(intron_starts,intron_ends)]

   weightIntron = calculateWeights(h,lengths)

   return weightDon, weightAcc, weightIntron
diff --git a/qpalma/compute_donacc.py b/qpalma/compute_donacc.py
new file mode 100644 (file)
index 0000000..bad0872
--- /dev/null
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+from numpy.matlib import zeros,isinf
+from penalty_lookup_new import *
+
def compute_donacc(donor_supp, acceptor_supp, d, a):
   """
   Map raw donor/acceptor SVM scores to plif penalties.

   Infinite scores (impossible splice sites) are passed through unchanged;
   every other score is evaluated against the corresponding penalty
   struct (d for donors, a for acceptors).
   """
   assert len(donor_supp) == len(acceptor_supp)

   donor = []
   acceptor = []

   for don_score,acc_score in zip(donor_supp,acceptor_supp):
      if isinf(don_score):
         donor.append(don_score)
      else:
         donor.append(penalty_lookup_new(d,don_score))

      if isinf(acc_score):
         acceptor.append(acc_score)
      else:
         acceptor.append(penalty_lookup_new(a,acc_score))

   return donor,acceptor
diff --git a/qpalma/export_param.py b/qpalma/export_param.py
new file mode 100644 (file)
index 0000000..57d2c94
--- /dev/null
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+###########################################################
+
+import bz2
+
def writeStruct(fid,plif):
   """
   Write one piecewise linear function to the parameter file.

   Every plif gets its limits, penalties and bin count; the intron-length
   plif additionally gets its length range and transform settings.
   """
   prefix = plif.name
   fid.write('%s_limits=%s\n' % (prefix,str(plif.limits)))
   fid.write('%s_penalties=%s\n' % (prefix,str(plif.penalties)))
   fid.write('%s_bins=%d\n' % (prefix,len(plif.limits)))

   if prefix != 'intron':
      return

   # extra fields only used by the intron length model
   fid.write('%s_len_limits=%s\n' % (prefix,str(plif.limits)))
   fid.write('%s_len_penalties=%s\n' % (prefix,str(plif.penalties)))
   fid.write('%s_len_bins=%d\n' % (prefix,len(plif.limits)))
   fid.write('%s_len_min=%d\n' % (prefix,plif.min_len))
   fid.write('%s_len_max=%d\n' % (prefix,plif.max_len))
   fid.write('%s_len_transform=%s\n' % (prefix,plif.transform))
+
def export_param(filename,h,d,a,mmatrix,qualityPlifs):
   """
   Export a trained PALMA parameter set to <filename>.bz2.

   Assumes splice site scores and intron length information were used:
   h is the intron length plif, d/a the donor/acceptor plifs, mmatrix the
   36-entry substitution matrix and qualityPlifs the quality plifs.
   """
   h.name = 'intron'
   d.name = 'donor'
   a.name = 'acceptor'

   fid = bz2.BZ2File(filename+'.bz2','w')

   fid.write('%palma definition file version: 1.0\n\n')
   fid.write('%penalties\n')

   writeStruct(fid, h)
   writeStruct(fid, a)
   writeStruct(fid, d)

   # substitution matrix, row by row; the last row closes the bracket
   mmatrix = mmatrix.reshape(6,6)
   fid.write('substitution_matrix=[')
   for row in range(6):
      entries = (mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5])
      if row == 5:
         fid.write('%f, %f, %f, %f, %f, %f]\n' % entries)
      else:
         fid.write('%f, %f, %f, %f, %f, %f;\n' % entries)

   for plif in qualityPlifs:
      writeStruct(fid, plif)

   fid.close()
diff --git a/qpalma/generateEvaluationData.py b/qpalma/generateEvaluationData.py
new file mode 100644 (file)
index 0000000..4b5b7a0
--- /dev/null
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from numpy.matlib import mat,zeros,ones,inf
+import random
+import pdb
+import os.path
+import cPickle
+
class Dataset:
  """Plain container for a pickled data set; attributes are assigned ad hoc."""
  pass
+
def sample(population, k):
   """Pick k elements from population uniformly at random, with replacement."""
   n = len(population)
   picks = [None] * k
   for slot in xrange(k):
      picks[slot] = population[int(random.random() * n)]
   return picks
+
def generateData(numExamples):
   """
   Create numExamples synthetic examples: fixed-size random DNA (216nt)
   with a fixed two-exon structure, random 36nt ESTs and flat splice-site
   scores.  Uses a fixed random seed, so the output is deterministic.
   """
   dna_len = 216
   est_len = 36
   random.seed(14)

   letters = ['a','c','g','t']

   Sequences   = []
   Acceptors   = []
   Donors      = []
   Exons       = []
   Ests        = []

   for i in range(numExamples):
      Sequences.append(''.join(sample(letters, dna_len)))

      # no splice site information for the artificial data
      Acceptors.append([0.0]*dna_len)
      Donors.append([0.0]*dna_len)

      # two 72nt exons separated by a 72nt intron
      currentExon = zeros((2,2))
      currentExon[0,0] = 0
      currentExon[0,1] = 72
      currentExon[1,0] = 144
      currentExon[1,1] = 216
      Exons.append(currentExon)

      Ests.append(''.join(sample(letters, est_len)))

   # quality: high (40) everywhere except six low-quality middle positions
   preNr    = 15
   middleNr = 6
   sufNr    = 15
   Qualities = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples

   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
+
def generateData2(numExamples):
   """
   Generate numExamples artificial examples, each with one planted intron.

   A random DNA sequence (200-400nt) gets a synthetic splice construct at
   a random position: 18 t's, a 'gt' donor, 20 a's (intron body), an 'ag'
   acceptor and another 18 t's.  Donor/acceptor score tracks and exon
   boundaries are built to match the planted site.  The random seed is
   fixed, so the output is deterministic.
   """
   est_len = 36
   random.seed(14)

   letters = ['a','c','g','t']

   Sequences   = []
   Acceptors   = []
   Donors      = []
   Exons       = []
   Ests        = []

   for i in range(numExamples):
      dna_len = random.randint(200,400)
      dna = ''.join(sample(letters, dna_len))
      #Sequences.append(dna)
      # plant the 60nt splice construct between begin and end
      begin = random.randint(70,dna_len-70)
      end   = begin+60
      dna = dna[0:begin] + 't'*18 + 'gt' + 'a'*20 + 'ag' + 't'*18 + dna[end:]
      dna_len = len(dna)
      Sequences.append(dna)
      
      # low default scores everywhere, high scores around the planted sites
      currentDon  = [-3.0]*dna_len
      currentAcc  = [-3.0]*dna_len

      currentDon[begin+18+1] = 3.0
      currentDon[begin+18+2] = 3.0

      currentAcc[end-18-1] = 3.0
      currentAcc[end-18-2] = 3.0

      #pdb.set_trace()

      Donors.append(currentDon)
      Acceptors.append(currentAcc)

      currentExon       = zeros((2,2))
      currentExon[0,0]  = 0
      currentExon[0,1]  = begin+18
      currentExon[1,0]  = end-18
      currentExon[1,1]  = dna_len-1

      Exons.append(currentExon)

      #est = ''.join(sample(letters, est_len))
      # NOTE(review): the read assembled from the exon flanks is
      # immediately overwritten with an all-'t' read below -- presumably a
      # debugging shortcut; confirm before relying on this data
      est = dna[begin-18:begin] + dna[end+1:end+19]
      est = 't'*est_len
      Ests.append(est)

   preNr    = 15
   middleNr = 6
   sufNr    = 15
   #Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
   Qualities  = [[40]*est_len]*numExamples

   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
+
def loadArtificialData(numExamples):
   """
   Return (Sequences, Acceptors, Donors, Exons, Ests, Qualities) for an
   artificial data set of the given size.

   The data set is generated once via generateData2 and cached in the
   pickle file 'artificial_dset_<numExamples>' in the working directory;
   subsequent calls load the cached file instead of regenerating.
   """
   filename = 'artificial_dset_%d' % numExamples
   if not os.path.exists(filename):
      s,a,d,e,est,q = generateData2(numExamples)
      dset = Dataset()
      dset.Sequences = s
      dset.Acceptor  = a   # NOTE: attribute name is singular, unlike the others
      dset.Donors    = d
      dset.Exons     = e
      dset.Ests      = est
      dset.Qualities = q
      # close the handle explicitly instead of leaking it (the original
      # passed open(...) directly to dump/load and never closed it)
      fh = open(filename,'w+')
      cPickle.dump(dset,fh)
      fh.close()
   else:
      fh = open(filename)
      dset = cPickle.load(fh)
      fh.close()
      s   = dset.Sequences
      a   = dset.Acceptor
      d   = dset.Donors
      e   = dset.Exons
      est = dset.Ests
      q   = dset.Qualities

   return s,a,d,e,est,q
+
+
+
if __name__ == '__main__':
   # quick manual check: generate a small data set and dump it to stdout
   Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
   print Acceptors
   print Donors
   print Exons
   print Ests
   print Qualities
diff --git a/qpalma/genome_info.pickle b/qpalma/genome_info.pickle
new file mode 100644 (file)
index 0000000..5eb5916
--- /dev/null
@@ -0,0 +1,178 @@
+ccopy_reg
+_reconstructor
+p1
+(cscipy.io.mio5
+mat_struct
+p2
+c__builtin__
+object
+p3
+NtRp4
+(dp5
+S'cdna_fnames'
+p6
+V/fml/ag-raetsch/share/databases/elegans_WS150/confirmed_genes.WS150
+p7
+sS'contig_names'
+p8
+cnumpy.core.multiarray
+_reconstruct
+p9
+(cnumpy
+ndarray
+p10
+(I0
+tS'b'
+tRp11
+(I1
+(I6
+tcnumpy
+dtype
+p12
+(S'O8'
+I0
+I1
+tRp13
+(I3
+S'|'
+NNNI-1
+I-1
+I63
+tbI00
+(lp14
+VCHROMOSOME_I
+p15
+aVCHROMOSOME_II
+p16
+aVCHROMOSOME_III
+p17
+aVCHROMOSOME_IV
+p18
+aVCHROMOSOME_V
+p19
+aVCHROMOSOME_X
+p20
+atbsS'alphabet'
+p21
+Vacgt
+p22
+sS'basedir'
+p23
+V/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/
+p24
+sS'flcdna_fnames'
+p25
+g9
+(g10
+(I0
+tS'b'
+tRp26
+(I1
+(I0
+tg12
+(S'f8'
+I0
+I1
+tRp27
+(I3
+S'<'
+NNNI-1
+I-1
+I0
+tbI00
+S''
+tbsS'flat_fnames'
+p28
+g9
+(g10
+(I0
+tS'b'
+tRp29
+(I1
+(I6
+tg13
+I00
+(lp30
+V/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/I.dna.flat
+p31
+aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/II.dna.flat
+p32
+aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/III.dna.flat
+p33
+aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/IV.dna.flat
+p34
+aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/V.dna.flat
+p35
+aV/fml/ag-raetsch/share/projects/qpalma/elegans_palma2/genome/X.dna.flat
+p36
+atbsS'est_fnames'
+p37
+g9
+(g10
+(I0
+tS'b'
+tRp38
+(I1
+(I3
+tg13
+I00
+(lp39
+V/fml/ag-raetsch/share/databases/dbEST.111005/dbEST_elegans.fa
+p40
+aV/fml/ag-raetsch/share/databases/elegans_WS150/EST_Elegans.dna
+p41
+aV/fml/ag-raetsch/share/databases/elegans_WS150/confirmed_genes.WS150
+p42
+atbsS'_fieldnames'
+p43
+(lp44
+g37
+ag6
+ag25
+ag8
+ag28
+aS'fasta_fnames'
+p45
+aS'annotation_fnames'
+p46
+ag21
+ag23
+aS'max_intron_len'
+p47
+aS'min_intron_len'
+p48
+aS'merge_est_transcripts'
+p49
+asg47
+I20000
+sg49
+I1
+sg46
+V/fml/ag-raetsch/share/databases/elegans_WS150/elegansWS150_CHROMOSOME.gff
+p50
+sg45
+g9
+(g10
+(I0
+tS'b'
+tRp51
+(I1
+(I6
+tg13
+I00
+(lp52
+V/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_I.dna
+p53
+aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_II.dna
+p54
+aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_III.dna
+p55
+aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_IV.dna
+p56
+aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_V.dna
+p57
+aV/fml/ag-raetsch/share/databases/elegans_WS150/CHROMOSOME_X.dna
+p58
+atbsg48
+I20
+sb.
\ No newline at end of file
diff --git a/qpalma/numpyUtils.py b/qpalma/numpyUtils.py
new file mode 100644 (file)
index 0000000..14ece77
--- /dev/null
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as ny
+import numpy.matlib as nyml
+
+
+def 
diff --git a/qpalma/paths_load_data.py b/qpalma/paths_load_data.py
new file mode 100644 (file)
index 0000000..fef599c
--- /dev/null
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import io_pickle
+import scipy.io
+import pdb
+
def paths_load_data(expt,genome_info,PAR):
   """
   Load the alignment data for one experiment from .mat files and return
   (Sequences, Acceptors, Donors, Exons, Ests, Noises).

   Only the 'training' branch is active; the 'validation'/'test' branches
   below are the commented-out MATLAB originals.  NOTE(review): for any
   expt other than 'training', Sequences etc. are never assigned and the
   final return raises a NameError.
   """

   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
   # Load the relevant file and return the alignment data

   # expt can be 'training','validation' or 'test'

   assert expt in ['training','validation','test']

   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'

   Noises = [];

   if expt == 'training':
      if PAR.microexon:
         if PAR.LOCAL_ALIGN: # local version

            # NOTE(review): the first assignment is dead -- the uncut data
            # set immediately overrides the cut one; confirm intent
            train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
            #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
            #io_pickle.convert_v6(train_data,train_data_pickle)
            #train_data = io_pickle.load(train_data_pickle)
            data = scipy.io.loadmat(train_data)

         else: # global version

            # NOTE(review): same dead-assignment pattern as above
            train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
               (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)

            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
            #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
            #   (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)

            #io_pickle.convert_v6(train_data,train_data_pickle)
            #train_data = io_pickle.load(train_data_pickle)
            data = scipy.io.loadmat(train_data)
            Noises = data['TrainNoise'] # substitution matrix

      else:
         train_data = '%s/exons_train_local.mat' % genome_info.basedir
         #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
         #io_pickle.convert_v6(train_data,train_data_pickle)
         #microexon_train_data = io_pickle.load(train_data_pickle)
         data = scipy.io.loadmat(train_data)

      print 'train_data is %s' % train_data

      Sequences = data['Train']       # dna sequences
      Acceptors = data['TrainAcc']    # acceptor scores
      Donors    = data['TrainDon']    # donor scores
      Exons     = data['TrainExon']   # exon boundaries
      Ests      = data['TrainEsts']   # est sequences
     
   #elif expt == 'validation':
   #  print('Loading validation data\n') ;
   #  if PAR.microexon,
   #    if PAR.LOCAL_ALIGN
   #      %local version
   #      load(sprintf('%s/microexon_val_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
   #    else
   #      %global version
   #      load(sprintf('%s/microexon_val_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
   #      'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
   #    end
   #  else
   #    load(sprintf('%s/exons_val_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
   #    'ValEsts', 'ValExon', 'Val', 'ValAcc', 'ValDon') ;
   #  end
   #  
   #  Sequences = Val ;      % dna sequences
   #  Acceptors = ValAcc ;   % acceptor scores
   #  Donors    = ValDon ;   % donor scores
   #  Exons     = ValExon ;  % exon boundaries
   #  Ests      = ValEsts ;  % est sequences



   #elif expt == 'test':
   #  fprintf('Loading test data\n') ;
   #  if PAR.microexon,
   #    if PAR.LOCAL_ALIGN
   #      %local version
   #      load(sprintf('%s/microexon_test_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
   #      'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
   #    else
   #      %global version
   #      load(sprintf('%s/microexon_test_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #         genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob),  ...
   #      'TestEsts', 'TestExon', 'Test','TestAcc', 'TestDon', 'TestNoise') ;
   #      Noises    = TestNoise ; % substitution matrix
   #      end
   #      else
   #    load(sprintf('%s/exons_test_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat', ...
   #       genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob), ...
   #    'TestEsts', 'TestExon', 'Test', 'TestAcc', 'TestDon') ;
   #  end
   #  
   #  Sequences = Test ;      % dna sequences
   #  Acceptors = TestAcc ;   % acceptor scores
   #  Donors    = TestDon ;   % donor scores
   #  Exons     = TestExon ;  % exon boundaries
   #  Ests      = TestEsts ;  % est sequences

   # Lower all indices by one to convert matlab 
   # to python indices

   Exons -= 1

   return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/qpalma/paths_load_data_pickle.py b/qpalma/paths_load_data_pickle.py
new file mode 100644 (file)
index 0000000..112ec5f
--- /dev/null
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import cPickle
+
+def paths_load_data_pickle(expt,genome_info,PAR):
+   """
+      
+   """
+
+   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
+   # Load the relevant file and return the alignment data
+
+   # expt can be 'training','validation' or 'test'
+
+   assert expt in ['training','validation','test']
+
+   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
+
+   Noises = [];
+
+   if expt == 'training':
+      if PAR.microexon:
+         if PAR.LOCAL_ALIGN: # local version
+
+            train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
+            data = cPickle.load(open(train_data))
+
+         else: # global version
+            pass
+
+
+      else:
+         train_data = '%s/exons_train_local.pickle' % genome_info.basedir
+         data = cPickle.load(open(train_data))
+
+      print 'train_data is %s' % train_data
+
+      Sequences = data['Train']       # dna sequences
+      Acceptors = data['TrainAcc']    # acceptor scores
+      Donors    = data['TrainDon']    # donor scores
+      Exons     = data['TrainExon']   # exon boundaries
+      Ests      = data['TrainEsts']   # est sequences
+     
+   # Lower all indices by one to convert matlab 
+   # to python indices
+
+   Exons -= 1
+
+   return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/qpalma/paths_load_data_solexa.py b/qpalma/paths_load_data_solexa.py
new file mode 100644 (file)
index 0000000..e9dd387
--- /dev/null
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from numpy.matlib import mat,zeros,ones,inf
+import cPickle
+import pdb
+
def paths_load_data_solexa(expt,genome_info,PAR):
   """
   Load Solexa reads plus the matching TAIR7 gene sequences and return
   (Sequences, Acceptors, Donors, Exons, Ests, Qualities).

   Reads come from a whitespace-separated text file; gene and genome
   sequences come from pickle files with hard-coded paths.
   """
   # expt can be 'training','validation' or 'test'
   assert expt in ['training','validation','test']

   dna_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/allGenes.pickle'
   est_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/filtered_reads'
   tair7_seq_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'

   tair7_seq = cPickle.load(open(tair7_seq_filename))
   allGenes  = cPickle.load(open(dna_filename))

   Sequences = []
   Acceptors = []
   Donors = []
   Exons = []
   Ests = []
   Qualities = []

   for line in open(est_filename):
      line = line.strip()
      chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id = line.split()
      splitpos = int(splitpos)
      length = int(length)
      # decode per-base quality strings; the offsets (-50, -64, +10)
      # presumably match the Solexa pipeline's ASCII encoding -- TODO confirm
      prb = [ord(elem)-50 for elem in prb]
      cal = [ord(elem)-64 for elem in cal]
      chastity = [ord(elem)+10 for elem in chastity]

      assert len(prb) == len(seq)

      currentGene = allGenes[gene_id]
      seq = seq.lower()

      # NOTE(review): the bare except silently skips any read whose gene
      # sequence is missing from tair7_seq -- consider catching KeyError
      # explicitly so real errors are not swallowed
      try:
         currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
         Sequences.append(currentSeq)
      except:
         continue
         
      currentSize = len(Sequences[-1])
      #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
      #Acceptors.append([-inf]*currentSize)
      #Donors.append([-inf]*currentSize)
      Acceptors.append([0]*currentSize)
      Donors.append([0]*currentSize)

      # exon coordinates relative to the gene start
      currentExons = zeros((len(currentGene.exons),2))
      for idx in range(len(currentGene.exons)):
         currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
         currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
      Exons.append(currentExons)
      Ests.append(seq)
      Qualities.append(prb)

      # NOTE(review): examples whose gene sequence is exactly 2755nt long
      # are dropped again right after being added -- magic number,
      # presumably one known problematic gene; confirm
      if len(Sequences[-1]) == 2755:
         Sequences = Sequences[:-1]
         Acceptors = Acceptors[:-1]
         Donors    = Donors[:-1]
         Exons     = Exons[:-1]
         Ests      = Ests[:-1]
         Qualities = Qualities[:-1]
   
   print 'found %d examples' % len(Sequences)

   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
diff --git a/qpalma/penalty_lookup_new.py b/qpalma/penalty_lookup_new.py
new file mode 100644 (file)
index 0000000..9837dd5
--- /dev/null
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import pdb
+
def penalty_lookup_new(penalty_struct, value):
   """
   Evaluate a piecewise linear penalty function at the given value.

   The value is first mapped through the struct's transform; the penalty
   is then the linear interpolation between the two neighbouring support
   points, clamped to the first/last penalty outside the support range.
   """
   limits = penalty_struct.limits
   penalties = penalty_struct.penalties
   transform = penalty_struct.transform

   if transform == 'log':
      value = math.log(value)
   elif transform == 'log(+3)':
      value = math.log(value+3)
   elif transform == 'log(+1)':
      value = math.log(value+1)
   elif transform == '':
      pass
   elif transform == '(+3)':
      value += 3
   elif transform == 'mod3':
      value = value%3
   else:
      assert False,'unknown penalty transform'

   # number of support points at or below the (transformed) value
   count = 0
   for limit in limits:
      if limit <= value:
         count += 1

   # clamp outside the supported range
   if count == 0:
      return penalties[0]
   if count == len(limits):
      return penalties[-1]

   # interpolate linearly between support points count-1 and count
   span = limits[count]-limits[count-1]
   return (penalties[count]*(value-limits[count-1]) + penalties[count-1]*(limits[count]-value)) / span
diff --git a/qpalma/qpalma.py b/qpalma/qpalma.py
new file mode 100644 (file)
index 0000000..412a67b
--- /dev/null
@@ -0,0 +1,444 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+# 
+#
+###########################################################
+
+import sys
+import subprocess
+import scipy.io
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+import QPalmaDP
+from SIQP_CPX import SIQPSolver
+
+from paths_load_data_pickle import *
+from paths_load_data_solexa import *
+
+from generateEvaluationData import *
+
+from computeSpliceWeights import *
+from set_param_palma import *
+from computeSpliceAlignWithQuality import *
+from penalty_lookup_new import *
+from compute_donacc import *
+from TrainingParam import Param
+from export_param import *
+
+import Configuration
+from Plif import Plf
+from Helpers import *
+
+def getQualityFeatureCounts(qualityPlifs):
+   """Stack the penalty vectors of all quality Plifs into one column block.
+
+   NOTE(review): uses the bare name ``numpy``, which this module does not
+   import directly -- presumably it arrives via one of the star imports
+   above; verify.
+   """
+   weightQuality = qualityPlifs[0].penalties
+   for currentPlif in qualityPlifs[1:]:
+      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
+
+   return weightQuality 
+
+class QPalma:
+   """
+   A training method for the QPalma project
+
+   Loads a training set, repeatedly decodes n-best alignments through the
+   QPalmaDP dynamic program, adds margin constraints to an SIQP solver and
+   periodically re-solves and re-reads the parameter vector.
+   """
+   
+   def __init__(self):
+      self.ARGS = Param()
+
+      self.logfh = open('qpalma.log','w+')
+      # NOTE(review): gen_file is computed here but never used in this scope;
+      # fetch_genome_info() below references an (undefined) global of the
+      # same name -- confirm the intended wiring.
+      gen_file= '%s/genome.config' % self.ARGS.basedir
+
+      ginfo_filename = 'genome_info.pickle'
+      self.genome_info = fetch_genome_info(ginfo_filename)
+
+      self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+
+      #self.ARGS.train_with_splicesitescoreinformation = False
+
+   def plog(self,string):
+      # Append to the log and flush immediately so progress is visible.
+      self.logfh.write(string)
+      self.logfh.flush()
+
+   def run(self):
+      """Train the QPalma parameters on the configured dataset."""
+      # Load the whole dataset 
+      if Configuration.mode == 'normal':
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         use_quality_scores = False
+      elif Configuration.mode == 'using_quality_scores':
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+
+         #Sequences_, Acceptors_, Donors_, Exons_, Ests_, Qualities_ = generateData(100)
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         #Qualities = []
+         #for i in range(len(Ests)):
+         #   Qualities.append([40]*len(Ests[i]))
+         use_quality_scores = True
+      else:
+         assert(False)
+
+      # number of training instances
+      N = len(Sequences) 
+      self.numExamples = N
+      # NOTE(review): 'Acceptors' is checked twice below; the second term was
+      # almost certainly meant to be len(Donors).
+      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
+      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+      self.plog('Number of training examples: %d\n'% N)
+      print 'Number of features: %d\n'% Configuration.numFeatures
+
+      iteration_steps         = Configuration.iter_steps ; #upper bound on iteration steps
+      remove_duplicate_scores = Configuration.remove_duplicate_scores 
+      print_matrix            = Configuration.print_matrix 
+      anzpath                 = Configuration.anzpath 
+
+      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
+      param = Configuration.fixedParam 
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      # delete splicesite-score-information
+      if not self.ARGS.train_with_splicesitescoreinformation:
+         for i in range(len(Acceptors)):
+            if Acceptors[i] > -20:
+               Acceptors[i] = 1
+            if Donors[i] >-20:
+               Donors[i] = 1
+
+      # Initialize solver 
+      if Configuration.USE_OPT:
+         self.plog('Initializing problem...\n')
+         solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
+
+      # stores the number of alignments done for each example (best path, second-best path etc.)
+      num_path = [anzpath]*N 
+      # stores the gap for each example
+      gap      = [0.0]*N
+
+      #############################################################################################
+      # Training
+      #############################################################################################
+      self.plog('Starting training...\n')
+
+      # Feature-vector layout offsets: donor, acceptor, intron-length,
+      # match-matrix and quality support points, in that fixed order.
+      donSP       = Configuration.numDonSuppPoints
+      accSP       = Configuration.numAccSuppPoints
+      lengthSP    = Configuration.numLengthSuppPoints
+      mmatrixSP   = Configuration.sizeMatchmatrix[0]\
+      *Configuration.sizeMatchmatrix[1]
+      numq        = Configuration.numQualSuppPoints
+      totalQualSP = Configuration.totalQualSuppPoints
+
+      currentPhi = zeros((Configuration.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      iteration_nr = 0
+      param_idx = 0
+      const_added_ctr = 0
+      while True:
+         if iteration_nr == iteration_steps:
+            break
+
+         for exampleIdx in range(self.numExamples):
+
+            if (exampleIdx%10) == 0:
+               print 'Current example nr %d' % exampleIdx
+
+            dna = Sequences[exampleIdx] 
+            est = Ests[exampleIdx] 
+
+            if Configuration.mode == 'normal':
+               quality = [40]*len(est)
+
+            if Configuration.mode == 'using_quality_scores':
+               quality = Qualities[exampleIdx]
+
+            exons = Exons[exampleIdx] 
+            # NoiseMatrix = Noises[exampleIdx] 
+            don_supp = Donors[exampleIdx] 
+            acc_supp = Acceptors[exampleIdx] 
+
+            # Skip examples whose exon coordinates run past the DNA sequence.
+            if exons[-1,1] > len(dna):
+               continue
+
+            # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+            
+            # Calculate the weights
+            trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            # Assemble the current feature vector phi in the fixed layout.
+            currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+            if Configuration.mode == 'using_quality_scores':
+               totalQualityPenalties = param[-totalQualSP:]
+               currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+            # Calculate w'phi(x,y) the total score of the alignment
+            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+            # The allWeights vector is supposed to store the weight parameter
+            # of the true alignment as well as the weight parameters of the
+            # num_path[exampleIdx] other alignments
+            allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
+            allWeights[:,0] = trueWeight[:,0]
+
+            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+            AlignmentScores[0] = trueAlignmentScore
+
+            ################## Calculate wrong alignment(s) ######################
+
+            # Compute donor, acceptor with penalty_lookup_new
+            # returns two double lists
+            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+            #myalign wants the acceptor site on the g of the ag
+            acceptor = acceptor[1:]
+            acceptor.append(-inf)
+
+            # for now we don't use donor/acceptor scores
+
+            #donor = [-inf] * len(donor)
+            #acceptor = [-inf] * len(donor)
+
+            dna = str(dna)
+            est = str(est)
+            dna_len = len(dna)
+            est_len = len(est)
+
+            ps = h.convert2SWIG()
+
+            # Marshal all inputs into SWIG-wrapped C arrays for QPalmaDP.
+            prb = QPalmaDP.createDoubleArrayFromList(quality)
+            chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+            matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+            mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
+
+            d_len = len(donor)
+            donor = QPalmaDP.createDoubleArrayFromList(donor)
+            a_len = len(acceptor)
+            acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+            # Create the alignment object representing the interface to the C/C++ code.
+            currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, use_quality_scores)
+
+            c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+            #print 'Calling myalign...'
+            # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+            currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
+             est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+             acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+             print_matrix)
+
+            #print 'After calling myalign...'
+            #print 'Calling getAlignmentResults...'
+
+            # Pre-allocate C output buffers, then copy results back below.
+            c_SpliceAlign       = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+            c_EstAlign          = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+            c_WeightMatch       = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+            c_DPScores   = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+            c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
+
+            currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+            c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+            #print 'After calling getAlignmentResults...'
+
+            newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+            newEstAlign    = zeros((est_len*num_path[exampleIdx],1))
+            newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+            newDPScores    = zeros((num_path[exampleIdx],1))
+            newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
+
+            #print 'newSpliceAlign'
+            for i in range(dna_len*num_path[exampleIdx]):
+               newSpliceAlign[i] = c_SpliceAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'newEstAlign'
+            for i in range(est_len*num_path[exampleIdx]):
+               newEstAlign[i] = c_EstAlign[i]
+            #   print '%f' % (spliceAlign[i])
+
+            #print 'weightMatch'
+            for i in range(mm_len*num_path[exampleIdx]):
+               newWeightMatch[i] = c_WeightMatch[i]
+            #   print '%f' % (weightMatch[i])
+
+            #print 'ViterbiScores'
+            for i in range(num_path[exampleIdx]):
+               newDPScores[i] = c_DPScores[i]
+
+
+            if use_quality_scores:
+               for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
+                  newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+            #  equals palma up to here
+
+            #print "Calling destructors"
+            del c_SpliceAlign
+            del c_EstAlign
+            del c_WeightMatch
+            del c_DPScores
+            del c_qualityPlifsFeatures
+            del currentAlignment
+
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+            # Calculate weights of the respective alignments Note that we are
+            # calculating n-best alignments without hamming loss, so we
+            # have to keep track which of the n-best alignments correspond to
+            # the true one in order not to incorporate a true alignment in the
+            # constraints. To keep track of the true and false alignments we
+            # define an array true_map with a boolean indicating the
+            # equivalence to the true alignment for each decoded alignment.
+            true_map = [0]*(num_path[exampleIdx]+1)
+            true_map[0] = 1
+            path_loss   = [0]*(num_path[exampleIdx])
+
+            for pathNr in range(num_path[exampleIdx]):
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp, acc_supp)
+
+               decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+               for qidx in range(Configuration.totalQualSuppPoints):
+                  decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
+
+               #pdb.set_trace()
+
+               path_loss[pathNr] = 0
+               # sum up positionwise loss between alignments
+               for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
+                  if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+                     path_loss[pathNr] += 1
+
+               # Gewichte in restliche Zeilen der Matrix speichern
+               # (store the weights in the remaining rows of the matrix)
+               wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
+               allWeights[:,pathNr+1] = wp
+
+               hpen = mat(h.penalties).reshape(len(h.penalties),1)
+               dpen = mat(d.penalties).reshape(len(d.penalties),1)
+               apen = mat(a.penalties).reshape(len(a.penalties),1)
+               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+               # Check wether scalar product + loss equals viterbi score
+               print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
+
+               distinct_scores = False
+               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+                  distinct_scores = True
+               
+               #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
+               # NOTE(review): drops into the debugger on a score mismatch;
+               # 'math' is not imported in this module directly -- presumably
+               # provided by a star import; verify.
+               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+                  pdb.set_trace()
+
+               #  # if the pathNr-best alignment is very close to the true alignment consider it as true
+               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+                  true_map[pathNr+1] = 1
+
+               # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+
+               # the true label sequence should not have a larger score than the maximal one WHYYYYY?
+               # this means that all n-best paths are to close to each other 
+               # we have to extend the n-best search to a (n+1)-best
+               if len([elem for elem in true_map if elem == 1]) == len(true_map):
+                  num_path[exampleIdx] = num_path[exampleIdx]+1
+
+               # Choose true and first false alignment for extending A
+               firstFalseIdx = -1
+               for map_idx,elem in enumerate(true_map):
+                  if elem == 0:
+                     firstFalseIdx = map_idx
+                     break
+
+               # if there is at least one useful false alignment add the
+               # corresponding constraints to the optimization problem
+               if firstFalseIdx != -1:
+                  trueWeights       = allWeights[:,0]
+                  firstFalseWeights = allWeights[:,firstFalseIdx]
+                  differenceVector  = trueWeights - firstFalseWeights
+                  #pdb.set_trace()
+
+                  if Configuration.USE_OPT:
+                     const_added = solver.addConstraint(differenceVector, exampleIdx)
+                     const_added_ctr += 1
+               #
+               # end of one example processing 
+               #
+
+            # call solver every nth example //added constraint
+            if exampleIdx != 0 and exampleIdx % 20 == 0 and Configuration.USE_OPT:
+               objValue,w,self.slacks = solver.solve()
+      
+               print "objValue is %f" % objValue
+
+               sum_xis = 0
+               for elem in self.slacks:
+                  sum_xis +=  elem
+   
+               for i in range(len(param)):
+                  param[i] = w[i]
+
+               #pdb.set_trace()
+               # NOTE(review): 'cPickle' is not imported in this module
+               # directly -- presumably provided by a star import; verify.
+               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+               param_idx += 1
+               [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+         #
+         # end of one iteration through all examples
+         #
+         iteration_nr += 1
+
+      #
+      # end of optimization 
+      #  
+      print 'Training completed'
+
+      # NOTE(review): 'para' is not defined in this module -- presumably
+      # provided by a star import; verify.
+      pa = para()
+      pa.h = h
+      pa.d = d
+      pa.a = a
+      pa.mmatrix = mmatrix
+      pa.qualityPlifs = qualityPlifs
+
+      cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+      #cPickle.dump(pa,open('elegans.param','w+'))
+      self.logfh.close()
+
+def fetch_genome_info(ginfo_filename):
+   """Load pickled genome info, generating it via Matlab on first use.
+
+   NOTE(review): the cache-regeneration branch cannot work as written --
+   several names are undefined in this scope and look like leftovers from a
+   method version of this code: ``gen_file`` (used in the Matlab command),
+   ``self.genome_info`` (dumped below instead of the freshly loaded
+   ``ginfo['genome_info']``) and ``cPickle`` (not imported directly).
+   Confirm and fix before relying on this path.
+   """
+   if not os.path.exists(ginfo_filename):
+      # Build a one-shot Matlab command line that writes genome_info.mat.
+      cmd = ['']*4
+      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+      cmd[3] = 'save genome_info.mat genome_info'  
+      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      assert err == '', 'An error occured!\n%s'%err
+
+      ginfo = scipy.io.loadmat('genome_info.mat')
+      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+      return ginfo['genome_info']
+
+   else:
+      # Cached pickle exists: load and return it directly.
+      return cPickle.load(open(ginfo_filename))
+
+# Script entry point: construct the trainer and run the optimization.
+if __name__ == '__main__':
+   qpalma = QPalma()
+   qpalma.run()
diff --git a/qpalma/qpalma_predict.py b/qpalma/qpalma_predict.py
new file mode 100644 (file)
index 0000000..db3c1cc
--- /dev/null
@@ -0,0 +1,382 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+# 
+#
+###########################################################
+
+import sys
+import subprocess
+import scipy.io
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+import QPalmaDP
+from SIQP_CPX import SIQPSolver
+
+from paths_load_data_pickle import *
+from paths_load_data_solexa import *
+
+from generateEvaluationData import *
+
+from computeSpliceWeights import *
+from set_param_palma import *
+from computeSpliceAlignWithQuality import *
+from penalty_lookup_new import *
+from compute_donacc import *
+from TrainingParam import Param
+from export_param import *
+
+import Configuration as Conf
+from Plif import Plf
+from Helpers import *
+
+def getQualityFeatureCounts(qualityPlifs):
+   """Stack the penalty vectors of all quality Plifs into one column block.
+
+   NOTE(review): uses the bare name ``numpy``, which this module does not
+   import directly -- presumably it arrives via one of the star imports
+   above; verify.
+   """
+   weightQuality = qualityPlifs[0].penalties
+   for currentPlif in qualityPlifs[1:]:
+      weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
+
+   return weightQuality 
+
+class QPalma:
+   """
+   A training method for the QPalma project
+
+   NOTE(review): despite the docstring above (copied from the training
+   script), this class performs *prediction*: it loads a previously trained
+   parameter pickle, decodes the single best alignment per example and
+   compares it against the true alignment.
+   """
+   
+   def __init__(self):
+      self.ARGS = Param()
+
+      self.logfh = open('qpalma.log','w+')
+      # NOTE(review): gen_file is computed here but never used in this scope;
+      # fetch_genome_info() below references an (undefined) global of the
+      # same name -- confirm the intended wiring.
+      gen_file= '%s/genome.config' % self.ARGS.basedir
+
+      ginfo_filename = 'genome_info.pickle'
+      self.genome_info = fetch_genome_info(ginfo_filename)
+
+      self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+
+      #self.ARGS.train_with_splicesitescoreinformation = False
+
+   def plog(self,string):
+      # Append to the log and flush immediately so progress is visible.
+      self.logfh.write(string)
+      self.logfh.flush()
+
+   def run(self):
+      """Predict alignments for all examples using a stored parameter set."""
+      # Load the whole dataset 
+      if Conf.mode == 'normal':
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         use_quality_scores = False
+
+      elif Conf.mode == 'using_quality_scores':
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+         # NOTE(review): leftover debugging breakpoint -- remove for
+         # unattended runs.
+         pdb.set_trace()
+         #Qualities = []
+         #for i in range(len(Ests)):
+         #   Qualities.append([40]*len(Ests[i]))
+         use_quality_scores = True
+      else:
+         assert(False)
+
+      # number of training instances
+      N = len(Sequences) 
+      self.numExamples = N
+      # NOTE(review): 'Acceptors' is checked twice below; the second term was
+      # almost certainly meant to be len(Donors).
+      assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
+      and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+      self.plog('Number of training examples: %d\n'% N)
+      print 'Number of features: %d\n'% Conf.numFeatures
+
+      remove_duplicate_scores = Conf.remove_duplicate_scores 
+      print_matrix            = Conf.print_matrix 
+      anzpath                 = Conf.anzpath 
+
+      # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
+      #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/param_16.pickle'
+      param = load_param(param_filename)
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+      # stores the number of alignments done for each example (best path, second-best path etc.)
+      num_path = [anzpath]*N 
+
+      #############################################################################################
+      # Training
+      #############################################################################################
+      self.plog('Starting training...\n')
+
+      # Feature-vector layout offsets: donor, acceptor, intron-length,
+      # match-matrix and quality support points, in that fixed order.
+      donSP       = Conf.numDonSuppPoints
+      accSP       = Conf.numAccSuppPoints
+      lengthSP    = Conf.numLengthSuppPoints
+      mmatrixSP   = Conf.sizeMatchmatrix[0]\
+      *Conf.sizeMatchmatrix[1]
+      numq        = Conf.numQualSuppPoints
+      totalQualSP = Conf.totalQualSuppPoints
+
+      currentPhi = zeros((Conf.numFeatures,1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      for exampleIdx in range(self.numExamples):
+
+         dna = Sequences[exampleIdx] 
+         est = Ests[exampleIdx] 
+
+         if Conf.mode == 'normal':
+            quality = [40]*len(est)
+
+         if Conf.mode == 'using_quality_scores':
+            quality = Qualities[exampleIdx]
+
+         exons = Exons[exampleIdx] 
+         # NoiseMatrix = Noises[exampleIdx] 
+         don_supp = Donors[exampleIdx] 
+         acc_supp = Acceptors[exampleIdx] 
+
+         # Skip examples whose exon coordinates run past the DNA sequence.
+         if exons[-1,1] > len(dna):
+            continue
+
+         # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
+         trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+         
+         # Calculate the weights
+         trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+         trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+         # Assemble the current feature vector phi in the fixed layout.
+         currentPhi[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+         currentPhi[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+         currentPhi[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+         currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+         if Conf.mode == 'using_quality_scores':
+            totalQualityPenalties = param[-totalQualSP:]
+            currentPhi[donSP+accSP+lengthSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+         # Calculate w'phi(x,y) the total score of the alignment
+         trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+         # The allWeights vector is supposed to store the weight parameter
+         # of the true alignment as well as the weight parameters of the
+         # 1 other alignments
+         allWeights = zeros((Conf.numFeatures,1+1))
+         allWeights[:,0] = trueWeight[:,0]
+
+         AlignmentScores = [0.0]*(1+1)
+         AlignmentScores[0] = trueAlignmentScore
+
+         ################## Calculate wrong alignment(s) ######################
+
+         # Compute donor, acceptor with penalty_lookup_new
+         # returns two double lists
+         donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+         #myalign wants the acceptor site on the g of the ag
+         acceptor = acceptor[1:]
+         acceptor.append(-inf)
+
+         # for now we don't use donor/acceptor scores
+         #donor = [-inf] * len(donor)
+         #acceptor = [-inf] * len(donor)
+
+         dna = str(dna)
+         est = str(est)
+         dna_len = len(dna)
+         est_len = len(est)
+
+         ps = h.convert2SWIG()
+
+         # Marshal all inputs into SWIG-wrapped C arrays for QPalmaDP.
+         prb = QPalmaDP.createDoubleArrayFromList(quality)
+         chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+         matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+         mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+         d_len = len(donor)
+         donor = QPalmaDP.createDoubleArrayFromList(donor)
+         a_len = len(acceptor)
+         acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+         # Create the alignment object representing the interface to the C/C++ code.
+         currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
+
+         c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+         # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+         # (only the single best path is requested here, hence num_path == 1)
+         currentAlignment.myalign(1, dna, dna_len,\
+          est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+          acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+          print_matrix)
+
+         # Pre-allocate C output buffers, then copy results back below.
+         c_SpliceAlign  = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+         c_EstAlign     = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+         c_WeightMatch  = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+         c_DPScores     = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+         c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+         currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+         c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+         newSpliceAlign = zeros((dna_len,1))
+         newEstAlign    = zeros((est_len,1))
+         newWeightMatch = zeros((mm_len,1))
+         newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+         #print 'newSpliceAlign'
+         for i in range(dna_len):
+            newSpliceAlign[i] = c_SpliceAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'newEstAlign'
+         for i in range(est_len):
+            newEstAlign[i] = c_EstAlign[i]
+         #   print '%f' % (spliceAlign[i])
+
+         #print 'weightMatch'
+         for i in range(mm_len):
+            newWeightMatch[i] = c_WeightMatch[i]
+         #   print '%f' % (weightMatch[i])
+
+         newDPScores = c_DPScores[0]
+
+         for i in range(Conf.totalQualSuppPoints):
+            newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+         #  equals palma up to here
+
+         #print "Calling destructors"
+         del c_SpliceAlign
+         del c_EstAlign
+         del c_WeightMatch
+         del c_DPScores
+         del c_qualityPlifsFeatures
+         del currentAlignment
+
+         newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+         newWeightMatch = newWeightMatch.reshape(1,mm_len)
+         # Calculate weights of the respective alignments Note that we are
+         # calculating n-best alignments without hamming loss, so we
+         # have to keep track which of the n-best alignments correspond to
+         # the true one in order not to incorporate a true alignment in the
+         # constraints. To keep track of the true and false alignments we
+         # define an array true_map with a boolean indicating the
+         # equivalence to the true alignment for each decoded alignment.
+         true_map = [0]*2
+         true_map[0] = 1
+         pathNr = 0
+
+         weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+         decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+         for qidx in range(Conf.totalQualSuppPoints):
+            decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+         # Gewichte in restliche Zeilen der Matrix speichern
+         # (store the weights in the remaining rows of the matrix)
+         wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+         allWeights[:,pathNr+1] = wp
+
+         hpen = mat(h.penalties).reshape(len(h.penalties),1)
+         dpen = mat(d.penalties).reshape(len(d.penalties),1)
+         apen = mat(a.penalties).reshape(len(a.penalties),1)
+         features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+
+         AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+         # Check wether scalar product + loss equals viterbi score
+         print '%f vs %f' % (newDPScores, AlignmentScores[pathNr+1])
+
+         # if the pathNr-best alignment is very close to the true alignment consider it as true
+         if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+            true_map[pathNr+1] = 1
+
+         # NOTE(review): leftover debugging breakpoint -- remove for
+         # unattended runs.
+         pdb.set_trace()
+      
+         evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
+
+      print 'Prediction completed'
+      self.logfh.close()
+
+def fetch_genome_info(ginfo_filename):
+   """Load pickled genome info, generating it via Matlab on first use.
+
+   NOTE(review): the cache-regeneration branch cannot work as written --
+   several names are undefined in this scope and look like leftovers from a
+   method version of this code: ``gen_file`` (used in the Matlab command),
+   ``self.genome_info`` (dumped below instead of the freshly loaded
+   ``ginfo['genome_info']``) and ``cPickle`` (not imported directly).
+   Confirm and fix before relying on this path.
+   """
+   if not os.path.exists(ginfo_filename):
+      # Build a one-shot Matlab command line that writes genome_info.mat.
+      cmd = ['']*4
+      cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+      cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+      cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+      cmd[3] = 'save genome_info.mat genome_info'  
+      full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+      obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+      out,err = obj.communicate()
+      assert err == '', 'An error occured!\n%s'%err
+
+      ginfo = scipy.io.loadmat('genome_info.mat')
+      cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+      return ginfo['genome_info']
+
+   else:
+      # Cached pickle exists: load and return it directly.
+      return cPickle.load(open(ginfo_filename))
+
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+   """Flatten the Plif structures back into one parameter column vector.
+
+   Inverse of set_param_palma(): concatenates donor, acceptor,
+   intron-length, match-matrix and per-Plif quality penalties in the fixed
+   feature layout used throughout this module.
+   """
+   donSP       = Conf.numDonSuppPoints
+   accSP       = Conf.numAccSuppPoints
+   lengthSP    = Conf.numLengthSuppPoints
+   mmatrixSP   = Conf.sizeMatchmatrix[0]\
+   *Conf.sizeMatchmatrix[1]
+
+
+   param = zeros((Conf.numFeatures,1))
+   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
+   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
+   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
+   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
+
+   # Append each quality Plif's penalties in order after the fixed sections.
+   for idx in range(len(qualityPlifs)):
+      currentPlif       = qualityPlifs[idx]
+      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+   return param
+
+def load_param(filename):
+   """Unpickle and return a parameter vector from ``filename``.
+
+   NOTE(review): ``cPickle`` is not imported in this module directly --
+   presumably provided by a star import; verify.  No error handling: a
+   missing/corrupt file raises.  The commented-out variant below loaded a
+   Plif-struct pickle and converted it via plifs2param() instead.
+   """
+   param = None 
+   #try:
+   #   para = cPickle.load(open(filename))
+   #except:
+   #   print 'Error: Could not open parameter file!'
+   #   sys.exit(1)
+   #
+   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+   param = cPickle.load(open(filename))
+   return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
+   """Derive exon boundary positions from a decoded splice alignment.
+
+   Scans the flattened SpliceAlign row for transitions into/out of state 0
+   (exonic) and records the boundary positions in ``newExons``.
+
+   NOTE(review): this is a debugging stub -- it ends in pdb.set_trace() and
+   never returns or compares ``newExons`` against ``exons``; the ``dna``,
+   ``est`` and ``newEstAlign`` arguments are currently unused.
+   """
+   newExons = []
+   oldElem = -1
+   SpliceAlign = SpliceAlign.flatten().tolist()[0]
+   # Sentinel so a trailing exon is closed by the final transition check.
+   SpliceAlign.append(-1)
+   for pos,elem in enumerate(SpliceAlign):
+      if pos == 0:
+         oldElem = -1
+      else:
+         oldElem = SpliceAlign[pos-1]
+
+      # Transition into state 0: an exon starts at this position.
+      if oldElem != 0 and elem == 0:
+         newExons.append(pos)
+
+      # Transition out of state 0: the exon ended at the previous position.
+      if oldElem == 0 and elem != 0:
+         newExons.append(pos-1)
+
+   pdb.set_trace()
+
+# Script entry point: construct the predictor and run it.
+if __name__ == '__main__':
+   qpalma = QPalma()
+   qpalma.run()
diff --git a/qpalma/set_param_palma.py b/qpalma/set_param_palma.py
new file mode 100644 (file)
index 0000000..f760031
--- /dev/null
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import numpy.matlib
+import QPalmaDP
+import pdb
+import Configuration
+from Plif import *
+
def set_params_pa():
   # NOTE(review): this function looks unfinished/dead code.  It references
   # the names `model`, `max_intron_len` and `min_intron_len`, none of which
   # is defined in this module; it uses `plf()` (lowercase) while the rest of
   # the file uses `Plf()`; and it builds h/d/a/mmatrix without returning
   # them.  Confirm whether it can be removed or needs `model` passed in.
   h = plf()
   h.len = int(model.intron_len_bins) 
   h.limits = model.intron_len_limits 
   h.penalties = model.intron_len_penalties 
   h.name = 'h'
   h.max_len   = int(max_intron_len)
   h.min_len   = int(min_intron_len)
   h.transform = model.intron_len_transform

   d = plf()
   d.len = int(model.donor_bins)
   d.limits = model.donor_limits
   d.penalties = model.donor_penalties 
   d.name = 'd'
   d.max_len   = 100
   d.min_len   = -100

   a = plf()
   a.len = int(model.acceptor_bins)
   a.limits = model.acceptor_limits 
   a.penalties = model.acceptor_penalties 
   a.name = 'a'
   a.max_len   = 100
   a.min_len   = -100

   # Result is computed but never returned or stored.
   mmatrix = model.substitution_matrix 
+
+
def set_param_palma(param, train_with_intronlengthinformation,
                    min_intron_len=None, max_intron_len=None,
                    min_svm_score=None, max_svm_score=None):
   """Unpack the flat parameter vector into QPalma's scoring functions.

   param is a column vector laid out as
      [intron-length plif | donor plif | acceptor plif | match matrix | quality plifs]
   with the segment sizes taken from the Configuration module.

   Returns (h, d, a, mmatrix, qualityPlifs): the intron-length, donor and
   acceptor piecewise-linear functions, the substitution matrix, and the
   list of per-quality-channel plifs.

   NOTE(review): `linspace`/`logspace` are assumed to come in via
   `from Plif import *` -- confirm.
   """
   print('Setting parameters ...')

   # Default intron-length range: wide only when intron length information
   # is actually being trained.
   if min_intron_len is None:
      if train_with_intronlengthinformation:
         min_intron_len = 20
         max_intron_len = 1000
      else:
         min_intron_len = 1
         max_intron_len = 2

   # NOTE(review): this overwrites any caller-supplied SVM score range as
   # soon as the intron lengths are known (i.e. almost always) -- confirm
   # that is intended.
   if min_intron_len is not None and max_intron_len is not None:
      min_svm_score = -5
      max_svm_score = 5

   # Support point counts per feature group (see Configuration).
   donSP       = Configuration.numDonSuppPoints
   accSP       = Configuration.numAccSuppPoints
   lengthSP    = Configuration.numLengthSuppPoints
   mmatrixSP   = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]

   ####################
   # Gap (intron length) function
   ####################
   h = Plf()
   h.limits    = logspace(math.log(min_intron_len,10), math.log(max_intron_len,10), 30)
   h.penalties = param[0:lengthSP].flatten().tolist()[0]
   h.transform = ''   # '+1' transform deliberately disabled
   h.name      = 'h'
   h.max_len   = 100000
   h.min_len   = 4
   h.id        = 1
   h.use_svm   = 0
   h.next_id   = 0

   ####################
   # Donor function
   ####################
   d = Plf()
   d.limits    = linspace(min_svm_score, max_svm_score, 30)
   d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
   d.transform = ''   # '+1' transform deliberately disabled
   d.name      = 'd'
   d.max_len   = 100
   d.min_len   = -100
   d.id        = 1
   d.use_svm   = 0
   d.next_id   = 0

   ####################
   # Acceptor function
   ####################
   a = Plf()
   a.limits    = linspace(min_svm_score, max_svm_score, 30)
   a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
   a.transform = ''   # '+1' transform deliberately disabled
   a.name      = 'a'
   a.max_len   = 100
   a.min_len   = -100
   a.id        = 1
   a.use_svm   = 0
   a.next_id   = 0

   ####################
   # Match matrix
   ####################
   mmatrixOffset = lengthSP + donSP + accSP
   mmatrix = numpy.matlib.mat(param[mmatrixOffset:mmatrixOffset+mmatrixSP])
   # BUGFIX: reshape() returns a new matrix; the original discarded the
   # result, leaving mmatrix as an (mmatrixSP x 1) column vector.
   mmatrix = mmatrix.reshape(Configuration.sizeMatchmatrix[0],
                             Configuration.sizeMatchmatrix[1])

   ####################
   # Quality plifs
   ####################
   qualityPlifs = [None]*Configuration.numQualPlifs
   qualOffset = mmatrixOffset + mmatrixSP
   for idx in range(Configuration.numQualPlifs):
      currentPlif = Plf()
      currentPlif.limits    = linspace(Configuration.min_qual, Configuration.max_qual, Configuration.numQualSuppPoints)
      begin                 = qualOffset + idx*Configuration.numQualSuppPoints
      end                   = begin + Configuration.numQualSuppPoints
      currentPlif.penalties = param[begin:end].flatten().tolist()[0]
      currentPlif.transform = ''
      currentPlif.name      = 'q'
      currentPlif.max_len   = Configuration.max_qual
      currentPlif.min_len   = Configuration.min_qual
      qualityPlifs[idx] = currentPlif

   return h, d, a, mmatrix, qualityPlifs
+
if __name__ == '__main__':
   # Quick sanity check: print the 30 support points spanning the SVM
   # score range used for the donor/acceptor plifs.
   print(linspace(-5, 5, 30))
diff --git a/qpalma/tools/PyGff.py b/qpalma/tools/PyGff.py
new file mode 100644 (file)
index 0000000..2d26c8a
--- /dev/null
@@ -0,0 +1,27 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
class Gene:
   """Minimal gene model: a genomic span plus exon and UTR coordinate lists."""

   def __init__(self, chr, begin, end, strand, id):
      # Sanity-check the annotation coordinates up front.
      assert chr != ''
      assert 0 <= begin <= end
      assert strand == '+' or strand == '-'

      self.chromosome = chr
      self.start      = begin
      self.stop       = end
      self.strand     = strand
      self.id         = id
      self.exons      = []
      self.fiveUTR    = []
      self.threeUTR   = []

   def addExon(self, start, stop):
      """Record one exon as a (start, stop) tuple."""
      self.exons.append((start, stop))

   def load(self, filename):
      """Not implemented yet."""
      pass

   def save(self, filename):
      """Not implemented yet."""
      pass
diff --git a/qpalma/tools/Solexa.py b/qpalma/tools/Solexa.py
new file mode 100644 (file)
index 0000000..ad48439
--- /dev/null
@@ -0,0 +1,27 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
class Read:
   """One Solexa read: genomic placement plus per-base quality tracks.

   prob, calibratedProb and chastity hold per-base integer scores decoded
   from the raw character encodings (see tools/parseSolexa.py).

   Cleanup: the original declared class-level defaults (chromosome = '',
   position = 0, ...) that were immediately shadowed by the instance
   attributes set in __init__, plus commented-out dead fields; both removed.
   """

   def __init__(self, chr, pos, seq, strand, prob, calibrated, chastity):
      self.chromosome     = chr
      self.position       = pos
      self.sequence       = seq
      self.strand         = strand
      self.prob           = prob
      self.calibratedProb = calibrated
      self.chastity       = chastity
diff --git a/qpalma/tools/createTestSet.py b/qpalma/tools/createTestSet.py
new file mode 100644 (file)
index 0000000..67f6bf9
--- /dev/null
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+from PyGff import *
+import cPickle
+
+info=\
+"""
+You have to supply two files. One containing the gff information and the other
+one containing the information of the Solexa(R) reads.
+
+Usage: ./createTestSet.py gff.pickle reads.txt
+"""
+
+doc=\
+"""
+Make sure that you split the data into chromosome files beforehand because
+this method does not check the chromosome info
+"""
+
def printRow(row):
   """Print the fields of *row* joined by single spaces.

   Simplification: `"".join([elem+' ' for elem in row]).strip()` is exactly
   `" ".join(row).strip()` -- same output for every input, one pass, no
   intermediate list.
   """
   print(" ".join(row).strip())
+
def check(annotation,reads):
   """Stream Solexa reads and print the rows that are candidate spliced
   reads: reads that lie inside a gene but are not fully contained in any
   single exon, or that straddle a gene boundary.

   annotation -- iterable of Gene objects.  NOTE(review): the __main__
                 below unpickles the dict produced by parseGff.py; iterating
                 a dict yields gene *ids* (strings), not Gene objects --
                 should this iterate annotation.values()?  Confirm the
                 pickle format.
   reads      -- open file with 12 tab-separated columns per read
   """
   skippedReadsCounter = 0

   reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
   for pos,row in enumerate(reader):
      # Malformed rows are counted and skipped.
      if len(row) != 12:
         skippedReadsCounter += 1
         continue

      # Progress report.  Here `pos` is still the row index; it is re-bound
      # to the read's genomic position a few lines below (confusing reuse).
      if pos % 10000 == 0:
         print >> sys.stderr, pos

      chr = row[0]
      pos = int(row[1])
      readStart = pos
      # NOTE(review): read length 35 is hard-coded here and below -- TODO
      # parameterize.
      readStop = pos+35

      for gene in annotation:
         # read inside current gene positions

         if gene.start <= readStart and pos+35 <= gene.stop:
            overlapping = True
            
            # Cleared when the read fits inside one exon; if it survives,
            # the read spans an exon/intron boundary.
            for exonStart,exonStop in gene.exons:
               if exonStart <= readStart and readStop <= exonStop:
                  overlapping = False
                  break
         
            if overlapping:
               printRow(row)
               # NOTE(review): this `continue` (and the two below) targets
               # the inner `for gene` loop, so one row can be printed once
               # per matching gene -- was `break` intended?  Confirm.
               continue

         # read is overlapping with gene boundaries
         if pos < gene.start and pos+35 > gene.start:
            printRow(row)
            continue

         if pos < gene.stop and pos+35 > gene.stop:
            printRow(row)
            continue

   # Summary: number of malformed rows that were skipped.
   print skippedReadsCounter
+      
if __name__ == '__main__':
   # Usage: createTestSet.py gff.pickle reads.txt
   assert len(sys.argv) >= 3, info
   annotFile, readsFile = sys.argv[1], sys.argv[2]
   for path in (annotFile, readsFile):
      assert os.path.exists(path), 'File %s does not exist!' % path
   sys.stderr.write(doc + '\n')

   check(cPickle.load(open(annotFile)), open(readsFile))
diff --git a/qpalma/tools/parseGff.py b/qpalma/tools/parseGff.py
new file mode 100644 (file)
index 0000000..b01bbdc
--- /dev/null
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+import re
+from PyGff import *
+import cPickle
+import copy
+
def parse(gff_fid):
   """Parse a GFF3 stream into a dict of Gene objects keyed by gene id.

   gff_fid -- open tab-separated GFF file (9 columns per row)

   Only 'gene', 'pseudogene' and 'exon' records are acted on; a fixed set
   of other feature types is recognized and ignored, and any unknown type
   aborts with an assertion so new types are noticed.

   Cleanup: removed a stray debug `print desc` that polluted stdout, and
   collapsed eleven `elif ...: pass` branches into one membership test.
   """
   reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

   # Feature types that carry no information we keep.
   ignoredFeatures = ('five_prime_UTR', 'three_prime_UTR', 'mRNA', 'CDS',
                      'ncRNA', 'pseudogenic_exon', 'pseudogenic_transcript',
                      'miRNA', 'rRNA', 'snoRNA', 'snRNA', 'tRNA')

   allGenes = {}
   currentGene = None

   # Matches the ID attribute, e.g. 'ID=AT1G01010;' -> gene id 'AT1G01010'.
   rx = re.compile('ID=[^;]*;',re.DOTALL)

   for row in reader:
      assert len(row) == 9
      chr = row[0]
      id = row[2]
      start = int(row[3])
      stop = int(row[4])
      strand = row[6]
      desc = row[8]

      if id == 'chromosome':
         continue

      if id == 'gene':
         # A new gene record closes the previous one.
         if currentGene is not None:
            allGenes[currentGene.id] = currentGene
         geneId = rx.search(desc).group()[3:-1]
         currentGene = Gene(chr, start, stop, strand, geneId)

      elif id == 'exon':
         assert currentGene is not None
         currentGene.addExon(start, stop)

      elif id == 'pseudogene':
         # A pseudogene closes the current gene without opening a new one.
         if currentGene is not None:
            allGenes[currentGene.id] = currentGene
            currentGene = None

      elif id in ignoredFeatures:
         pass

      else:
         assert False, 'Error: Unknown identifier \'%s\'' % id

   # Flush the final gene.
   if currentGene is not None:
      allGenes[currentGene.id] = currentGene

   return allGenes
+      
if __name__ == '__main__':
   # Usage: parseGff.py <gff-file> <output-pickle>
   assert len(sys.argv) >= 3
   annotFile, pickleFile = sys.argv[1], sys.argv[2]
   assert os.path.exists(annotFile)
   assert not os.path.exists(pickleFile)

   gffFile = open(annotFile)
   outFile = open(pickleFile, 'w+')
   allGenes = parse(gffFile)
   for geneId in allGenes:
      print(geneId)
   cPickle.dump(allGenes, outFile)
diff --git a/qpalma/tools/parseSolexa.py b/qpalma/tools/parseSolexa.py
new file mode 100644 (file)
index 0000000..dd62b37
--- /dev/null
@@ -0,0 +1,53 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import cPickle
+import csv
+from Solexa import Read
+
def parse(solexa_fid,pickle_fid):
   """Convert a raw Solexa read file into a pickled list of Read objects.

   solexa_fid -- open file with 12 tab-separated columns per read
   pickle_fid -- open, writable file receiving the cPickle dump

   The three quality strings (columns 9-11) are decoded character-wise into
   integer scores.  NOTE(review): the offsets -50 / -64 / +10 look
   format-specific -- confirm against the Solexa pipeline documentation.
   NOTE(review): the read position (column 1) is kept as a *string*, while
   other tools compare positions numerically -- confirm.

   Cleanup: removed six unused locals and the dead `currentRead` indirection.
   """
   reader = csv.reader(solexa_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

   # 'P' = reverse strand, 'D' = forward strand in the raw file.
   stmap = {'P':'-','D':'+'}

   allReads = []
   for row in reader:
      assert len(row) == 12
      chr = row[0]
      pos = row[1]
      seq = row[2]
      int(row[3])   # read id -- unused, conversion kept so malformed ids still raise
      strand = stmap[row[4]]
      # Columns 5-8 (mismatch, repeat, length, deletion) are currently unused.

      prob           = [ord(elem)-50 for elem in row[9]]
      calibratedProb = [ord(elem)-64 for elem in row[10]]
      chastity       = [ord(elem)+10 for elem in row[11]]

      allReads.append(Read(chr, pos, seq, strand, prob, calibratedProb, chastity))

   cPickle.dump(allReads, pickle_fid)
+
if __name__ == '__main__':
   # Usage: parseSolexa.py <solexa-reads-file> <output-pickle>
   assert len(sys.argv) >= 3
   solexaFile, pickleFile = sys.argv[1], sys.argv[2]
   assert os.path.exists(solexaFile)
   assert not os.path.exists(pickleFile)

   parse(open(solexaFile), open(pickleFile, 'w+'))