+++ /dev/null
-SRCS= Mathmatics.cpp\
- fill_matrix.cpp\
- qpalma_dp.cpp\
- result_align.cpp\
- penalty_info.cpp\
- print_align.cpp\
- io.cpp
-
-HDRS= Mathmatics.h\
- common.h\
- config.h\
- debug_tools.h\
- fill_matrix.h\
- io.h\
- penalty_info.h\
- qpalma_dp.h
-
-OBJS = $(SRCS:%.cpp=%.o)
-
-#CXXFLAGS=-O3 -fPIC
-#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
-CXXFLAGS=-O3 -fPIC -ggdb
-
-IF=QPalmaDP
-
-all: $(OBJS) $(HDRS)
- swig -c++ -python ${IF}.i
- g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
- g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
- python -c "import ${IF}"
-
-clean:
- rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
+++ /dev/null
-// Math.cpp: implementation of the CMath class.
-//
-//////////////////////////////////////////////////////////////////////
-
-
-#include "Mathmatics.h"
-#include "io.h"
-
-#include <sys/time.h>
-#include <sys/types.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <time.h>
-#include <unistd.h>
-#include <assert.h>
-
-//////////////////////////////////////////////////////////////////////
-// Construction/Destruction
-//////////////////////////////////////////////////////////////////////
-
-#ifdef USE_LOGCACHE
-//gene/math specific constants
-#ifdef USE_HMMDEBUG
-#define MAX_LOG_TABLE_SIZE 10*1024*1024
-#define LOG_TABLE_PRECISION 1e-6
-#else
-#define MAX_LOG_TABLE_SIZE 123*1024*1024
-#define LOG_TABLE_PRECISION 1e-15
-#endif
-
-INT CMath::LOGACCURACY = 0; // 100000 steps per integer
-#endif
-
-INT CMath::LOGRANGE = 0; // range for logtable: log(1+exp(x)) -25 <= x <= 0
-
-#ifdef USE_PATHDEBUG
-const REAL CMath::INFTY = 1e11; // infinity
-#else
-const REAL CMath::INFTY = -log(0.0); // infinity
-#endif
-const REAL CMath::ALMOST_NEG_INFTY = -1000;
-
-CHAR CMath::rand_state[256];
-
-CMath::CMath()
-{
- struct timeval tv;
- gettimeofday(&tv, NULL);
- UINT seed=(UINT) (4223517*getpid()*tv.tv_sec*tv.tv_usec);
- initstate(seed, CMath::rand_state, sizeof(CMath::rand_state));
- CIO::message(M_INFO, "seeding random number generator with %u\n", seed);
-
-#ifdef USE_LOGCACHE
- LOGRANGE=CMath::determine_logrange();
- LOGACCURACY=CMath::determine_logaccuracy(LOGRANGE);
- CIO::message(M_INFO, "Initializing log-table (size=%i*%i*%i=%2.1fMB) ...",LOGRANGE,LOGACCURACY,sizeof(REAL),LOGRANGE*LOGACCURACY*sizeof(REAL)/(1024.0*1024.0)) ;
-
- CMath::logtable=new REAL[LOGRANGE*LOGACCURACY];
- init_log_table();
- CIO::message(M_INFO, "Done.\n") ;
-#else
- INT i=0;
- while ((REAL)log(1+((REAL)exp(-REAL(i)))))
- i++;
- CIO::message(M_INFO, "determined range for x in log(1+exp(-x)) is:%d\n", i);
- LOGRANGE=i;
-#endif
-}
-
-CMath::~CMath()
-{
-#ifdef USE_LOGCACHE
- delete[] logtable;
-#endif
-}
-
-#ifdef USE_LOGCACHE
-INT CMath::determine_logrange()
-{
- INT i;
- REAL acc=0;
- for (i=0; i<50; i++)
- {
- acc=((REAL)log(1+((REAL)exp(-REAL(i)))));
- if (acc<=(REAL)LOG_TABLE_PRECISION)
- break;
- }
-
- CIO::message(M_INFO, "determined range for x in table log(1+exp(-x)) is:%d (error:%G)\n",i,acc);
- return i;
-}
-
-INT CMath::determine_logaccuracy(INT range)
-{
- range=MAX_LOG_TABLE_SIZE/range/((int)sizeof(REAL));
- CIO::message(M_INFO, "determined accuracy for x in table log(1+exp(-x)) is:%d (error:%G)\n",range,1.0/(double) range);
- return range;
-}
-
-//init log table of form log(1+exp(x))
-void CMath::init_log_table()
-{
- for (INT i=0; i< LOGACCURACY*LOGRANGE; i++)
- logtable[i]=log(1+exp(REAL(-i)/REAL(LOGACCURACY)));
-}
-#endif
-
-void CMath::sort(INT *a, INT cols, INT sort_col)
-{
- INT changed=1;
- if (a[0]==-1) return ;
- while (changed)
- {
- changed=0; INT i=0 ;
- while ((a[(i+1)*cols]!=-1) && (a[(i+1)*cols+1]!=-1)) // to be sure
- {
- if (a[i*cols+sort_col]>a[(i+1)*cols+sort_col])
- {
- for (INT j=0; j<cols; j++)
- CMath::swap(a[i*cols+j],a[(i+1)*cols+j]) ;
- changed=1 ;
- } ;
- i++ ;
- } ;
- } ;
-}
-
-void CMath::sort(REAL *a, INT* idx, INT N)
-{
-
- INT changed=1;
- while (changed)
- {
- changed=0;
- for (INT i=0; i<N-1; i++)
- {
- if (a[i]>a[i+1])
- {
- swap(a[i],a[i+1]) ;
- swap(idx[i],idx[i+1]) ;
- changed=1 ;
- } ;
- } ;
- } ;
-
-}
-
-
-
-//plot x- axis false positives (fp) 1-Specificity
-//plot y- axis true positives (tp) Sensitivity
-INT CMath::calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile)
-{
- INT left=0;
- INT right=size-1;
- INT i;
-
- for (i=0; i<size; i++)
- {
- if (!(label[i]== -1 || label[i]==1))
- return -1;
- }
-
- //sort data such that -1 labels first +1 behind
- while (left<right)
- {
- while ((label[left] < 0) && (left<right))
- left++;
- while ((label[right] > 0) && (left<right)) //warning: label must be either -1 or +1
- right--;
-
- swap(output[left],output[right]);
- swap(label[left], label[right]);
- }
-
- // neg/pos sizes
- negsize=left;
- possize=size-left;
- REAL* negout=output;
- REAL* posout=&output[left];
-
- // sort the pos and neg outputs separately
- qsort(negout, negsize);
- qsort(posout, possize);
-
- // set minimum+maximum values for decision-treshhold
- REAL minimum=min(negout[0], posout[0]);
- REAL maximum=minimum;
- if (negsize>0)
- maximum=max(maximum,negout[negsize-1]);
- if (possize>0)
- maximum=max(maximum,posout[possize-1]);
-
- REAL treshhold=minimum;
- REAL old_treshhold=treshhold;
-
- //clear array.
- for (i=0; i<size; i++)
- {
- fp[i]=1.0;
- tp[i]=1.0;
- }
-
- //start with fp=1.0 tp=1.0 which is posidx=0, negidx=0
- //everything right of {pos,neg}idx is considered to beLONG to +1
- INT posidx=0;
- INT negidx=0;
- INT iteration=1;
- INT returnidx=-1;
-
- REAL minerr=10;
-
- while (iteration < size && treshhold<=maximum)
- {
- old_treshhold=treshhold;
-
- while (treshhold==old_treshhold && treshhold<=maximum)
- {
- if (posidx<possize && negidx<negsize)
- {
- if (posout[posidx]<negout[negidx])
- {
- if (posout[posidx]==treshhold)
- posidx++;
- else
- treshhold=posout[posidx];
- }
- else
- {
- if (negout[negidx]==treshhold)
- negidx++;
- else
- treshhold=negout[negidx];
- }
- }
- else
- {
- if (posidx>=possize && negidx<negsize-1)
- {
- negidx++;
- treshhold=negout[negidx];
- }
- else if (negidx>=negsize && posidx<possize-1)
- {
- posidx++;
- treshhold=posout[posidx];
- }
- else if (negidx<negsize && treshhold!=negout[negidx])
- treshhold=negout[negidx];
- else if (posidx<possize && treshhold!=posout[posidx])
- treshhold=posout[posidx];
- else
- {
- treshhold=2*(maximum+100); // force termination
- posidx=possize;
- negidx=negsize;
- break;
- }
- }
- }
-
- //calc tp,fp
- tp[iteration]=(possize-posidx)/(REAL (possize));
- fp[iteration]=(negsize-negidx)/(REAL (negsize));
-
- //choose poINT with minimal error
- if (minerr > negsize*fp[iteration]/size+(1-tp[iteration])*possize/size )
- {
- minerr=negsize*fp[iteration]/size+(1-tp[iteration])*possize/size;
- tresh=(old_treshhold+treshhold)/2;
- returnidx=iteration;
- }
-
- iteration++;
- }
-
- // set new size
- size=iteration;
-
- if (rocfile)
- {
- const CHAR id[]="ROC";
- fwrite(id, sizeof(char), sizeof(id), rocfile);
- fwrite(fp, sizeof(REAL), size, rocfile);
- fwrite(tp, sizeof(REAL), size, rocfile);
- }
-
- return returnidx;
-}
-
-UINT CMath::crc32(BYTE *data, INT len)
-{
- UINT result;
- INT i,j;
- BYTE octet;
-
- result = 0-1;
-
- for (i=0; i<len; i++)
- {
- octet = *(data++);
- for (j=0; j<8; j++)
- {
- if ((octet >> 7) ^ (result >> 31))
- {
- result = (result << 1) ^ 0x04c11db7;
- }
- else
- {
- result = (result << 1);
- }
- octet <<= 1;
- }
- }
-
- return ~result;
-}
-
-double CMath::mutual_info(REAL* p1, REAL* p2, INT len)
-{
- double e=0;
-
- for (INT i=0; i<len; i++)
- for (INT j=0; j<len; j++)
- e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]);
-
- return e;
-}
-
-double CMath::relative_entropy(REAL* p, REAL* q, INT len)
-{
- double e=0;
-
- for (INT i=0; i<len; i++)
- e+=exp(p[i])*(p[i]-q[i]);
-
- return e;
-}
-
-double CMath::entropy(REAL* p, INT len)
-{
- double e=0;
-
- for (INT i=0; i<len; i++)
- e-=exp(p[i])*p[i];
-
- return e;
-}
+++ /dev/null
-#ifndef __MATHMATICS_H_
-#define __MATHMATICS_H_
-
-#include "common.h"
-#include "io.h"
-
-#include <math.h>
-#include <stdio.h>
-
-//define finite for win32
-#ifdef _WIN32
-#include <float.h>
-#define finite _finite
-#define isnan _isnan
-#endif
-
-#ifndef NAN
-#include <stdlib.h>
-#define NAN (strtod("NAN",NULL))
-#endif
-
-#ifdef SUNOS
-extern "C" int finite(double);
-#endif
-
-/** Mathematical Functions.
- * Class which collects generic mathematical functions
- */
-class CMath
-{
-public:
- /**@name Constructor/Destructor.
- */
- //@{
- ///Constructor - initializes log-table
- CMath();
-
- ///Destructor - frees logtable
- virtual ~CMath();
- //@}
-
- /**@name min/max/abs functions.
- */
- //@{
-
- ///return the minimum of two integers
- //
- template <class T>
- static inline T min(T a, T b)
- {
- return ((a)<=(b))?(a):(b);
- }
-
- ///return the maximum of two integers
- template <class T>
- static inline T max(T a, T b)
- {
- return ((a)>=(b))?(a):(b);
- }
-
- ///return the maximum of two integers
- template <class T>
- static inline T abs(T a)
- {
- return ((a)>=(0))?(a):(-a);
- }
- //@}
-
- /**@name misc functions */
- //@{
-
- /// crc32
- static UINT crc32(BYTE *data, INT len);
-
- static inline REAL round(REAL d)
- {
- return floor(d+0.5);
- }
-
- /// signum of type T variable a
- template <class T>
- static inline REAL sign(REAL a)
- {
- if (a==0)
- return 0;
- else return (a<0) ? (-1) : (+1);
- }
-
- /// swap floats a and b
- template <class T>
- static inline void swap(T & a,T &b)
- {
- T c=a ;
- a=b; b=c ;
- }
-
- /// x^2
- template <class T>
- static inline T sq(T x)
- {
- return x*x;
- }
-
- static inline LONG factorial(INT n)
- {
- LONG res=1 ;
- for (int i=2; i<=n; i++)
- res*=i ;
- return res ;
- }
-
- static inline LONG nchoosek(INT n, INT k)
- {
- long res=1 ;
-
- for (INT i=n-k+1; i<=n; i++)
- res*=i ;
-
- return res/factorial(k) ;
- }
-
- /** performs a bubblesort on a given matrix a.
- * it is sorted from in ascending order from top to bottom
- * and left to right */
- static void sort(INT *a, INT cols, INT sort_col=0) ;
- static void sort(REAL *a, INT*idx, INT N) ;
-
- /** performs a quicksort on an array output of length size
- * it is sorted from in ascending (for type T) */
- template <class T>
- static void qsort(T* output, INT size) ;
-
- /** performs a quicksort on an array output of length size
- * it is sorted from in ascending order
- * (for type T1) and returns the index (type T2)
- * matlab alike [sorted,index]=sort(output)
- */
- template <class T1,class T2>
- static void qsort(T1* output, T2* index, INT size);
-
- /** performs a quicksort on an array output of length size
- * it is sorted from in ascending order
- * (for type T1) and returns the index (type T2)
- * matlab alike [sorted,index]=sort(output)
- */
- template <class T1,class T2>
- static void qsort_backward(T1* output, T2* index, INT size);
-
- /* finds the smallest element in output and puts that element as the
- first element */
- template <class T>
- static void min(REAL* output, T* index, INT size);
-
- /* finds the n smallest elements in output and puts these elements as the
- first n elements */
- template <class T>
- static void nmin(REAL* output, T* index, INT size, INT n);
-
- /* performs a inplace unique of a vector of type T using quicksort
- * returns the new number of elements */
- template <class T>
- static INT unique(T* output, INT size)
- {
- qsort(output, size) ;
- INT i,j=0 ;
- for (i=0; i<size; i++)
- if (i==0 || output[i]!=output[i-1])
- output[j++]=output[i] ;
- return j ;
- }
-
- /* finds an element in a sorted array via binary search
- * returns -1 if not found */
- static inline INT fast_find(WORD* output, INT size, WORD elem)
- {
- INT start=0, end=size-1, middle=size/2 ;
-
- if (output[start]>elem || output[end]<elem)
- return -1 ;
-
- while (1)
- {
- if (output[middle]>elem)
- {
- end = middle ;
- middle=start+(end-start)/2 ;
- } ;
- if (output[middle]<elem)
- {
- start = middle ;
- middle=start+(end-start)/2 ;
- }
- if (output[middle]==elem)
- return middle ;
- if (end-start<=1)
- {
- if (output[start]==elem)
- return start ;
- if (output[end]==elem)
- return end ;
- return -1 ;
- }
- }
- }
-
- /* finds an element in a sorted array via binary search
- * that is smaller-equal elem und the next element is larger-equal */
- static inline INT fast_find_range(REAL* output, INT size, REAL elem)
- {
- INT start=0, end=size-2, middle=(size-2)/2 ;
-
- if (output[start]>elem)
- return -1 ;
- if (output[end]<=elem)
- return size-1 ;
-
- while (1)
- {
- if (output[middle]>elem)
- {
- end = middle ;
- middle=start+(end-start)/2 ;
- } ;
- if (output[middle]<elem)
- {
- start = middle ;
- middle=start+(end-start)/2 ;
- }
- if (output[middle]<=elem && output[middle+1]>=elem)
- return middle ;
- if (end-start<=1)
- {
- if (output[start]<=elem && output[start+1]>=elem)
- return start ;
- return end ;
- }
- }
- }
-
- /** calculates ROC into (fp,tp)
- * from output and label of length size
- * returns index with smallest error=fp+fn
- */
- static INT calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile);
- //@}
-
- /// returns the mutual information of p which is given in logspace
- /// where p,q are given in logspace
- static double mutual_info(REAL* p1, REAL* p2, INT len);
-
- /// returns the relative entropy H(P||Q),
- /// where p,q are given in logspace
- static double relative_entropy(REAL* p, REAL* q, INT len);
-
- /// returns entropy of p which is given in logspace
- static double entropy(REAL* p, INT len);
-
- /**@name summing functions */
- //@{
- /** sum logarithmic probabilities.
- * Probability measures are summed up but are now given in logspace where
- * direct summation of exp(operand) is not possible due to numerical problems, i.e. eg. exp(-1000)=0. Therefore
- * we do log( exp(a) + exp(b)) = a + log (1 + exp (b-a)) where a is max(p,q) and b min(p,q). */
-#ifdef USE_LOGCACHE
- static inline REAL logarithmic_sum(REAL p, REAL q)
- {
- if (finite(p))
- {
- if (finite(q))
- {
-
- register REAL diff=p-q;
-
- if (diff>0) //p>q
- {
- if (diff > LOGRANGE)
- return p;
- else
- return p + logtable[(int)(diff*LOGACCURACY)];
- }
- else //p<=q
- {
- if (-diff > LOGRANGE)
- return q;
- else
- return q + logtable[(int)(-diff*LOGACCURACY)];
- }
- }
- CIO::message("INVALID second operand to logsum(%f,%f) expect undefined results\n", p, q);
- return NAN;
- }
- else
- return q;
- }
-
- ///init log table of form log(1+exp(x))
- static void init_log_table();
-
- /// determine INT x for that log(1+exp(-x)) == 0
- static INT determine_logrange();
-
- /// determine accuracy, such that the thing fits into MAX_LOG_TABLE_SIZE, needs logrange as argument
- static INT determine_logaccuracy(INT range);
-#else
- /*
- inline REAL logarithmic_sum(REAL p, REAL q)
- {
- double result=comp_logarithmic_sum(p,q);
-
- printf("diff:%f <-> %f\n",p-q, result);
- return result;
- }*/
-
- static inline REAL logarithmic_sum(REAL p, REAL q)
- {
- if (finite(p))
- {
- if (finite(q))
- {
-
- register REAL diff=p-q;
- if (diff>0) //p>q
- {
- if (diff > LOGRANGE)
- return p;
- else
- return p + log(1+exp(-diff));
- }
- else //p<=q
- {
- if (-diff > LOGRANGE)
- return q;
- else
- return q + log(1+exp(diff));
- }
- }
- return p;
- }
- else
- return q;
- }
-#endif
-#ifdef LOG_SUM_ARRAY
- /** sum up a whole array of values in logspace.
- * This function addresses the numeric instabiliy caused by simply summing up N elements by adding
- * each of the elements to some variable. Instead array neighbours are summed up until one element remains.
- * Whilst the number of additions remains the same, the error is only in the order of log(N) instead N.
- */
- static inline REAL logarithmic_sum_array(REAL *p, INT len)
- {
- if (len<=2)
- {
- if (len==2)
- return logarithmic_sum(p[0],p[1]) ;
- if (len==1)
- return p[0];
- return -INFTY ;
- }
- else
- {
- register REAL *pp=p ;
- if (len%2==1) pp++ ;
- for (register INT j=0; j < len>>1; j++)
- pp[j]=logarithmic_sum(pp[j<<1], pp[1+(j<<1)]) ;
- }
- return logarithmic_sum_array(p,len%2+len>>1) ;
- }
-#endif
- //@}
-public:
- /**@name constants*/
- //@{
- /// infinity
- static const REAL INFTY;
-
- /// almost neg (log) infinity
- static const REAL ALMOST_NEG_INFTY;
-
- /// range for logtable: log(1+exp(x)) -LOGRANGE <= x <= 0
- static INT LOGRANGE;
-
-#ifdef USE_LOGCACHE
- /// number of steps per integer
- static INT LOGACCURACY;
- //@}
-protected:
- ///table with log-values
- static REAL* logtable;
-#endif
- static CHAR rand_state[256];
-};
-
-
-//implementations of template functions
-template <class T>
-void CMath::qsort(T* output, INT size)
-{
- if (size==2)
- {
- if (output[0] > output [1])
- swap(output[0],output[1]);
- }
- else
- {
- REAL split=output[(size*rand())/(RAND_MAX+1)];
- //REAL split=output[size/2];
-
- INT left=0;
- INT right=size-1;
-
- while (left<=right)
- {
- while (output[left] < split)
- left++;
- while (output[right] > split)
- right--;
-
- if (left<=right)
- {
- swap(output[left],output[right]);
- left++;
- right--;
- }
- }
-
- if (right+1> 1)
- qsort(output,right+1);
-
- if (size-left> 1)
- qsort(&output[left],size-left);
- }
-}
-
-template <class T1,class T2>
-void CMath::qsort(T1* output, T2* index, INT size)
-{
- if (size==2)
- {
- if (output[0] > output [1]){
- swap(output[0],output[1]);
- swap(index[0],index[1]);
- }
-
- }
- else
- {
- REAL split=output[(size*rand())/(RAND_MAX+1)];
- //REAL split=output[size/2];
-
- INT left=0;
- INT right=size-1;
-
- while (left<=right)
- {
- while (output[left] < split)
- left++;
- while (output[right] > split)
- right--;
-
- if (left<=right)
- {
- swap(output[left],output[right]);
- swap(index[left],index[right]);
- left++;
- right--;
- }
- }
-
- if (right+1> 1)
- qsort(output,index,right+1);
-
- if (size-left> 1)
- qsort(&output[left],&index[left], size-left);
- }
-}
-
-template <class T1,class T2>
-void CMath::qsort_backward(T1* output, T2* index, INT size)
-{
- if (size==2)
- {
- if (output[0] < output [1]){
- swap(output[0],output[1]);
- swap(index[0],index[1]);
- }
-
- }
- else
- {
- REAL split=output[(size*rand())/(RAND_MAX+1)];
- //REAL split=output[size/2];
-
- INT left=0;
- INT right=size-1;
-
- while (left<=right)
- {
- while (output[left] > split)
- left++;
- while (output[right] < split)
- right--;
-
- if (left<=right)
- {
- swap(output[left],output[right]);
- swap(index[left],index[right]);
- left++;
- right--;
- }
- }
-
- if (right+1> 1)
- qsort(output,index,right+1);
-
- if (size-left> 1)
- qsort(&output[left],&index[left], size-left);
- }
-}
-
-template <class T>
-void CMath::nmin(REAL* output, T* index, INT size, INT n)
-{
- if (6*n*size<13*size*log((double)size))
- for (INT i=0; i<n; i++)
- min(&output[i], &index[i], size-i) ;
- else
- qsort(output, index, size) ;
-}
-
-template <class T>
-void CMath::min(REAL* output, T* index, INT size)
-{
- if (size<=0)
- return ;
- REAL min_elem = output[0] ;
- INT min_index = 0 ;
- for (INT i=1; i<size; i++)
- if (output[i]<min_elem)
- {
- min_index=i ;
- min_elem=output[i] ;
- }
- swap(output[0], output[min_index]) ;
- swap(index[0], index[min_index]) ;
-}
-#endif
+++ /dev/null
-%module QPalmaDP
-%{
-#define SWIG_FILE_WITH_INIT
-#include "common.h"
-#include "Mathmatics.h"
-#include "penalty_info.h"
-#include "qpalma_dp.h"
-%}
-
-%include "std_vector.i"
-%include "std_string.i"
-
-%include "carrays.i"
-
-%include "common.h"
-%include "Mathmatics.h"
-%include "penalty_info.h"
-%include "qpalma_dp.h"
-
-%array_class(int, intArray) ;
-%array_class(double, doubleArray) ;
-%array_functions(double, doubleFArray) ;
-%array_functions(struct penalty_struct, penaltyArray) ;
-
-/*
-%array_functions(int, intArray) ;
-%array_functions(double, doubleArray) ;
-*/
-%array_class(Pre_score, Pre_scoreArray) ;
-
-%pythoncode
-%{
-
-def createPenaltyArrayFromList(list):
- array = new_penaltyArray(len(list))
- for i in range(len(list)):
- penaltyArray_setitem(array, i, list[i])
- return array
-
-def createDoubleArrayFromList(list):
- #array = new_doubleArray(len(list))
- array = doubleArray(len(list))
- for i in range(len(list)):
- # doubleArray_setitem(array, i, list[i])
- array[i] = list[i]
- return array
-
-def createIntArrayFromList(list):
- #array = new_intArray(len(list))
- array = intArray(len(list))
- for i in range(len(list)):
- #intArray_setitem(array, i, list[i])
- array[i] = list[i]
- return array
-
-def createListFromIntArray(array, array_len):
- list = [0]*array_len
- for i in range(array_len):
- list[i] = intArray_getitem(array,i)
- return list
-
-def createListFromDoubleArray(array, array_len):
- list = [0]*array_len
- for i in range(array_len):
- list[i] = doubleArray_getitem(array,i)
- return list
-
-%}
+++ /dev/null
-#ifndef __COMMON_H__
-#define __COMMON_H__
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "config.h"
-
-#ifdef SUNOS
-#define bool int
-#define false 0
-#define true 1
-#endif
-
-#ifndef LINUX
-#define RANDOM_MAX 2147483647
-#else
-#define RANDOM_MAX RAND_MAX
-#endif
-
-/**@name Standard Types
- * Definition of Platform independent Types
-*/
-//@{
-
-/// Type CHAR
-typedef char CHAR;
-typedef CHAR* P_CHAR;
-
-/// Type BYTE
-typedef unsigned char BYTE;
-typedef BYTE* P_BYTE;
-
-/// Type SHORT is 2 bytes in size
-typedef short int SHORT;
-typedef SHORT* P_SHORT;
-
-/// Type WORD is 2 bytes in size
-typedef unsigned short int WORD;
-typedef WORD* P_WORD;
-
-/// Type INT is 4 bytes in size
-typedef int INT;
-typedef INT* P_INT;
-
-/// Type INT is 4 bytes in size
-typedef unsigned int UINT;
-typedef UINT* P_UINT;
-
-/// Type LONG is 8 bytes in size
-typedef long LONG;
-typedef LONG* P_LONG;
-
-/// Type SHORTREAL is 4 bytes in size
-typedef float SHORTREAL;
-typedef SHORTREAL* P_SHORTREAL;
-
-/// Type REAL is 8 bytes in size
-typedef double REAL;
-typedef REAL* P_REAL;
-
-/// Type LONGREAL is 16 bytes in size
-//typedef long double LONGREAL;
-//typedef LONGREAL* P_LONGREAL;
-
-#ifdef USE_SHORTREAL_KERNELCACHE
- typedef SHORTREAL KERNELCACHE_ELEM;
-#else
- typedef REAL KERNELCACHE_ELEM;
-#endif
-
-typedef KERNELCACHE_ELEM P_KERNELCACHE_ELEM;
-
-typedef LONG KERNELCACHE_IDX;
-
-/// The io libs output [DEBUG] etc in front of every CIO::message
-/// 'higher' messages filter output depending on the loglevel, i.e. CRITICAL messages
-/// will print all M_CRITICAL TO M_EMERGENCY messages to
-enum EMessageType
-{
- M_DEBUG,
- M_INFO,
- M_NOTICE,
- M_WARN,
- M_ERROR,
- M_CRITICAL,
- M_ALERT,
- M_EMERGENCY,
- M_MESSAGEONLY,
- M_PROGRESS
-};
-
-enum EKernelType
-{
- K_UNKNOWN = 0,
- K_OPTIMIZABLE = 4096,
- K_LINEAR = 10 | K_OPTIMIZABLE,
- K_POLY = 20,
- K_GAUSSIAN = 30,
- K_HISTOGRAM = 40,
- K_SALZBERG = 41,
- K_LOCALITYIMPROVED = 50,
- K_SIMPLELOCALITYIMPROVED = 60,
- K_FIXEDDEGREE = 70,
- K_WEIGHTEDDEGREE = 80 | K_OPTIMIZABLE,
- K_WEIGHTEDDEGREEPOS = 81 | K_OPTIMIZABLE,
- K_WEIGHTEDDEGREEPOLYA = 82,
- K_WD = 83,
- K_COMMWORD = 90 | K_OPTIMIZABLE ,
- K_POLYMATCH = 100,
- K_ALIGNMENT = 110,
- K_COMMWORDSTRING = 120 | K_OPTIMIZABLE,
- K_SPARSENORMSQUARED = 130,
- K_COMBINED = 140 | K_OPTIMIZABLE
-};
-
-enum EFeatureType
-{
- F_UNKNOWN = 0,
- F_REAL = 10,
- F_SHORT = 20,
- F_CHAR = 30,
- F_INT = 40,
- F_BYTE = 50,
- F_WORD = 60
-};
-
-enum EFeatureClass
-{
- C_UNKNOWN = 0,
- C_SIMPLE = 10,
- C_SPARSE = 20,
- C_STRING = 30,
- C_COMBINED = 40
-};
-
-/// Alphabet of charfeatures/observations
-enum E_ALPHABET
-{
- /// DNA - letters A,C,G,T,*,N,n
- DNA=0,
-
- /// PROTEIN - letters a-z
- PROTEIN=1,
-
- /// ALPHANUM - [0-9a-z]
- ALPHANUM=2,
-
- /// CUBE - [1-6]
- CUBE=3,
-
- /// NONE - type has no alphabet
- NONE=4
-};
-
-//@}
-
-#define TMP_DIR "/tmp/"
-//#define TMP_DIR "/short/x46/tmp/"
-
-#endif
+++ /dev/null
-
-#define HAVE_MATLAB 1
-
-#define HAVE_LARGEFILE 1
-
-
-
-
-
-
-#define USE_BIGSTATES 1
-
-
-#define USE_HMMCACHE 1
-
-
-
-
-
+++ /dev/null
-#ifndef __DEBUG_TOOLS__
-#define __DEBUG_TOOLS__
-
-#include <cstdio>
-
-void fassert(bool exp,int line, char* file) {
- if (! exp) {
- printf("invalid fassert at line %d in file %s!\n",line,file);
- exit(EXIT_FAILURE);
- }
-}
-
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-
-#endif // __DEBUG_TOOLS__
+++ /dev/null
-/*********************************************************************************************/
-// Version of fill_matrix with fills several alignment matrices (best, second_best ...)
-// nr_paths det. number of these matrices
-//
-// est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
-// dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
-//
-// !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
-// for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
-/*********************************************************************************************/
-
-/*
- matchmatrix: columnwise
- - A C G T N (dna)
- - x x x x x x
- A x x x x x x
- C x x x x x x
- G x x x x x x
- T x x x x x x
- N x x x x x x
-(est)
-*/
-
-/*
- alignment matrix: columnwise
- |-EST . . .
- -+------------
- -|00 ...
- D|0.
- N|. .
- A|. .
- .|.
- .|
- .|
-*/
-
-
-/* TODO
- - max_len = 10000 for elegans!
- - do not use double
-*/
-
-#include "fill_matrix.h"
-
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-
-using namespace std;
-
-int check_char(char base)
-{
- switch(base)
- {
- case 'A':
- case 'a': return 1;
- case 'C':
- case 'c': return 2;
- case 'G':
- case 'g': return 3;
- case 'T':
- case 't': return 4;
- case 'N':
- case 'n': return 5;
- //default : return 0;
- default : return -1;
- }
-
-}
-
- /* system of Palma. Otherwise scoring_functions should point to an array of
- * plifs scoring the respective pairs of characters together with the EST
- * quality score.
- */
-
-double getScore(struct penalty_struct* qualityScores, int mlen, int dnaChar, int estChar, double baseScore) {
- double score = 0.0;
- FA(estChar > 0 && estChar < 6);
- FA(dnaChar > -1 && dnaChar < 6);
-
- //printf("Starting scoring / scheme is USE_QUALITY_SCORES\n");
- //printf("estChar,dnaChar are: %d,%d,%d",estChar,dnaChar);
-
- int currentPos = (estChar-1)*6+dnaChar;
- struct penalty_struct currentPen = qualityScores[currentPos];
-
- INT idx = 0 ;
- for (INT i=0; i<currentPen.len; i++)
- if (currentPen.limits[i] <= baseScore)
- idx++ ;
-
- if (idx==0)
- score=currentPen.penalties[0] ;
- else if (idx==currentPen.len)
- score=currentPen.penalties[currentPen.len-1] ;
- else
- {
- score = (currentPen.penalties[idx]*(baseScore-currentPen.limits[idx-1]) + currentPen.penalties[idx-1]*
- (currentPen.limits[idx]-baseScore)) / (currentPen.limits[idx]-currentPen.limits[idx-1]) ;
- }
-
- //score = lookup_penalty(¤tPen,idx, NULL, 0);
- //if(baseScore <= currentPen.limits[0]) {
- // score = currentPen.penalties[0];
- //} else {
- // int sp_idx;
- // for(sp_idx=0;sp_idx<currentPen.len;sp_idx++) {
- // if(currentPen.limits[sp_idx] <= baseScore )
- // score = currentPen.penalties[sp_idx];
- // //printf("Current score is %f\n",score);
- // }
- //}
-
- return score;
-}
-
-void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, double* prb, penalty_struct* functions , double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode)
-{
- //printf("Entering fill_matrix...\n");
-
- /*********************************************************************************************/
- // variables
- /*********************************************************************************************/
- const int mlen = 6; // length of matchmatrix
- int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
-
- double baseScore;
- double *est_scores = prb ;
-
- double prevValue ;
- double tempValue;
- int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
- //printf("max_len: %d\n", max_len) ;
-
- int num_elem ; // counter variable
-
- const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
- double *scores=new double[max_num_elem] ; //are sorted later
- struct ArrayElem *array=new ArrayElem[max_num_elem] ;
- //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
-
- int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
- int splice_pos ; //splice position get from donor_sites
- int start_splice_pos ; //where to start looking in donor_sites
-
- double max_scores[nr_paths] ;
-
- /*********************************************************************************************/
- // Initialisation of all alignment matrices, columnwise
- /*********************************************************************************************/
- for (int z=0; z<nr_paths; z++) /* matrice z */
- {
- max_scores[z] = log(0.0) ;
- max_score_positions[2*z] = 0; // pos i
- max_score_positions[2*z +1] = 0; // pos j
-
- for (int i=0; i<est_len; i++) /* i.column */
- {
- for (int j=0; j<dna_len; j++) /* j.line */
- {
- ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
- ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
- ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
- ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
- ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
- }
- }
- }
-
- /*********************************************************************************************/
- // Element (0,0) for first/best matrice (z=0)
- /*********************************************************************************************/
- ((Pre_score*)matrices[0])->value = 0;
- ((Pre_score*)matrices[0])->prev_i = 0;
- ((Pre_score*)matrices[0])->prev_j = 0;
- ((Pre_score*)matrices[0])->prev_matrix_no = 0;
- ((Pre_score*)matrices[0])->issplice = 0;
-
-
- /*********************************************************************************************/
- // Line Zero for first/best matrice (z=0)
- /*********************************************************************************************/
- for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
- {
- ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
- ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
- ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
- ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
- ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
- }
-
-
- /*********************************************************************************************/
- // Column Zero for first/best matrice (z=0)
- /*********************************************************************************************/
- for (int j=1; j<dna_len; j++) // 0.column, all lines
- {
- ((Pre_score*)matrices[0] +j)->value = 0 ;
- ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
- ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
- ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
- ((Pre_score*)matrices[0] +j)->issplice = 0;
- }
-
-
- /*********************************************************************************************/
- // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
- /*********************************************************************************************/
-
-
- //printf("after initialization\n");
-
- /*********************************************************************************************/
- // all other entries for all matrices, last column and row treated seperately
- /*********************************************************************************************/
-
- /* columnwise, 0.column and 0.line already filled */
- for (int i=1; i<est_len; i++) /* i.column */
- {
-
- start_splice_pos = 0 ; //where to start looking in donor_sites
- for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
-
- for (int j=1; j<dna_len; j++) //j.line
- {
- dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
- estChar = check_char(est[i-1]) ;
- baseScore = est_scores[i-1];
-
- //printf("at iteration %d %d\n",i,j);
-
- //if introns of length greater than max_len possible
- if (j > max_len)
- {
- //storing highest ss_score (the position) and NOT -INF
- int new_ss_pos = j-max_len ;
- double donor_score = donor[new_ss_pos-1] ;
- if (isfinite(donor_score)) //splice site!
- {
- if (max_ss_element[nr_paths-1] != 0)
- {
- for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
- {
- int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
- double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
- if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
- for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
- {
- int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
- assert(isfinite(donor[act_splice_position-1])) ;
- double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
- if (new_intron_score >= act_intron_score)
- {
- new_intron_score = act_intron_score ;
- pos_with_minscore = pos ;
- } //else nothing happens, new intron score still the worst one
- }
- if (pos_with_minscore != nr_paths)
- {
- max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
- }
- }
- }
- else
- {
- for (int pos=0; pos<nr_paths; pos++)
- {
- if (max_ss_element[pos]==0) //no previous splice site at this pos
- {
- for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
- break ;
- }
- }
- }
- }
- //start_pos in donor_sites so that only introns shorter then max_length are seen
- if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
- {
- start_splice_pos++ ;
- }
- }
-
-
- /* Computing all possible scores */
- num_elem = 0 ; /* number of different scores/possibilities so far */
- for (int z=0; z<nr_paths; z++) /* matrice z */
- {
- Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
-
- /***************************************/
- // 0. Zero (local alignment version)
- /***************************************/
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = 0;
- array[num_elem].prev_j = 0;
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = false; /* no splice site */
- scores[num_elem] = 0 ;
- num_elem++ ;
-
-
- /***************************************/
- // 1. gap on EST (j-1,i) -> (j,i)
- /***************************************/
- prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
- if (isfinite(prevValue))
- {
-
- if( currentMode == USE_QUALITY_SCORES ) {
- tempValue = prevValue + matchmatrix[dnaChar];
- } else {
- tempValue = prevValue + matchmatrix[mlen*dnaChar]; /* score(gap,DNA) */
- }
-
-
- if (isfinite(tempValue))
- {
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = i;
- array[num_elem].prev_j = j-1; /* predecessor */
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = false; /* no splice site */
- scores[num_elem] = -tempValue ;
- num_elem++ ;
- }
- }
-
-
- /***************************************/
- // 2. match/mismatch (j-1,i-1) -> (j,i)
- /***************************************/
- prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
- if (isfinite(prevValue))
- {
- if( currentMode == USE_QUALITY_SCORES ) {
- tempValue = prevValue + getScore(qualityScores,mlen,dnaChar,estChar,baseScore);
- } else {
- tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);
- }
-
- if (isfinite(tempValue))
- {
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = i-1; /* predecessor */
- array[num_elem].prev_j = j-1; /* predecessor */
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = false; /* no splice site */
- scores[num_elem] = -tempValue ;
- num_elem++ ;
- }
- }
-
-
- /***************************************/
- // 3. gap on DNA (j,i-1) -> (j,i)
- /***************************************/
- prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
- if (isfinite(prevValue))
- {
-
- if( currentMode == USE_QUALITY_SCORES ) {
- tempValue = prevValue + getScore(qualityScores,mlen,0,estChar,baseScore);
- } else {
- tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
- }
-
- if (isfinite(tempValue))
- {
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = i-1; /* predecessor */
- array[num_elem].prev_j = j;
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = false; /* no splice site */
- scores[num_elem]=-tempValue ;
- num_elem++ ;
- }
- }
-
- /***************************************/
- // 4. all possible splice sites/introns
- /***************************************/
- // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
- // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
- // donor_sites: the g's of the gt/gc
-
- if (isfinite(acceptor[j-1]))
- {
-
- //the nr_paths many introns of length greater than max_len
- for (int pos=0; pos<nr_paths; pos++)
- {
- if (max_ss_element[z*nr_paths+pos]>0)
- {
- assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
- splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
- prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
- if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
- {
- int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
- assert(dist>max_len) ;
- tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
- if (isfinite(tempValue))
- {
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = i;
- array[num_elem].prev_j = splice_pos-1; // predecessor
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = true; // splice site
- scores[num_elem]=-tempValue ;
- num_elem++ ;
- }
- else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
- {
- printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
- sleep(5000) ;
- }
- }
- }
- }
-
- //all introns of length <= max_len
- for (int k=start_splice_pos; k < nr_donor_sites; k++)
- //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
- {
- splice_pos = donor_sites[k] ;
- if (splice_pos > j-4) { break ;}
- prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
-
- if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
- {
- int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
- assert(dist<=max_len) ;
- tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
- if (isfinite(tempValue))
- {
- assert(num_elem<max_num_elem) ;
- array[num_elem].prev_i = i;
- array[num_elem].prev_j = splice_pos-1; /* predecessor */
- array[num_elem].prev_matrix_no = z;
- array[num_elem].isSplice = true; /* splice site */
- scores[num_elem]=-tempValue ;
- num_elem++ ;
- }
- else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
- {
- printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
- sleep(5000) ;
- }
- }
- }
- }
-
- } //end of z
-
- CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
-
- /* the nr_paths many highest scores are written in the nr_paths many matrices*/
- for (int z=0; z<nr_paths; z++) {
- if (z>=num_elem) {
- //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
- }
- else {
- ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
- ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
- ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
- ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
- ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
-
- if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
- max_scores[z] = -scores[z] ;
- max_score_positions[2*z] = i; // pos i
- max_score_positions[2*z +1] = j; // pos j
- }
- }
- } //end of z
-
- } //end of j
- } // end of i
-
- //free memory
- delete[] array;
- delete[] scores;
-
- //printf("Leaving fill_matrix...\n");
- return;
-}
+++ /dev/null
-#ifndef _FILLMATRIX_H__
-#define _FILLMATRIX_H__
-
-#include "Mathmatics.h"
-#include "penalty_info.h"
-#include <algorithm> //Eigentlich in Header
-#include <assert.h>
-#include <ctype.h>
-#include <iostream>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-
-struct ArrayElem { //24B
- double score;
- int prev_j;
- int prev_i;
- int prev_matrix_no;
- bool isSplice;
-} ;
-
-struct pre_score { //24B
- double value; //8
- int prev_i; //4
- int prev_j; //4
- int prev_matrix_no;//4
- bool issplice;//4
-};
-
-typedef struct pre_score Pre_score;
-
-typedef struct Align_pair { //24B
- double value ;
- bool issplice;
- int previ,prevj ;
- char est_char;
- char dna_char;
-};
-
-
-void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions);
-// funktion aendern in void result_align(Pre_score* matrices[], int anzpath,
-
-int check_char(char base);
-
-bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions );
-
-extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix);
-
-double getScore(double *matchmatrix,int mlen, int dnaChar, int estChar);
-
-
-double scoreMatch(int dnaChar, int estChar, double baseScore);
-
-
-#endif // _FILLMATRIX_H__
+++ /dev/null
-#include "io.h"
-#include "common.h"
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <ctype.h>
-
-FILE* CIO::target=stdout;
-
-CIO::CIO()
-{
-}
-
-void CIO::message(EMessageType prio, const CHAR *fmt, ... )
-{
- check_target();
- print_message_prio(prio, target);
- va_list list;
- va_start(list,fmt);
- vfprintf(target,fmt,list);
- va_end(list);
- fflush(target);
-}
-
-void CIO::buffered_message(EMessageType prio, const CHAR *fmt, ... )
-{
- check_target();
- print_message_prio(prio, target);
- va_list list;
- va_start(list,fmt);
- vfprintf(target,fmt,list);
- va_end(list);
-}
-
-CHAR* CIO::skip_spaces(CHAR* str)
-{
- INT i=0;
-
- if (str)
- {
- for (i=0; isspace(str[i]); i++);
-
- return &str[i];
- }
- else
- return str;
-}
-
-void CIO::set_target(FILE* t)
-{
- target=t;
-}
-
-void CIO::check_target()
-{
- if (!target)
- target=stdout;
-}
-
-void CIO::print_message_prio(EMessageType prio, FILE* target)
-{
- switch (prio)
- {
- case M_DEBUG:
- fprintf(target, "[DEBUG] ");
- break;
- case M_PROGRESS:
- fprintf(target, "[PROGRESS] ");
- break;
- case M_INFO:
- //fprintf(target, "[INFO] ");
- break;
- case M_NOTICE:
- fprintf(target, "[NOTICE] ");
- break;
- case M_WARN:
- fprintf(target, "[WARN] ");
- break;
- case M_ERROR:
- fprintf(target, "[ERROR] ");
- break;
- case M_CRITICAL:
- fprintf(target, "[CRITICAL] ");
- break;
- case M_ALERT:
- fprintf(target, "[ALERT] ");
- break;
- case M_EMERGENCY:
- fprintf(target, "[EMERGENCY] ");
- break;
- case M_MESSAGEONLY:
- break;
- default:
- break;
- }
-}
+++ /dev/null
-#ifndef __CIO_H__
-#define __CIO_H__
-
-#include "common.h"
-
-#include <stdio.h>
-#include <stdarg.h>
-
-class CIO
-{
-public:
- CIO();
-
- static void set_target(FILE* target);
- static void message(EMessageType prio, const CHAR *fmt, ... );
-
- inline static void not_implemented()
- {
- message(M_ERROR, "Sorry, not yet implemented\n");
- }
-
- static void buffered_message(EMessageType prio, const CHAR *fmt, ... );
-
- static CHAR* skip_spaces(CHAR* str);
-
-protected:
- static void check_target();
- static void print_message_prio(EMessageType prio, FILE* target);
-
-protected:
- static FILE* target;
-};
-#endif
+++ /dev/null
-#include <assert.h>
-#include "config.h"
-//#include "features/CharFeatures.h"
-//#include "features/StringFeatures.h"
-
-#include <stdio.h>
-#include <string.h>
-
-#include "io.h"
-
-
-#include "fill_matrix.h"
-#include "penalty_info.h"
-
-void init_penalty_struct(struct penalty_struct &PEN)
-{
- PEN.limits=NULL ;
- PEN.penalties=NULL ;
- PEN.id=-1 ;
- PEN.next_pen=NULL ;
- PEN.transform = T_LINEAR ;
- PEN.name = NULL ;
- PEN.max_len=0 ;
- PEN.min_len=0 ;
- PEN.cache=NULL ;
- PEN.use_svm=0 ;
-}
-
-void init_penalty_struct_cache(struct penalty_struct &PEN)
-{
- if (PEN.cache || PEN.use_svm)
- return ;
-
- REAL* cache=new REAL[PEN.max_len+1] ;
- if (cache)
- {
- for (INT i=0; i<=PEN.max_len; i++)
- cache[i] = lookup_penalty(&PEN, i, 0, false) ;
- PEN.cache = cache ;
- }
-}
-
-void delete_penalty_struct_palma(struct penalty_struct &PEN)
-{
- if (PEN.id!=-1)
- {
- delete[] PEN.limits ;
- delete[] PEN.penalties ;
- delete[] PEN.name ;
- delete[] PEN.cache ;
- }
-}
-
-void delete_penalty_struct_array(struct penalty_struct *PEN, INT len)
-{
- for (int i=0; i<len; i++)
- delete_penalty_struct_palma(PEN[i]) ;
- delete[] PEN ;
-}
-
-void copy_penalty_struct(struct penalty_struct *old, struct penalty_struct *newp) {
- newp->len = old->len;
- newp->limits = old->limits;
- newp->penalties = old->penalties;
- newp->max_len = old->max_len;
- newp->min_len = old->min_len;
- newp->cache = old->cache;
- newp->transform = old->transform;
- newp->id = old->id;
- newp->next_pen = old->next_pen;
- newp->name = old->name;
- newp->use_svm = old->use_svm;
-}
-
-
-//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, int &P)
-//{
-// P = mxGetN(mx_penalty_info) ;
-//
-// struct penalty_struct * PEN = new struct penalty_struct[P] ;
-// for (INT i=0; i<P; i++)
-// init_penalty_struct(PEN[i]) ;
-//
-// for (INT i=0; i<P; i++)
-// {
-// const mxArray* mx_elem = mxGetCell(mx_penalty_info, i) ;
-// if (mx_elem==NULL || !mxIsStruct(mx_elem))
-// {
-// CIO::message(M_ERROR, "empty cell element\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// } ;
-// const mxArray* mx_id_field = mxGetField(mx_elem, 0, "id") ;
-// if (mx_id_field==NULL || !mxIsNumeric(mx_id_field) ||
-// mxGetN(mx_id_field)!=1 || mxGetM(mx_id_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing id field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_limits_field = mxGetField(mx_elem, 0, "limits") ;
-// if (mx_limits_field==NULL || !mxIsNumeric(mx_limits_field) ||
-// mxGetM(mx_limits_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing limits field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// INT len = mxGetN(mx_limits_field) ;
-//
-// const mxArray* mx_penalties_field = mxGetField(mx_elem, 0, "penalties") ;
-// if (mx_penalties_field==NULL || !mxIsNumeric(mx_penalties_field) ||
-// mxGetM(mx_penalties_field)!=1 || mxGetN(mx_penalties_field)!=len)
-// {
-// CIO::message(M_ERROR, "missing penalties field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_transform_field = mxGetField(mx_elem, 0, "transform") ;
-// if (mx_transform_field==NULL || !mxIsChar(mx_transform_field))
-// {
-// CIO::message(M_ERROR, "missing transform field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_name_field = mxGetField(mx_elem, 0, "name") ;
-// if (mx_name_field==NULL || !mxIsChar(mx_name_field))
-// {
-// CIO::message(M_ERROR, "missing name field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_max_len_field = mxGetField(mx_elem, 0, "max_len") ;
-// if (mx_max_len_field==NULL || !mxIsNumeric(mx_max_len_field) ||
-// mxGetM(mx_max_len_field)!=1 || mxGetN(mx_max_len_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing max_len field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_min_len_field = mxGetField(mx_elem, 0, "min_len") ;
-// if (mx_min_len_field==NULL || !mxIsNumeric(mx_min_len_field) ||
-// mxGetM(mx_min_len_field)!=1 || mxGetN(mx_min_len_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing min_len field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// const mxArray* mx_use_svm_field = mxGetField(mx_elem, 0, "use_svm") ;
-// if (mx_use_svm_field==NULL || !mxIsNumeric(mx_use_svm_field) ||
-// mxGetM(mx_use_svm_field)!=1 || mxGetN(mx_use_svm_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing use_svm field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// INT use_svm = (INT) mxGetScalar(mx_use_svm_field) ;
-// //fprintf(stderr, "use_svm_field=%i\n", use_svm) ;
-//
-// const mxArray* mx_next_id_field = mxGetField(mx_elem, 0, "next_id") ;
-// if (mx_next_id_field==NULL || !mxIsNumeric(mx_next_id_field) ||
-// mxGetM(mx_next_id_field)!=1 || mxGetN(mx_next_id_field)!=1)
-// {
-// CIO::message(M_ERROR, "missing next_id field\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// INT next_id = (INT) mxGetScalar(mx_next_id_field)-1 ;
-//
-// INT id = (INT) mxGetScalar(mx_id_field)-1 ;
-// if (i<0 || i>P-1)
-// {
-// CIO::message(M_ERROR, "id out of range\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// INT max_len = (INT) mxGetScalar(mx_max_len_field) ;
-// if (max_len<0 || max_len>1024*1024*100)
-// {
-// CIO::message(M_ERROR, "max_len out of range\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// PEN[id].max_len = max_len ;
-//
-// INT min_len = (INT) mxGetScalar(mx_min_len_field) ;
-// if (min_len<0 || min_len>1024*1024*100)
-// {
-// CIO::message(M_ERROR, "min_len out of range\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// PEN[id].min_len = min_len ;
-//
-// if (PEN[id].id!=-1)
-// {
-// CIO::message(M_ERROR, "penalty id already used\n") ;
-// delete_penalty_struct_array(PEN,P) ;
-// return NULL ;
-// }
-// PEN[id].id=id ;
-// if (next_id>=0)
-// PEN[id].next_pen=&PEN[next_id] ;
-// //fprintf(stderr,"id=%i, next_id=%i\n", id, next_id) ;
-//
-// assert(next_id!=id) ;
-// PEN[id].use_svm=use_svm ;
-// PEN[id].limits = new REAL[len] ;
-// PEN[id].penalties = new REAL[len] ;
-// double * limits = mxGetPr(mx_limits_field) ;
-// double * penalties = mxGetPr(mx_penalties_field) ;
-//
-// for (INT i=0; i<len; i++)
-// {
-// PEN[id].limits[i]=limits[i] ;
-// PEN[id].penalties[i]=penalties[i] ;
-// }
-// PEN[id].len = len ;
-//
-// char *transform_str = mxArrayToString(mx_transform_field) ;
-// char *name_str = mxArrayToString(mx_name_field) ;
-//
-// if (strcmp(transform_str, "log")==0)
-// PEN[id].transform = T_LOG ;
-// else if (strcmp(transform_str, "log(+1)")==0)
-// PEN[id].transform = T_LOG_PLUS1 ;
-// else if (strcmp(transform_str, "log(+3)")==0)
-// PEN[id].transform = T_LOG_PLUS3 ;
-// else if (strcmp(transform_str, "(+3)")==0)
-// PEN[id].transform = T_LINEAR_PLUS3 ;
-// else if (strcmp(transform_str, "")==0)
-// PEN[id].transform = T_LINEAR ;
-// else
-// {
-// delete_penalty_struct_array(PEN,P) ;
-// mxFree(transform_str) ;
-// return NULL ;
-// }
-// PEN[id].name = new char[strlen(name_str)+1] ;
-// strcpy(PEN[id].name, name_str) ;
-//
-// init_penalty_struct_cache(PEN[id]) ;
-//
-// mxFree(transform_str) ;
-// mxFree(name_str) ;
-// }
-// return PEN ;
-//}
-
-REAL lookup_penalty_svm(const struct penalty_struct *PEN, INT p_value, REAL *d_values)
-{
- if (PEN==NULL)
- return 0 ;
- assert(PEN->use_svm>0) ;
- REAL d_value=d_values[PEN->use_svm-1] ;
- //fprintf(stderr,"transform=%i, d_value=%1.2f\n", (INT)PEN->transform, d_value) ;
-
- switch (PEN->transform)
- {
- case T_LINEAR:
- break ;
- case T_LOG:
- d_value = log(d_value) ;
- break ;
- case T_LOG_PLUS1:
- d_value = log(d_value+1) ;
- break ;
- case T_LOG_PLUS3:
- d_value = log(d_value+3) ;
- break ;
- case T_LINEAR_PLUS3:
- d_value = d_value+3 ;
- break ;
- default:
- CIO::message(M_ERROR, "unknown transform\n") ;
- break ;
- }
-
- INT idx = 0 ;
- REAL ret ;
- for (INT i=0; i<PEN->len; i++)
- if (PEN->limits[i]<=d_value)
- idx++ ;
-
- if (idx==0)
- ret=PEN->penalties[0] ;
- else if (idx==PEN->len)
- ret=PEN->penalties[PEN->len-1] ;
- else
- {
- ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]*
- (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ;
- }
-
- //fprintf(stderr,"ret=%1.2f\n", ret) ;
-
- if (PEN->next_pen)
- ret+=lookup_penalty(PEN->next_pen, p_value, d_values);
-
- //fprintf(stderr,"ret=%1.2f\n", ret) ;
-
- return ret ;
-}
-
-REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value,
- REAL* svm_values, bool follow_next)
-{
- if (PEN==NULL)
- return 0 ;
- if (PEN->use_svm)
- return lookup_penalty_svm(PEN, p_value, svm_values) ;
-
- if ((p_value<PEN->min_len) || (p_value>PEN->max_len))
- return -CMath::INFTY ;
-
- if (PEN->cache!=NULL && (p_value>=0) && (p_value<=PEN->max_len))
- {
- REAL ret=PEN->cache[p_value] ;
- if (PEN->next_pen && follow_next)
- ret+=lookup_penalty(PEN->next_pen, p_value, svm_values);
- return ret ;
- }
-
- REAL d_value = (REAL) p_value ;
- switch (PEN->transform)
- {
- case T_LINEAR:
- break ;
- case T_LOG:
- d_value = log(d_value) ;
- break ;
- case T_LOG_PLUS1:
- d_value = log(d_value+1) ;
- break ;
- case T_LOG_PLUS3:
- d_value = log(d_value+3) ;
- break ;
- case T_LINEAR_PLUS3:
- d_value = d_value+3 ;
- break ;
- default:
- CIO::message(M_ERROR, "unknown transform\n") ;
- break ;
- }
-
- INT idx = 0 ;
- REAL ret ;
- for (INT i=0; i<PEN->len; i++)
- if (PEN->limits[i]<=d_value)
- idx++ ;
-
- if (idx==0)
- ret=PEN->penalties[0] ;
- else if (idx==PEN->len)
- ret=PEN->penalties[PEN->len-1] ;
- else
- {
- ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]*
- (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ;
- }
- //if (p_value>=30 && p_value<150)
- //fprintf(stderr, "%s %i(%i) -> %1.2f\n", PEN->name, p_value, idx, ret) ;
-
- if (PEN->next_pen && follow_next)
- ret+=lookup_penalty(PEN->next_pen, p_value, svm_values);
-
- return ret ;
-}
+++ /dev/null
-#ifndef __PENALTY_INFO_H__
-#define __PENALTY_INFO_H__
-
-#include "common.h"
-#include "Mathmatics.h"
-
-enum ETransformType
-{
- T_LINEAR,
- T_LOG,
- T_LOG_PLUS1,
- T_LOG_PLUS3,
- T_LINEAR_PLUS3
-} ;
-
-struct penalty_struct
-{
- INT len ;
- REAL *limits ;
- REAL *penalties ;
- INT max_len ;
- INT min_len ;
- REAL *cache ;
- enum ETransformType transform ;
- INT id ;
- struct penalty_struct *next_pen ;
- char * name ;
- INT use_svm ;
-} ;
-
-void init_penalty_struct(struct penalty_struct &PEN) ;
-void delete_penalty_struct_palma(struct penalty_struct &PEN) ;
-//void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) ;
-void copy_penalty_struct(struct penalty_struct *old, struct penalty_struct *newp);
-
-//#ifdef HAVE_MATLAB
-//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, mwSize &P) ;
-//#endif
-
-REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value,
- REAL* svm_values, bool follow_next=true) ;
-
-enum mode { NORMAL, USE_QUALITY_SCORES};
-
-#endif
+++ /dev/null
-#include "fill_matrix.h"
-using namespace std;
-
-void print_align(Pre_score* matrix, int length_est, int length_dna,Align_pair* vektor, int result_length, int print_matrix)
-
-{
- //Druck der matrix
- switch(print_matrix)
- {
- case 1:
- {
- cout <<"Matrix: \n";
-
- for (int k = 0; k<length_est; k++)//k Zeile, l Spalte
- {
- cout << "Zeile " << k << ":\t";
- for (int l= 0; l<length_dna; l++)
- {
- cout << ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
- }
- cout <<"\n";
- }
- break;
- }
-
- case 2:
- {
- cout <<"Matrix: \n";
-
- for (int k = 0; k<length_est; k++)//k Zeile, l Spalte
- {
- cout << "Zeile " << k << ":\t";
- for (int l= 0; l<length_dna; l++)
- {
- if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k-1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1))
- cout << "\\" <<((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
- else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1))
- cout << "<"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
- else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k -1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l))
- cout << "|"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
- else { cout << ((Pre_score*)matrix + (k*length_dna) + l)->value << "(";
- cout << ((Pre_score*)matrix + (k*length_dna) + l)->prev_i << "," << ((Pre_score*)matrix + (k*length_dna) + l)->prev_j << ")\t";
- }
-
- }
- cout <<"\n";
- }
-
- cout << "length_est" << length_est << "\n";
- cout << "length_dna" << length_dna << "\n";
- cout << "result_length: " << result_length << "\n";
-
-
- break;
- }
-
- case 3:
- {
- //Druck des Alignment
-
- // cout << "Alignment\n" ;
- int j = 0;
- // cout << "resultlength: " << result_length <<"\n";
- while (j<result_length) {
- int pacman = j;
- cout << "D:(" <<j<<")\t";
- for (int i=0; ((j<result_length) && (i<50)); i++)
- {
- cout << vektor[j].dna_char << " ";
- j++;
- }
-
- cout << "\n";
-
- cout << "E:(" <<pacman<<")\t";
- for (int i=0;((pacman<result_length) && (i<50)); i++)
- {
- cout << vektor[pacman].est_char << " ";
- pacman++;
- }
- cout << "\n\n";
- }
-
- //Ende Druck Alignment
- break ;
- }
-
- default:
- {
- cout <<"";
- }
- }
-
-
-
-}
-
+++ /dev/null
-#include "qpalma_dp.h"
-#include <cstring>
-
-using namespace std;
-
-/*[splice_align, est_align, weightMatch, alignmentscores, dnaest] = ...
- myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ...
- print_matrix) */
-
-Alignment::Alignment(int numQPlifs, int numq, bool use_qscores) {
- len = 0;
- limits = 0;
- penalties = 0;
- max_len = 0;
- min_len = 0;
- cache = 0;
- enum ETransformType transform = T_LINEAR;
- id = 0;
- name = 0;
- use_svm = 0;
-
- // set ptrs to zero first
- splice_align = 0;
- est_align = 0;
- mmatrix_param = 0;
- alignmentscores = 0;
- qualityScoresAllPaths = 0;
- mlen = 6; // score matrix: length of 6 for "- A C G T N"
-
- numQualSuppPoints = numq;
- numPlifs = numQPlifs;
- use_quality_scores = use_qscores;
-
- //printf("number of support points: %d\n",numQualSuppPoints);
- //printf("number of plifs: %d\n",numPlifs );
- FA( numQualSuppPoints >= 0 );
- FA( numPlifs >= 0 );
-}
-
-void Alignment::getDNAEST(){}
-
-void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
- int est_len_p, double* prb, double* chastity, struct penalty_struct h, double* matchmatrix, int mm_len,
- double* donor, int d_len, double* acceptor, int a_len, struct penalty_struct* qualityScores,
- bool remove_duplicate_scores, bool print_matrix) {
-
- // printf("Entering myalign...\n");
- dna_len = dna_len_p + 1 ;
- est_len = est_len_p + 1 ;
- nr_paths = nr_paths_p;
-
- /***************************************************************************/
- // initialize alignment matrices and call fill_matrix()
- /***************************************************************************/
-
- mode currentMode;
- if (use_quality_scores)
- currentMode = USE_QUALITY_SCORES;
- else
- currentMode = NORMAL;
-
- // dnaest
- double *DNA_ARRAY = 0;
- double *EST_ARRAY = 0;
-
- //possible donor positions
- int nr_donor_sites = 0 ;
- for (int ii=0; ii<d_len; ii++) {
- if(isfinite(donor[ii])) {
- nr_donor_sites++ ;
- }
- }
-
- int* donor_sites = new int[nr_donor_sites];
- int donor_idx = 0 ;
- for (int ii=0; ii<d_len; ii++) {
- if(isfinite(donor[ii])) {
- donor_sites[donor_idx] = ii+1 ;
- donor_idx++ ;
- }
- }
-
- int* max_score_positions = new int[nr_paths*2];
-
- Pre_score** matrices = new Pre_score*[nr_paths];
- for (int z=0; z<nr_paths; z++) {
- matrices[z] = new Pre_score[dna_len * est_len];
- }
-
- //printf("qualityScores\n");
- //for(int qidx=0;qidx<numPlifs;qidx++) {
- // penalty_struct p;
- // p = qualityScores[qidx];
- // printf("%d ",qidx);
- // for(int pidx=0;pidx<p.len;pidx++)
- // printf("%f ",pidx,p.penalties[pidx]);
- // printf("\n");
- //}
-
- //printf("calling fill_matrix...\n");
- fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &h, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions,currentMode);
- //printf("after call to fill_matrix...\n");
-
- /***************************************************************************/
- // return arguments etc.
- /***************************************************************************/
- int result_length; //Eine Variable fuer alle Matrizen
-
- splice_align_size = (dna_len-1)*nr_paths;
- est_align_size = (est_len-1)*nr_paths;
-
- int mmatrix_size;
-
- if (currentMode == USE_QUALITY_SCORES) {
- mmatrix_param_size = mlen*nr_paths;
- mmatrix_size = mlen;
- }
-
- if (currentMode == NORMAL) {
- mmatrix_param_size = (mlen*mlen)*nr_paths;
- mmatrix_size = mlen*mlen;
- }
-
- alignmentscores_size = nr_paths; //alignment score for each path/matrix
- numPathsPlifs = numPlifs*nr_paths; //alignment score for each path/matrix
-
- splice_align = new int[splice_align_size];
- est_align = new int[est_align_size];
- mmatrix_param = new int[mmatrix_param_size];
- alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
-
-
- // printf("before memset...\n");
- memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
- memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
- memset((char*)mmatrix_param, 0, mmatrix_size*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
- memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
- //printf("after memset...\n");
-
- qualityScoresAllPaths= new penalty_struct*[nr_paths];
-
- for (int z=0; z<nr_paths; z++) {
- result_length = 0 ;
-
- int* s_align = splice_align + (dna_len-1)*z; //pointer
- int* e_align = est_align + (est_len-1)*z ; //pointer
- int* mparam = mmatrix_param + mmatrix_size*z; //pointer
-
- qualityScoresAllPaths[z] = new penalty_struct[numPlifs];
-
- int qidx, pidx;
- for(qidx=0;qidx<numPlifs;qidx++) {
- penalty_struct p;
- p.len = numQualSuppPoints;
- p.limits = (REAL*) calloc(p.len,sizeof(REAL));
-
- for(pidx=0;pidx<p.len;pidx++)
- p.limits[pidx] = qualityScores[qidx%numPlifs].limits[pidx];
-
- p.penalties = (REAL*) calloc(p.len,sizeof(REAL));
- qualityScoresAllPaths[z][qidx] = p;
- }
-
- //penalty_struct* qparam = qualityScoresAllPaths + (numPlifs*z);
-
- //printf("before call to result_align...\n");
- bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityScoresAllPaths[z] , currentMode);
- //printf("after call to result_align...\n");
-
- //printf("z is %d\n",z);
- //for(qidx=0;qidx<numPlifs;qidx++) {
- // penalty_struct p;
- // p = qualityScoresAllPaths[qidx+numPlifs*z];
- // printf("%d ",qidx);
- // for(pidx=0;pidx<p.len;pidx++)
- // printf("%f ",pidx,p.penalties[pidx]);
- // printf("\n");
- //}
-
- //if(DNA_ARRAY != 0) {
- // delete[] DNA_ARRAY;
- // delete[] EST_ARRAY;
- // }
-
- //DNA_ARRAY = new double[result_length];
- //EST_ARRAY = new double[result_length];
-
- // //backtracking
- // int i = max_score_positions[2*z] ; //i (est)
- // int j = max_score_positions[2*z +1] ; //j (dna)
- // int prev_i = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_i;
- // int prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j;
- // int prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no;
- //
- // for (int ii=result_length; ii>0; ii--) {
- // if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
- //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
- // }
- // else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
- //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- //EST_ARRAY[ii-1] = 0 ; //gap
- // }
- // else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
- //DNA_ARRAY[ii-1] = 0 ; //gap
- //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
- // }
- // else {// splice site
- //for (j; j > prev_j; j--) {
- // DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- // EST_ARRAY[ii-1] = 6 ; //intron
- // ii-- ;
- //}
- //ii++ ; // last ii-- too much (because done in for-loop)
- // }
- //
- // i = prev_i;
- // j = prev_j;
- // prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i;
- // prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j;
- // prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no;
- // }
- //
- } //end of z
-
- //if(DNA_ARRAY != 0) {
- // delete[] DNA_ARRAY;
- // delete[] EST_ARRAY;
- // }
-
- for (int i=nr_paths-1;i>=0; i--)
- delete[] matrices[i];
-}
-
-void Alignment::getAlignmentResults(int* s_align, int* e_align,
- int* mmatrix_p, double* alignscores, double* qScores) {
-
- int idx;
- for(idx=0; idx<splice_align_size; idx++)
- s_align[idx] = splice_align[idx];
-
- for(idx=0; idx<est_align_size; idx++)
- e_align[idx] = est_align[idx];
-
- for(idx=0; idx<mmatrix_param_size; idx++)
- mmatrix_p[idx] = mmatrix_param[idx];
-
- for(idx=0; idx<alignmentscores_size; idx++)
- alignscores[idx] = alignmentscores[idx];
-
- if (use_quality_scores) {
- penalty_struct currentPlif;
- int ctr=0;
- for (int z=0; z<nr_paths; z++) {
- for(int estChar=1;estChar<6;estChar++) {
- for(int dnaChar=0;dnaChar<6;dnaChar++) {
- int currentPos = (estChar-1)*6+dnaChar;
- currentPlif = qualityScoresAllPaths[z][currentPos];
- for(int pidx=0; pidx<currentPlif.len; pidx++) {
- qScores[ctr] = currentPlif.penalties[pidx];
- ctr++;
- }
- }}}
- }
-
- //for(idx=0; idx<numPathsPlifs; idx++) {
- // currentPlif = qualityScoresAllPaths[idx];
- // //printf("Size is %dx%d\n",numPathsPlifs,currentPlif.len);
- // for(pidx=0; pidx<currentPlif.len; pidx++) {
- // qScores[ctr] = currentPlif.penalties[pidx];
- // ctr++;
- // }
- //}
-
- //printf("\nctr is %d\n",ctr);
- //printf("Leaving getAlignmentResults...\n");
-}
+++ /dev/null
-#ifndef _QPALMA_DP_H_
-#define _QPALMA_DP_H_
-
-#include "penalty_info.h"
-
-struct ArrayElem { //24B
- double score;
- int prev_j;
- int prev_i;
- int prev_matrix_no;
- bool isSplice;
-} ;
-
-struct pre_score { //24B
- double value; //8
- int prev_i; //4
- int prev_j; //4
- int prev_matrix_no;//4
- bool issplice;//4
-};
-
-typedef struct pre_score Pre_score;
-
-typedef struct Align_pair { //24B
- double value ;
- bool issplice;
- int previ,prevj ;
- char est_char;
- char dna_char;
-};
-
-void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode);
-
-int check_char(char base);
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-
-bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode);
-
-extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix);
-
-
-/*
- * The function myalign calculates a new alignment given the scoring scheme
- * Its input arguments are:
- *
- * num_path(id) -> an integer specifying which alignment has to be done
- * dna ->
- * est ->
- * h ->
- * mmatrix ->
- * donor ->
- * acceptor ->
- * remove_duplicate_scores -> a boolean
- * print_matrix -> a boolean
- *
- * [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(...
- *
- * the idea of the qualityScores array is as follows
- *
- * consider a matrix of 24 plifs
- *
- * -> row major
- *
- */
-
-class Alignment {
-
- private:
- int* splice_align;
- int* est_align;
- int* mmatrix_param;
- double* alignmentscores;
- struct penalty_struct** qualityScoresAllPaths;
-
- int dna_len;
- int est_len;
- int mlen;
- int nr_paths;
-
- int numQualSuppPoints;
- int numPlifs;
- bool use_quality_scores;
-
- uint splice_align_size ;
- uint est_align_size ;
- uint mmatrix_param_size ;
- uint alignmentscores_size ;
- uint numPathsPlifs ;
-
- INT len;
- REAL *limits;
- REAL *penalties;
- INT max_len;
- INT min_len;
- REAL *cache;
- enum ETransformType transform ;
- INT id;
- char * name;
- INT use_svm;
-
- public:
- Alignment(int numQPlifs ,int numq, bool use_qscores);
- ~Alignment() {
- if(splice_align != 0)
- delete[] splice_align;
-
- if(est_align != 0)
- delete[] est_align;
-
- if(mmatrix_param != 0)
- delete[] mmatrix_param;
-
- if(alignmentscores != 0)
- delete[] alignmentscores;
-
- if(qualityScoresAllPaths != 0)
- delete[] qualityScoresAllPaths;
- }
-
- void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
- int est_len_p, double* prb, double* chastity, struct penalty_struct h,
- double* matchmatrix, int mm_len, double* donor, int d_len, double* acceptor,
- int a_len, struct penalty_struct* qualityScores, bool remove_duplicate_scores,
- bool print_matrix);
-
- void getDNAEST();
- void getAlignmentResults(int* s_align, int* e_align,
- int* mmatrix_p, double* alignscores, double* qScores);
-};
-
-#endif // _QPALMA_DP_H_
-
+++ /dev/null
-#include "assert.h"
-#include "fill_matrix.h"
-#include "debug_tools.h"
-
-using namespace std;
-
-void increaseFeatureCount(penalty_struct* qparam, int dnaChar, int estChar, double estprb) {
-
- FA(estChar > 0 && estChar < 6);
- FA(dnaChar > -1 && dnaChar < 6);
-
- int currentPos = (estChar-1)*6+dnaChar;
-
- penalty_struct currentStruct = qparam[currentPos];
-
- //printf("Current index %d est/dna: %d %d score %f\n",currentPos,estChar,dnaChar,estprb);
-
- //printf("before\n");
- //int p_idx;
- //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
- // printf("%f ",currentStruct.limits[p_idx]);
- //}
- //printf("\n");
-
- //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
- // printf("%f ",currentStruct.penalties[p_idx]);
- //}
- //printf("\n");
-
- double value = estprb;
- int Lower = 0;
- int idx;
-
- for (idx=0;idx<currentStruct.len;idx++) {
- if (currentStruct.limits[idx] <= value)
- Lower++;
- }
-
- if (Lower == 0) {
- currentStruct.penalties[0] += 1;
- qparam[currentPos] = currentStruct;
- return;
- }
-
- if (Lower == currentStruct.len) {
- currentStruct.penalties[currentStruct.len-1] += 1;
- qparam[currentPos] = currentStruct;
- return;
- }
-
- Lower -= 1;
- int Upper = Lower+1; // x-werte bleiben fest
-
- double weightup = 1.0*(value - currentStruct.limits[Lower]) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
- double weightlow = 1.0*(currentStruct.limits[Upper] - value) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
- currentStruct.penalties[Upper] += weightup;
- currentStruct.penalties[Lower] += weightlow;
-
- //printf("estprb/Lower/Upper %f %d %d\n",estprb,Lower,Upper);
- //printf("after\n");
- //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
- // printf("%f ",currentStruct.limits[p_idx]);
- //}
- //printf("\n");
- //for(p_idx=0;p_idx<currentStruct.len;p_idx++)
- // printf("%f ",currentStruct.penalties[p_idx]);
- //printf("\n");
- qparam[currentPos] = currentStruct;
-}
-
-bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode)
-//Gibt 1 zurueck, wenn kein weiterer Pfad existiert, sonst 0
-
-{
- const int mlen=6; // length of matchmatrix
- int startend[4] ; // [i_start, j_start, i_end, j_end] ;
- startend[3] = max_score_positions[2*z] ; //i (est)
- startend[4] = max_score_positions[2*z +1] ; //j (dna)
- //printf("in result_align: z=%d:(%d, %d)\n", z, startend[3], startend[4]) ;
- int dnanum ; //using check_char: letter -> number
- int estnum ; //using check_char: letter -> number
-
- double prbnum ;
- double chastitynum ;
-
- int dna_pos ;
- int est_pos ;
- int splice_state ;
- int est_state ;
- for (dna_pos=dna_len-1; dna_pos>startend[4]; dna_pos--) {
- s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
- }
- for (est_pos=est_len-1; est_pos>startend[3]; est_pos--) {
- e_align[est_pos-1] = 4 ;
- }
- //splice_align: 0: exon, 1: donor, 2: acceptor, 3: intro,. 4: dangling end
-
- /* starting point for backtracking */
- int i = startend[3] ;
- int j = startend[4] ;
- int act_m_no = z ;
- alignmentscores[z] = ((Pre_score*)matrices[z] + i*dna_len +j)->value;
- //printf("alignment_score for z=%d: %3.8f\n", z, alignmentscores[z]) ;
-
- int prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_i;
- int prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_j;
- int prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no;
-
- //printf("Current z is %d\n",z);
-
- if(!(isfinite(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)))
- {
- cout << "No additional path left (inf)!\n";
- return 1;
- }
-
- if(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value==0) {
- cout << "No additional path left (0)!\n";
- return 1;
- }
-
- assert(dna_pos == j) ;
- assert(est_pos == i) ;
-
- splice_state = 0 ; //exon
- est_state = 1 ; //[112222111 ... for exons of length 2,4,3 ...
-
- //compute length of alignment (between max(m,n) and m+n)
- //local_alignment: backtracking to first Zero: look at actual value
- while(!(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)==0.0) {
-
- //just some info
- //if ((z>0) && (!(z==prev_m_no))) {
- // printf("(%d,%d): next step in next matrix\n", i,j) ;
- //}
-
- if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
- (*result_length_ptr)= (*result_length_ptr) + 1;
-
- s_align[dna_pos-1] = splice_state;
- dna_pos-- ;
- e_align[est_pos-1] = est_state ; //1 or 2, depended
- est_pos-- ;
-
- dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0
- estnum = check_char(est[i-1]) ; //-1 because index starts with 0
-
- prbnum = prb[i-1];
- chastitynum = chastity[i-1];
-
- //printf("Position %d,%d est,dna: %d,%d\n",i,j,estnum,dnanum);
-
- if(currentMode == USE_QUALITY_SCORES)
- increaseFeatureCount(qparam,dnanum,estnum,prbnum);
- else
- mparam[mlen*dnanum+estnum] ++ ;
-
- }
- else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST
- (*result_length_ptr)= (*result_length_ptr) + 1;
-
- s_align[dna_pos-1] = splice_state;
- dna_pos-- ;
-
- dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0
- estnum = 0 ; //gap
-
- if(currentMode == USE_QUALITY_SCORES)
- mparam[dnanum] ++ ;
- else
- mparam[mlen*dnanum +estnum] ++ ;
-
- }
- else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA
- (*result_length_ptr)= (*result_length_ptr) + 1;
-
- e_align[est_pos-1] = est_state ; //1 or 2, depended
- est_pos-- ;
-
- dnanum = 0 ; //gap
- estnum = check_char(est[i-1]) ; //-1 because index starts with 0
-
- prbnum = prb[i-1];
- chastitynum = chastity[i-1];
-
- //printf("Position %d,%d est,dna: %d,%d\n",i,j,estnum,dnanum);
-
- if(currentMode == USE_QUALITY_SCORES)
- increaseFeatureCount(qparam,dnanum,estnum,prbnum);
- else
- mparam[mlen*dnanum +estnum] ++ ;
- }
- else {// splice site
- (*result_length_ptr) = (*result_length_ptr) + (j - prev_j);
- if (((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice==0) {
- printf("matrices[act_m_no](i,j): %3.15f\n",((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value) ;
- printf("matrices[prev_m_no](i,j): %3.15f\n",((Pre_score*)matrices[prev_m_no] + i*dna_len +j)->value) ;
- printf("issplice(act): %d\n",((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice) ;
- printf("issplice(prev): %d\n",((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->issplice) ;
- printf("act_m_no: %d, prev_m_no: %d\n", act_m_no, prev_m_no) ;
- printf("j: %d -> prev_j:%d\n", j, prev_j) ;
- printf("i: %d -> prev_i:%d\n", i, prev_i) ;
- }
- if (est_state==1) { //was exon labeled "1"
- est_state = 2 ; //new exon is labeled "2"
- }
- else {
- est_state = 1 ; //last exon was labeled "2", new exon is labeled "1"
- }
- for (j; j > prev_j; j--) {
- if (splice_state == 0) { //coming from exon
- splice_state = 2 ; //first intron_state: acceptor
- } else {
- splice_state = 3 ; //intron
- }
- if (j-1 == prev_j) { //last intron_state: donor
- splice_state = 1 ;//donor
- }
- s_align[dna_pos-1] = splice_state;
- dna_pos-- ;
- }
- splice_state = 0 ; //exon again
-
- }
-
- startend[1] = i;
- startend[2] = j;
- i = prev_i;
- j = prev_j;
- act_m_no = prev_m_no ;
-
- prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_i;
- prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_j;
- prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no;
- }
- for (dna_pos; dna_pos>0; dna_pos--) {
- s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
- }
- for (est_pos; est_pos>0; est_pos--) {
- e_align[est_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
- }
-
- //printf("(%d, %d) --> (%d, %d)\n", startend[1], startend[2], startend[3], startend[4]) ;
- //printf("result_length: %d\n", *result_length_ptr) ;
- return 0;
-}
--- /dev/null
+SRCS= Mathmatics.cpp\
+ fill_matrix.cpp\
+ qpalma_dp.cpp\
+ result_align.cpp\
+ penalty_info.cpp\
+ print_align.cpp\
+ io.cpp
+
+HDRS= Mathmatics.h\
+ common.h\
+ config.h\
+ debug_tools.h\
+ fill_matrix.h\
+ io.h\
+ penalty_info.h\
+ qpalma_dp.h
+
+OBJS = $(SRCS:%.cpp=%.o)
+
+#CXXFLAGS=-O3 -fPIC
+#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
+CXXFLAGS=-O3 -fPIC -ggdb
+
+IF=QPalmaDP
+
+all: $(OBJS) $(HDRS)
+ swig -c++ -python ${IF}.i
+ g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o
+ g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so
+ python -c "import ${IF}"
+
+clean:
+ rm *.o *.so *.cxx ${IF}.py ${IF}.pyc
--- /dev/null
+// Math.cpp: implementation of the CMath class.
+//
+//////////////////////////////////////////////////////////////////////
+
+
+#include "Mathmatics.h"
+#include "io.h"
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#include <assert.h>
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+#ifdef USE_LOGCACHE
+//gene/math specific constants
+#ifdef USE_HMMDEBUG
+#define MAX_LOG_TABLE_SIZE 10*1024*1024
+#define LOG_TABLE_PRECISION 1e-6
+#else
+#define MAX_LOG_TABLE_SIZE 123*1024*1024
+#define LOG_TABLE_PRECISION 1e-15
+#endif
+
+INT CMath::LOGACCURACY = 0; // 100000 steps per integer
+#endif
+
+INT CMath::LOGRANGE = 0; // range for logtable: log(1+exp(x)) -25 <= x <= 0
+
+#ifdef USE_PATHDEBUG
+const REAL CMath::INFTY = 1e11; // infinity
+#else
+const REAL CMath::INFTY = -log(0.0); // infinity
+#endif
+const REAL CMath::ALMOST_NEG_INFTY = -1000;
+
+CHAR CMath::rand_state[256];
+
+CMath::CMath()
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ UINT seed=(UINT) (4223517*getpid()*tv.tv_sec*tv.tv_usec);
+ initstate(seed, CMath::rand_state, sizeof(CMath::rand_state));
+ CIO::message(M_INFO, "seeding random number generator with %u\n", seed);
+
+#ifdef USE_LOGCACHE
+ LOGRANGE=CMath::determine_logrange();
+ LOGACCURACY=CMath::determine_logaccuracy(LOGRANGE);
+ CIO::message(M_INFO, "Initializing log-table (size=%i*%i*%i=%2.1fMB) ...",LOGRANGE,LOGACCURACY,sizeof(REAL),LOGRANGE*LOGACCURACY*sizeof(REAL)/(1024.0*1024.0)) ;
+
+ CMath::logtable=new REAL[LOGRANGE*LOGACCURACY];
+ init_log_table();
+ CIO::message(M_INFO, "Done.\n") ;
+#else
+ INT i=0;
+ while ((REAL)log(1+((REAL)exp(-REAL(i)))))
+ i++;
+ CIO::message(M_INFO, "determined range for x in log(1+exp(-x)) is:%d\n", i);
+ LOGRANGE=i;
+#endif
+}
+
+CMath::~CMath()
+{
+#ifdef USE_LOGCACHE
+ delete[] logtable;
+#endif
+}
+
+#ifdef USE_LOGCACHE
+INT CMath::determine_logrange()
+{
+ INT i;
+ REAL acc=0;
+ for (i=0; i<50; i++)
+ {
+ acc=((REAL)log(1+((REAL)exp(-REAL(i)))));
+ if (acc<=(REAL)LOG_TABLE_PRECISION)
+ break;
+ }
+
+ CIO::message(M_INFO, "determined range for x in table log(1+exp(-x)) is:%d (error:%G)\n",i,acc);
+ return i;
+}
+
+INT CMath::determine_logaccuracy(INT range)
+{
+ range=MAX_LOG_TABLE_SIZE/range/((int)sizeof(REAL));
+ CIO::message(M_INFO, "determined accuracy for x in table log(1+exp(-x)) is:%d (error:%G)\n",range,1.0/(double) range);
+ return range;
+}
+
+//init log table of form log(1+exp(x))
+void CMath::init_log_table()
+{
+ for (INT i=0; i< LOGACCURACY*LOGRANGE; i++)
+ logtable[i]=log(1+exp(REAL(-i)/REAL(LOGACCURACY)));
+}
+#endif
+
+void CMath::sort(INT *a, INT cols, INT sort_col)
+{
+ INT changed=1;
+ if (a[0]==-1) return ;
+ while (changed)
+ {
+ changed=0; INT i=0 ;
+ while ((a[(i+1)*cols]!=-1) && (a[(i+1)*cols+1]!=-1)) // to be sure
+ {
+ if (a[i*cols+sort_col]>a[(i+1)*cols+sort_col])
+ {
+ for (INT j=0; j<cols; j++)
+ CMath::swap(a[i*cols+j],a[(i+1)*cols+j]) ;
+ changed=1 ;
+ } ;
+ i++ ;
+ } ;
+ } ;
+}
+
+void CMath::sort(REAL *a, INT* idx, INT N)
+{
+
+ INT changed=1;
+ while (changed)
+ {
+ changed=0;
+ for (INT i=0; i<N-1; i++)
+ {
+ if (a[i]>a[i+1])
+ {
+ swap(a[i],a[i+1]) ;
+ swap(idx[i],idx[i+1]) ;
+ changed=1 ;
+ } ;
+ } ;
+ } ;
+
+}
+
+
+
+//plot x- axis false positives (fp) 1-Specificity
+//plot y- axis true positives (tp) Sensitivity
+INT CMath::calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile)
+{
+ INT left=0;
+ INT right=size-1;
+ INT i;
+
+ for (i=0; i<size; i++)
+ {
+ if (!(label[i]== -1 || label[i]==1))
+ return -1;
+ }
+
+ //sort data such that -1 labels first +1 behind
+ while (left<right)
+ {
+ while ((label[left] < 0) && (left<right))
+ left++;
+ while ((label[right] > 0) && (left<right)) //warning: label must be either -1 or +1
+ right--;
+
+ swap(output[left],output[right]);
+ swap(label[left], label[right]);
+ }
+
+ // neg/pos sizes
+ negsize=left;
+ possize=size-left;
+ REAL* negout=output;
+ REAL* posout=&output[left];
+
+ // sort the pos and neg outputs separately
+ qsort(negout, negsize);
+ qsort(posout, possize);
+
+ // set minimum+maximum values for decision-treshhold
+ REAL minimum=min(negout[0], posout[0]);
+ REAL maximum=minimum;
+ if (negsize>0)
+ maximum=max(maximum,negout[negsize-1]);
+ if (possize>0)
+ maximum=max(maximum,posout[possize-1]);
+
+ REAL treshhold=minimum;
+ REAL old_treshhold=treshhold;
+
+ //clear array.
+ for (i=0; i<size; i++)
+ {
+ fp[i]=1.0;
+ tp[i]=1.0;
+ }
+
+ //start with fp=1.0 tp=1.0 which is posidx=0, negidx=0
+ //everything right of {pos,neg}idx is considered to beLONG to +1
+ INT posidx=0;
+ INT negidx=0;
+ INT iteration=1;
+ INT returnidx=-1;
+
+ REAL minerr=10;
+
+ while (iteration < size && treshhold<=maximum)
+ {
+ old_treshhold=treshhold;
+
+ while (treshhold==old_treshhold && treshhold<=maximum)
+ {
+ if (posidx<possize && negidx<negsize)
+ {
+ if (posout[posidx]<negout[negidx])
+ {
+ if (posout[posidx]==treshhold)
+ posidx++;
+ else
+ treshhold=posout[posidx];
+ }
+ else
+ {
+ if (negout[negidx]==treshhold)
+ negidx++;
+ else
+ treshhold=negout[negidx];
+ }
+ }
+ else
+ {
+ if (posidx>=possize && negidx<negsize-1)
+ {
+ negidx++;
+ treshhold=negout[negidx];
+ }
+ else if (negidx>=negsize && posidx<possize-1)
+ {
+ posidx++;
+ treshhold=posout[posidx];
+ }
+ else if (negidx<negsize && treshhold!=negout[negidx])
+ treshhold=negout[negidx];
+ else if (posidx<possize && treshhold!=posout[posidx])
+ treshhold=posout[posidx];
+ else
+ {
+ treshhold=2*(maximum+100); // force termination
+ posidx=possize;
+ negidx=negsize;
+ break;
+ }
+ }
+ }
+
+ //calc tp,fp
+ tp[iteration]=(possize-posidx)/(REAL (possize));
+ fp[iteration]=(negsize-negidx)/(REAL (negsize));
+
+ //choose poINT with minimal error
+ if (minerr > negsize*fp[iteration]/size+(1-tp[iteration])*possize/size )
+ {
+ minerr=negsize*fp[iteration]/size+(1-tp[iteration])*possize/size;
+ tresh=(old_treshhold+treshhold)/2;
+ returnidx=iteration;
+ }
+
+ iteration++;
+ }
+
+ // set new size
+ size=iteration;
+
+ if (rocfile)
+ {
+ const CHAR id[]="ROC";
+ fwrite(id, sizeof(char), sizeof(id), rocfile);
+ fwrite(fp, sizeof(REAL), size, rocfile);
+ fwrite(tp, sizeof(REAL), size, rocfile);
+ }
+
+ return returnidx;
+}
+
+UINT CMath::crc32(BYTE *data, INT len)
+{
+ UINT result;
+ INT i,j;
+ BYTE octet;
+
+ result = 0-1;
+
+ for (i=0; i<len; i++)
+ {
+ octet = *(data++);
+ for (j=0; j<8; j++)
+ {
+ if ((octet >> 7) ^ (result >> 31))
+ {
+ result = (result << 1) ^ 0x04c11db7;
+ }
+ else
+ {
+ result = (result << 1);
+ }
+ octet <<= 1;
+ }
+ }
+
+ return ~result;
+}
+
+double CMath::mutual_info(REAL* p1, REAL* p2, INT len)
+{
+ double e=0;
+
+ for (INT i=0; i<len; i++)
+ for (INT j=0; j<len; j++)
+ e+=exp(p2[j*len+i])*(p2[j*len+i]-p1[i]-p1[j]);
+
+ return e;
+}
+
+double CMath::relative_entropy(REAL* p, REAL* q, INT len)
+{
+ double e=0;
+
+ for (INT i=0; i<len; i++)
+ e+=exp(p[i])*(p[i]-q[i]);
+
+ return e;
+}
+
+double CMath::entropy(REAL* p, INT len)
+{
+ double e=0;
+
+ for (INT i=0; i<len; i++)
+ e-=exp(p[i])*p[i];
+
+ return e;
+}
--- /dev/null
+#ifndef __MATHMATICS_H_
+#define __MATHMATICS_H_
+
+#include "common.h"
+#include "io.h"
+
+#include <math.h>
+#include <stdio.h>
+
+//define finite for win32
+#ifdef _WIN32
+#include <float.h>
+#define finite _finite
+#define isnan _isnan
+#endif
+
+#ifndef NAN
+#include <stdlib.h>
+#define NAN (strtod("NAN",NULL))
+#endif
+
+#ifdef SUNOS
+extern "C" int finite(double);
+#endif
+
+/** Mathematical Functions.
+ * Class which collects generic mathematical functions
+ */
+class CMath
+{
+public:
+ /**@name Constructor/Destructor.
+ */
+ //@{
+ ///Constructor - initializes log-table
+ CMath();
+
+ ///Destructor - frees logtable
+ virtual ~CMath();
+ //@}
+
+ /**@name min/max/abs functions.
+ */
+ //@{
+
+ ///return the minimum of two integers
+ //
+ template <class T>
+ static inline T min(T a, T b)
+ {
+ return ((a)<=(b))?(a):(b);
+ }
+
+ ///return the maximum of two integers
+ template <class T>
+ static inline T max(T a, T b)
+ {
+ return ((a)>=(b))?(a):(b);
+ }
+
+ ///return the maximum of two integers
+ template <class T>
+ static inline T abs(T a)
+ {
+ return ((a)>=(0))?(a):(-a);
+ }
+ //@}
+
+ /**@name misc functions */
+ //@{
+
+ /// crc32
+ static UINT crc32(BYTE *data, INT len);
+
+ static inline REAL round(REAL d)
+ {
+ return floor(d+0.5);
+ }
+
+ /// signum of type T variable a
+ template <class T>
+ static inline REAL sign(REAL a)
+ {
+ if (a==0)
+ return 0;
+ else return (a<0) ? (-1) : (+1);
+ }
+
+ /// swap floats a and b
+ template <class T>
+ static inline void swap(T & a,T &b)
+ {
+ T c=a ;
+ a=b; b=c ;
+ }
+
+ /// x^2
+ template <class T>
+ static inline T sq(T x)
+ {
+ return x*x;
+ }
+
+ static inline LONG factorial(INT n)
+ {
+ LONG res=1 ;
+ for (int i=2; i<=n; i++)
+ res*=i ;
+ return res ;
+ }
+
+ static inline LONG nchoosek(INT n, INT k)
+ {
+ long res=1 ;
+
+ for (INT i=n-k+1; i<=n; i++)
+ res*=i ;
+
+ return res/factorial(k) ;
+ }
+
+ /** performs a bubblesort on a given matrix a.
+ * it is sorted from in ascending order from top to bottom
+ * and left to right */
+ static void sort(INT *a, INT cols, INT sort_col=0) ;
+ static void sort(REAL *a, INT*idx, INT N) ;
+
+ /** performs a quicksort on an array output of length size
+ * it is sorted from in ascending (for type T) */
+ template <class T>
+ static void qsort(T* output, INT size) ;
+
+ /** performs a quicksort on an array output of length size
+ * it is sorted from in ascending order
+ * (for type T1) and returns the index (type T2)
+ * matlab alike [sorted,index]=sort(output)
+ */
+ template <class T1,class T2>
+ static void qsort(T1* output, T2* index, INT size);
+
+ /** performs a quicksort on an array output of length size
+ * it is sorted from in ascending order
+ * (for type T1) and returns the index (type T2)
+ * matlab alike [sorted,index]=sort(output)
+ */
+ template <class T1,class T2>
+ static void qsort_backward(T1* output, T2* index, INT size);
+
+ /* finds the smallest element in output and puts that element as the
+ first element */
+ template <class T>
+ static void min(REAL* output, T* index, INT size);
+
+ /* finds the n smallest elements in output and puts these elements as the
+ first n elements */
+ template <class T>
+ static void nmin(REAL* output, T* index, INT size, INT n);
+
+ /* performs a inplace unique of a vector of type T using quicksort
+ * returns the new number of elements */
+ template <class T>
+ static INT unique(T* output, INT size)
+ {
+ qsort(output, size) ;
+ INT i,j=0 ;
+ for (i=0; i<size; i++)
+ if (i==0 || output[i]!=output[i-1])
+ output[j++]=output[i] ;
+ return j ;
+ }
+
+ /* finds an element in a sorted array via binary search
+ * returns -1 if not found */
+ static inline INT fast_find(WORD* output, INT size, WORD elem)
+ {
+ INT start=0, end=size-1, middle=size/2 ;
+
+ if (output[start]>elem || output[end]<elem)
+ return -1 ;
+
+ while (1)
+ {
+ if (output[middle]>elem)
+ {
+ end = middle ;
+ middle=start+(end-start)/2 ;
+ } ;
+ if (output[middle]<elem)
+ {
+ start = middle ;
+ middle=start+(end-start)/2 ;
+ }
+ if (output[middle]==elem)
+ return middle ;
+ if (end-start<=1)
+ {
+ if (output[start]==elem)
+ return start ;
+ if (output[end]==elem)
+ return end ;
+ return -1 ;
+ }
+ }
+ }
+
+ /* finds an element in a sorted array via binary search
+ * that is smaller-equal elem und the next element is larger-equal */
+ static inline INT fast_find_range(REAL* output, INT size, REAL elem)
+ {
+ INT start=0, end=size-2, middle=(size-2)/2 ;
+
+ if (output[start]>elem)
+ return -1 ;
+ if (output[end]<=elem)
+ return size-1 ;
+
+ while (1)
+ {
+ if (output[middle]>elem)
+ {
+ end = middle ;
+ middle=start+(end-start)/2 ;
+ } ;
+ if (output[middle]<elem)
+ {
+ start = middle ;
+ middle=start+(end-start)/2 ;
+ }
+ if (output[middle]<=elem && output[middle+1]>=elem)
+ return middle ;
+ if (end-start<=1)
+ {
+ if (output[start]<=elem && output[start+1]>=elem)
+ return start ;
+ return end ;
+ }
+ }
+ }
+
+ /** calculates ROC into (fp,tp)
+ * from output and label of length size
+ * returns index with smallest error=fp+fn
+ */
+ static INT calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile);
+ //@}
+
+ /// returns the mutual information of p which is given in logspace
+ /// where p,q are given in logspace
+ static double mutual_info(REAL* p1, REAL* p2, INT len);
+
+ /// returns the relative entropy H(P||Q),
+ /// where p,q are given in logspace
+ static double relative_entropy(REAL* p, REAL* q, INT len);
+
+ /// returns entropy of p which is given in logspace
+ static double entropy(REAL* p, INT len);
+
+ /**@name summing functions */
+ //@{
+ /** sum logarithmic probabilities.
+ * Probability measures are summed up but are now given in logspace where
+ * direct summation of exp(operand) is not possible due to numerical problems, i.e. eg. exp(-1000)=0. Therefore
+ * we do log( exp(a) + exp(b)) = a + log (1 + exp (b-a)) where a is max(p,q) and b min(p,q). */
+#ifdef USE_LOGCACHE
+ static inline REAL logarithmic_sum(REAL p, REAL q)
+ {
+ if (finite(p))
+ {
+ if (finite(q))
+ {
+
+ register REAL diff=p-q;
+
+ if (diff>0) //p>q
+ {
+ if (diff > LOGRANGE)
+ return p;
+ else
+ return p + logtable[(int)(diff*LOGACCURACY)];
+ }
+ else //p<=q
+ {
+ if (-diff > LOGRANGE)
+ return q;
+ else
+ return q + logtable[(int)(-diff*LOGACCURACY)];
+ }
+ }
+ CIO::message("INVALID second operand to logsum(%f,%f) expect undefined results\n", p, q);
+ return NAN;
+ }
+ else
+ return q;
+ }
+
+ ///init log table of form log(1+exp(x))
+ static void init_log_table();
+
+ /// determine INT x for that log(1+exp(-x)) == 0
+ static INT determine_logrange();
+
+ /// determine accuracy, such that the thing fits into MAX_LOG_TABLE_SIZE, needs logrange as argument
+ static INT determine_logaccuracy(INT range);
+#else
+ /*
+ inline REAL logarithmic_sum(REAL p, REAL q)
+ {
+ double result=comp_logarithmic_sum(p,q);
+
+ printf("diff:%f <-> %f\n",p-q, result);
+ return result;
+ }*/
+
+ static inline REAL logarithmic_sum(REAL p, REAL q)
+ {
+ if (finite(p))
+ {
+ if (finite(q))
+ {
+
+ register REAL diff=p-q;
+ if (diff>0) //p>q
+ {
+ if (diff > LOGRANGE)
+ return p;
+ else
+ return p + log(1+exp(-diff));
+ }
+ else //p<=q
+ {
+ if (-diff > LOGRANGE)
+ return q;
+ else
+ return q + log(1+exp(diff));
+ }
+ }
+ return p;
+ }
+ else
+ return q;
+ }
+#endif
+#ifdef LOG_SUM_ARRAY
+ /** sum up a whole array of values in logspace.
+ * This function addresses the numeric instabiliy caused by simply summing up N elements by adding
+ * each of the elements to some variable. Instead array neighbours are summed up until one element remains.
+ * Whilst the number of additions remains the same, the error is only in the order of log(N) instead N.
+ */
+ static inline REAL logarithmic_sum_array(REAL *p, INT len)
+ {
+ if (len<=2)
+ {
+ if (len==2)
+ return logarithmic_sum(p[0],p[1]) ;
+ if (len==1)
+ return p[0];
+ return -INFTY ;
+ }
+ else
+ {
+ register REAL *pp=p ;
+ if (len%2==1) pp++ ;
+ for (register INT j=0; j < len>>1; j++)
+ pp[j]=logarithmic_sum(pp[j<<1], pp[1+(j<<1)]) ;
+ }
+ return logarithmic_sum_array(p,len%2+len>>1) ;
+ }
+#endif
+ //@}
+public:
+ /**@name constants*/
+ //@{
+ /// infinity
+ static const REAL INFTY;
+
+ /// almost neg (log) infinity
+ static const REAL ALMOST_NEG_INFTY;
+
+ /// range for logtable: log(1+exp(x)) -LOGRANGE <= x <= 0
+ static INT LOGRANGE;
+
+#ifdef USE_LOGCACHE
+ /// number of steps per integer
+ static INT LOGACCURACY;
+ //@}
+protected:
+ ///table with log-values
+ static REAL* logtable;
+#endif
+ static CHAR rand_state[256];
+};
+
+
+//implementations of template functions
+template <class T>
+void CMath::qsort(T* output, INT size)
+{
+ if (size==2)
+ {
+ if (output[0] > output [1])
+ swap(output[0],output[1]);
+ }
+ else
+ {
+ REAL split=output[(size*rand())/(RAND_MAX+1)];
+ //REAL split=output[size/2];
+
+ INT left=0;
+ INT right=size-1;
+
+ while (left<=right)
+ {
+ while (output[left] < split)
+ left++;
+ while (output[right] > split)
+ right--;
+
+ if (left<=right)
+ {
+ swap(output[left],output[right]);
+ left++;
+ right--;
+ }
+ }
+
+ if (right+1> 1)
+ qsort(output,right+1);
+
+ if (size-left> 1)
+ qsort(&output[left],size-left);
+ }
+}
+
+template <class T1,class T2>
+void CMath::qsort(T1* output, T2* index, INT size)
+{
+ if (size==2)
+ {
+ if (output[0] > output [1]){
+ swap(output[0],output[1]);
+ swap(index[0],index[1]);
+ }
+
+ }
+ else
+ {
+ REAL split=output[(size*rand())/(RAND_MAX+1)];
+ //REAL split=output[size/2];
+
+ INT left=0;
+ INT right=size-1;
+
+ while (left<=right)
+ {
+ while (output[left] < split)
+ left++;
+ while (output[right] > split)
+ right--;
+
+ if (left<=right)
+ {
+ swap(output[left],output[right]);
+ swap(index[left],index[right]);
+ left++;
+ right--;
+ }
+ }
+
+ if (right+1> 1)
+ qsort(output,index,right+1);
+
+ if (size-left> 1)
+ qsort(&output[left],&index[left], size-left);
+ }
+}
+
+template <class T1,class T2>
+void CMath::qsort_backward(T1* output, T2* index, INT size)
+{
+ if (size==2)
+ {
+ if (output[0] < output [1]){
+ swap(output[0],output[1]);
+ swap(index[0],index[1]);
+ }
+
+ }
+ else
+ {
+ REAL split=output[(size*rand())/(RAND_MAX+1)];
+ //REAL split=output[size/2];
+
+ INT left=0;
+ INT right=size-1;
+
+ while (left<=right)
+ {
+ while (output[left] > split)
+ left++;
+ while (output[right] < split)
+ right--;
+
+ if (left<=right)
+ {
+ swap(output[left],output[right]);
+ swap(index[left],index[right]);
+ left++;
+ right--;
+ }
+ }
+
+ if (right+1> 1)
+ qsort(output,index,right+1);
+
+ if (size-left> 1)
+ qsort(&output[left],&index[left], size-left);
+ }
+}
+
+template <class T>
+void CMath::nmin(REAL* output, T* index, INT size, INT n)
+{
+ if (6*n*size<13*size*log((double)size))
+ for (INT i=0; i<n; i++)
+ min(&output[i], &index[i], size-i) ;
+ else
+ qsort(output, index, size) ;
+}
+
+template <class T>
+void CMath::min(REAL* output, T* index, INT size)
+{
+ if (size<=0)
+ return ;
+ REAL min_elem = output[0] ;
+ INT min_index = 0 ;
+ for (INT i=1; i<size; i++)
+ if (output[i]<min_elem)
+ {
+ min_index=i ;
+ min_elem=output[i] ;
+ }
+ swap(output[0], output[min_index]) ;
+ swap(index[0], index[min_index]) ;
+}
+#endif
--- /dev/null
+%module QPalmaDP
+%{
+#define SWIG_FILE_WITH_INIT
+#include "common.h"
+#include "Mathmatics.h"
+#include "penalty_info.h"
+#include "qpalma_dp.h"
+%}
+
+%include "std_vector.i"
+%include "std_string.i"
+
+%include "carrays.i"
+
+%include "common.h"
+%include "Mathmatics.h"
+%include "penalty_info.h"
+%include "qpalma_dp.h"
+
+%array_class(int, intArray) ;
+%array_class(double, doubleArray) ;
+%array_functions(double, doubleFArray) ;
+%array_functions(struct penalty_struct, penaltyArray) ;
+
+/*
+%array_functions(int, intArray) ;
+%array_functions(double, doubleArray) ;
+*/
+%array_class(Pre_score, Pre_scoreArray) ;
+
+%pythoncode
+%{
+
+def createPenaltyArrayFromList(list):
+ array = new_penaltyArray(len(list))
+ for i in range(len(list)):
+ penaltyArray_setitem(array, i, list[i])
+ return array
+
+def createDoubleArrayFromList(list):
+ #array = new_doubleArray(len(list))
+ array = doubleArray(len(list))
+ for i in range(len(list)):
+ # doubleArray_setitem(array, i, list[i])
+ array[i] = list[i]
+ return array
+
+def createIntArrayFromList(list):
+ #array = new_intArray(len(list))
+ array = intArray(len(list))
+ for i in range(len(list)):
+ #intArray_setitem(array, i, list[i])
+ array[i] = list[i]
+ return array
+
+def createListFromIntArray(array, array_len):
+ list = [0]*array_len
+ for i in range(array_len):
+ list[i] = intArray_getitem(array,i)
+ return list
+
+def createListFromDoubleArray(array, array_len):
+ list = [0]*array_len
+ for i in range(array_len):
+ list[i] = doubleArray_getitem(array,i)
+ return list
+
+%}
--- /dev/null
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "config.h"
+
+#ifdef SUNOS
+#define bool int
+#define false 0
+#define true 1
+#endif
+
+#ifndef LINUX
+#define RANDOM_MAX 2147483647
+#else
+#define RANDOM_MAX RAND_MAX
+#endif
+
+/**@name Standard Types
+ * Definition of Platform independent Types
+*/
+//@{
+
+/// Type CHAR
+typedef char CHAR;
+typedef CHAR* P_CHAR;
+
+/// Type BYTE
+typedef unsigned char BYTE;
+typedef BYTE* P_BYTE;
+
+/// Type SHORT is 2 bytes in size
+typedef short int SHORT;
+typedef SHORT* P_SHORT;
+
+/// Type WORD is 2 bytes in size
+typedef unsigned short int WORD;
+typedef WORD* P_WORD;
+
+/// Type INT is 4 bytes in size
+typedef int INT;
+typedef INT* P_INT;
+
+/// Type INT is 4 bytes in size
+typedef unsigned int UINT;
+typedef UINT* P_UINT;
+
+/// Type LONG is 8 bytes in size
+typedef long LONG;
+typedef LONG* P_LONG;
+
+/// Type SHORTREAL is 4 bytes in size
+typedef float SHORTREAL;
+typedef SHORTREAL* P_SHORTREAL;
+
+/// Type REAL is 8 bytes in size
+typedef double REAL;
+typedef REAL* P_REAL;
+
+/// Type LONGREAL is 16 bytes in size
+//typedef long double LONGREAL;
+//typedef LONGREAL* P_LONGREAL;
+
+#ifdef USE_SHORTREAL_KERNELCACHE
+ typedef SHORTREAL KERNELCACHE_ELEM;
+#else
+ typedef REAL KERNELCACHE_ELEM;
+#endif
+
+typedef KERNELCACHE_ELEM P_KERNELCACHE_ELEM;
+
+typedef LONG KERNELCACHE_IDX;
+
+/// The io libs output [DEBUG] etc in front of every CIO::message
+/// 'higher' messages filter output depending on the loglevel, i.e. CRITICAL messages
+/// will print all M_CRITICAL TO M_EMERGENCY messages to
+enum EMessageType
+{
+ M_DEBUG,
+ M_INFO,
+ M_NOTICE,
+ M_WARN,
+ M_ERROR,
+ M_CRITICAL,
+ M_ALERT,
+ M_EMERGENCY,
+ M_MESSAGEONLY,
+ M_PROGRESS
+};
+
+enum EKernelType
+{
+ K_UNKNOWN = 0,
+ K_OPTIMIZABLE = 4096,
+ K_LINEAR = 10 | K_OPTIMIZABLE,
+ K_POLY = 20,
+ K_GAUSSIAN = 30,
+ K_HISTOGRAM = 40,
+ K_SALZBERG = 41,
+ K_LOCALITYIMPROVED = 50,
+ K_SIMPLELOCALITYIMPROVED = 60,
+ K_FIXEDDEGREE = 70,
+ K_WEIGHTEDDEGREE = 80 | K_OPTIMIZABLE,
+ K_WEIGHTEDDEGREEPOS = 81 | K_OPTIMIZABLE,
+ K_WEIGHTEDDEGREEPOLYA = 82,
+ K_WD = 83,
+ K_COMMWORD = 90 | K_OPTIMIZABLE ,
+ K_POLYMATCH = 100,
+ K_ALIGNMENT = 110,
+ K_COMMWORDSTRING = 120 | K_OPTIMIZABLE,
+ K_SPARSENORMSQUARED = 130,
+ K_COMBINED = 140 | K_OPTIMIZABLE
+};
+
+enum EFeatureType
+{
+ F_UNKNOWN = 0,
+ F_REAL = 10,
+ F_SHORT = 20,
+ F_CHAR = 30,
+ F_INT = 40,
+ F_BYTE = 50,
+ F_WORD = 60
+};
+
+enum EFeatureClass
+{
+ C_UNKNOWN = 0,
+ C_SIMPLE = 10,
+ C_SPARSE = 20,
+ C_STRING = 30,
+ C_COMBINED = 40
+};
+
+/// Alphabet of charfeatures/observations
+enum E_ALPHABET
+{
+ /// DNA - letters A,C,G,T,*,N,n
+ DNA=0,
+
+ /// PROTEIN - letters a-z
+ PROTEIN=1,
+
+ /// ALPHANUM - [0-9a-z]
+ ALPHANUM=2,
+
+ /// CUBE - [1-6]
+ CUBE=3,
+
+ /// NONE - type has no alphabet
+ NONE=4
+};
+
+//@}
+
+#define TMP_DIR "/tmp/"
+//#define TMP_DIR "/short/x46/tmp/"
+
+#endif
--- /dev/null
+
+#define HAVE_MATLAB 1
+
+#define HAVE_LARGEFILE 1
+
+
+
+
+
+
+#define USE_BIGSTATES 1
+
+
+#define USE_HMMCACHE 1
+
+
+
+
+
--- /dev/null
+#ifndef __DEBUG_TOOLS__
+#define __DEBUG_TOOLS__
+
+#include <cstdio>
+
+void fassert(bool exp,int line, char* file) {
+ if (! exp) {
+ printf("invalid fassert at line %d in file %s!\n",line,file);
+ exit(EXIT_FAILURE);
+ }
+}
+
+#define FA(expr) (fassert(expr,__LINE__,__FILE__))
+
+#endif // __DEBUG_TOOLS__
--- /dev/null
+/*********************************************************************************************/
+// Version of fill_matrix with fills several alignment matrices (best, second_best ...)
+// nr_paths det. number of these matrices
+//
+// est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
+// dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
+//
+// !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
+// for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
+/*********************************************************************************************/
+
+/*
+ matchmatrix: columnwise
+ - A C G T N (dna)
+ - x x x x x x
+ A x x x x x x
+ C x x x x x x
+ G x x x x x x
+ T x x x x x x
+ N x x x x x x
+(est)
+*/
+
+/*
+ alignment matrix: columnwise
+ |-EST . . .
+ -+------------
+ -|00 ...
+ D|0.
+ N|. .
+ A|. .
+ .|.
+ .|
+ .|
+*/
+
+
+/* TODO
+ - max_len = 10000 for elegans!
+ - do not use double
+*/
+
+#include "fill_matrix.h"
+
+void fassert(bool exp,int line, char* file);
+#define FA(expr) (fassert(expr,__LINE__,__FILE__))
+
+using namespace std;
+
+int check_char(char base)
+{
+ switch(base)
+ {
+ case 'A':
+ case 'a': return 1;
+ case 'C':
+ case 'c': return 2;
+ case 'G':
+ case 'g': return 3;
+ case 'T':
+ case 't': return 4;
+ case 'N':
+ case 'n': return 5;
+ //default : return 0;
+ default : return -1;
+ }
+
+}
+
+ /* system of Palma. Otherwise scoring_functions should point to an array of
+ * plifs scoring the respective pairs of characters together with the EST
+ * quality score.
+ */
+
+double getScore(struct penalty_struct* qualityScores, int mlen, int dnaChar, int estChar, double baseScore) {
+ double score = 0.0;
+ FA(estChar > 0 && estChar < 6);
+ FA(dnaChar > -1 && dnaChar < 6);
+
+ //printf("Starting scoring / scheme is USE_QUALITY_SCORES\n");
+ //printf("estChar,dnaChar are: %d,%d,%d",estChar,dnaChar);
+
+ int currentPos = (estChar-1)*6+dnaChar;
+ struct penalty_struct currentPen = qualityScores[currentPos];
+
+ INT idx = 0 ;
+ for (INT i=0; i<currentPen.len; i++)
+ if (currentPen.limits[i] <= baseScore)
+ idx++ ;
+
+ if (idx==0)
+ score=currentPen.penalties[0] ;
+ else if (idx==currentPen.len)
+ score=currentPen.penalties[currentPen.len-1] ;
+ else
+ {
+ score = (currentPen.penalties[idx]*(baseScore-currentPen.limits[idx-1]) + currentPen.penalties[idx-1]*
+ (currentPen.limits[idx]-baseScore)) / (currentPen.limits[idx]-currentPen.limits[idx-1]) ;
+ }
+
+ //score = lookup_penalty(¤tPen,idx, NULL, 0);
+ //if(baseScore <= currentPen.limits[0]) {
+ // score = currentPen.penalties[0];
+ //} else {
+ // int sp_idx;
+ // for(sp_idx=0;sp_idx<currentPen.len;sp_idx++) {
+ // if(currentPen.limits[sp_idx] <= baseScore )
+ // score = currentPen.penalties[sp_idx];
+ // //printf("Current score is %f\n",score);
+ // }
+ //}
+
+ return score;
+}
+
+void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, double* prb, penalty_struct* functions , double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode)
+{
+ //printf("Entering fill_matrix...\n");
+
+ /*********************************************************************************************/
+ // variables
+ /*********************************************************************************************/
+ const int mlen = 6; // length of matchmatrix
+ int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
+
+ double baseScore;
+ double *est_scores = prb ;
+
+ double prevValue ;
+ double tempValue;
+ int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
+ //printf("max_len: %d\n", max_len) ;
+
+ int num_elem ; // counter variable
+
+ const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
+ double *scores=new double[max_num_elem] ; //are sorted later
+ struct ArrayElem *array=new ArrayElem[max_num_elem] ;
+ //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
+
+ int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
+ int splice_pos ; //splice position get from donor_sites
+ int start_splice_pos ; //where to start looking in donor_sites
+
+ double max_scores[nr_paths] ;
+
+ /*********************************************************************************************/
+ // Initialisation of all alignment matrices, columnwise
+ /*********************************************************************************************/
+ for (int z=0; z<nr_paths; z++) /* matrice z */
+ {
+ max_scores[z] = log(0.0) ;
+ max_score_positions[2*z] = 0; // pos i
+ max_score_positions[2*z +1] = 0; // pos j
+
+ for (int i=0; i<est_len; i++) /* i.column */
+ {
+ for (int j=0; j<dna_len; j++) /* j.line */
+ {
+ ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
+ ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
+ ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
+ ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
+ ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
+ }
+ }
+ }
+
+ /*********************************************************************************************/
+ // Element (0,0) for first/best matrice (z=0)
+ /*********************************************************************************************/
+ ((Pre_score*)matrices[0])->value = 0;
+ ((Pre_score*)matrices[0])->prev_i = 0;
+ ((Pre_score*)matrices[0])->prev_j = 0;
+ ((Pre_score*)matrices[0])->prev_matrix_no = 0;
+ ((Pre_score*)matrices[0])->issplice = 0;
+
+
+ /*********************************************************************************************/
+ // Line Zero for first/best matrice (z=0)
+ /*********************************************************************************************/
+ for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
+ {
+ ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
+ ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
+ ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
+ ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
+ ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
+ }
+
+
+ /*********************************************************************************************/
+ // Column Zero for first/best matrice (z=0)
+ /*********************************************************************************************/
+ for (int j=1; j<dna_len; j++) // 0.column, all lines
+ {
+ ((Pre_score*)matrices[0] +j)->value = 0 ;
+ ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
+ ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
+ ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
+ ((Pre_score*)matrices[0] +j)->issplice = 0;
+ }
+
+
+ /*********************************************************************************************/
+ // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
+ /*********************************************************************************************/
+
+
+ //printf("after initialization\n");
+
+ /*********************************************************************************************/
+ // all other entries for all matrices, last column and row treated seperately
+ /*********************************************************************************************/
+
+ /* columnwise, 0.column and 0.line already filled */
+ for (int i=1; i<est_len; i++) /* i.column */
+ {
+
+ start_splice_pos = 0 ; //where to start looking in donor_sites
+ for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
+
+ for (int j=1; j<dna_len; j++) //j.line
+ {
+ dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
+ estChar = check_char(est[i-1]) ;
+ baseScore = est_scores[i-1];
+
+ //printf("at iteration %d %d\n",i,j);
+
+ //if introns of length greater than max_len possible
+ if (j > max_len)
+ {
+ //storing highest ss_score (the position) and NOT -INF
+ int new_ss_pos = j-max_len ;
+ double donor_score = donor[new_ss_pos-1] ;
+ if (isfinite(donor_score)) //splice site!
+ {
+ if (max_ss_element[nr_paths-1] != 0)
+ {
+ for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
+ {
+ int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
+ double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
+ if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
+ for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
+ {
+ int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
+ assert(isfinite(donor[act_splice_position-1])) ;
+ double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
+ if (new_intron_score >= act_intron_score)
+ {
+ new_intron_score = act_intron_score ;
+ pos_with_minscore = pos ;
+ } //else nothing happens, new intron score still the worst one
+ }
+ if (pos_with_minscore != nr_paths)
+ {
+ max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
+ }
+ }
+ }
+ else
+ {
+ for (int pos=0; pos<nr_paths; pos++)
+ {
+ if (max_ss_element[pos]==0) //no previous splice site at this pos
+ {
+ for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
+ break ;
+ }
+ }
+ }
+ }
+ //start_pos in donor_sites so that only introns shorter then max_length are seen
+ if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
+ {
+ start_splice_pos++ ;
+ }
+ }
+
+
+ /* Computing all possible scores */
+ num_elem = 0 ; /* number of different scores/possibilities so far */
+ for (int z=0; z<nr_paths; z++) /* matrice z */
+ {
+ Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
+
+ /***************************************/
+ // 0. Zero (local alignment version)
+ /***************************************/
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = 0;
+ array[num_elem].prev_j = 0;
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = false; /* no splice site */
+ scores[num_elem] = 0 ;
+ num_elem++ ;
+
+
+ /***************************************/
+ // 1. gap on EST (j-1,i) -> (j,i)
+ /***************************************/
+ prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
+ if (isfinite(prevValue))
+ {
+
+ if( currentMode == USE_QUALITY_SCORES ) {
+ tempValue = prevValue + matchmatrix[dnaChar];
+ } else {
+ tempValue = prevValue + matchmatrix[mlen*dnaChar]; /* score(gap,DNA) */
+ }
+
+
+ if (isfinite(tempValue))
+ {
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = i;
+ array[num_elem].prev_j = j-1; /* predecessor */
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = false; /* no splice site */
+ scores[num_elem] = -tempValue ;
+ num_elem++ ;
+ }
+ }
+
+
+ /***************************************/
+ // 2. match/mismatch (j-1,i-1) -> (j,i)
+ /***************************************/
+ prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
+ if (isfinite(prevValue))
+ {
+ if( currentMode == USE_QUALITY_SCORES ) {
+ tempValue = prevValue + getScore(qualityScores,mlen,dnaChar,estChar,baseScore);
+ } else {
+ tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);
+ }
+
+ if (isfinite(tempValue))
+ {
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = i-1; /* predecessor */
+ array[num_elem].prev_j = j-1; /* predecessor */
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = false; /* no splice site */
+ scores[num_elem] = -tempValue ;
+ num_elem++ ;
+ }
+ }
+
+
+ /***************************************/
+ // 3. gap on DNA (j,i-1) -> (j,i)
+ /***************************************/
+ prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
+ if (isfinite(prevValue))
+ {
+
+ if( currentMode == USE_QUALITY_SCORES ) {
+ tempValue = prevValue + getScore(qualityScores,mlen,0,estChar,baseScore);
+ } else {
+ tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
+ }
+
+ if (isfinite(tempValue))
+ {
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = i-1; /* predecessor */
+ array[num_elem].prev_j = j;
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = false; /* no splice site */
+ scores[num_elem]=-tempValue ;
+ num_elem++ ;
+ }
+ }
+
+ /***************************************/
+ // 4. all possible splice sites/introns
+ /***************************************/
+ // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
+ // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
+ // donor_sites: the g's of the gt/gc
+
+ if (isfinite(acceptor[j-1]))
+ {
+
+ //the nr_paths many introns of length greater than max_len
+ for (int pos=0; pos<nr_paths; pos++)
+ {
+ if (max_ss_element[z*nr_paths+pos]>0)
+ {
+ assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
+ splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
+ prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
+ if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
+ {
+ int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
+ assert(dist>max_len) ;
+ tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
+ if (isfinite(tempValue))
+ {
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = i;
+ array[num_elem].prev_j = splice_pos-1; // predecessor
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = true; // splice site
+ scores[num_elem]=-tempValue ;
+ num_elem++ ;
+ }
+ else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
+ {
+ printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
+ sleep(5000) ;
+ }
+ }
+ }
+ }
+
+ //all introns of length <= max_len
+ for (int k=start_splice_pos; k < nr_donor_sites; k++)
+ //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
+ {
+ splice_pos = donor_sites[k] ;
+ if (splice_pos > j-4) { break ;}
+ prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
+
+ if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
+ {
+ int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
+ assert(dist<=max_len) ;
+ tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
+ if (isfinite(tempValue))
+ {
+ assert(num_elem<max_num_elem) ;
+ array[num_elem].prev_i = i;
+ array[num_elem].prev_j = splice_pos-1; /* predecessor */
+ array[num_elem].prev_matrix_no = z;
+ array[num_elem].isSplice = true; /* splice site */
+ scores[num_elem]=-tempValue ;
+ num_elem++ ;
+ }
+ else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
+ {
+ printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
+ sleep(5000) ;
+ }
+ }
+ }
+ }
+
+ } //end of z
+
+ CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
+
+ /* the nr_paths many highest scores are written in the nr_paths many matrices*/
+ for (int z=0; z<nr_paths; z++) {
+ if (z>=num_elem) {
+ //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
+ }
+ else {
+ ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
+ ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
+ ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
+ ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
+ ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
+
+ if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
+ max_scores[z] = -scores[z] ;
+ max_score_positions[2*z] = i; // pos i
+ max_score_positions[2*z +1] = j; // pos j
+ }
+ }
+ } //end of z
+
+ } //end of j
+ } // end of i
+
+ //free memory
+ delete[] array;
+ delete[] scores;
+
+ //printf("Leaving fill_matrix...\n");
+ return;
+}
--- /dev/null
+#ifndef _FILLMATRIX_H__
+#define _FILLMATRIX_H__
+
+#include "Mathmatics.h"
+#include "penalty_info.h"
+#include <algorithm> //Eigentlich in Header
+#include <assert.h>
+#include <ctype.h>
+#include <iostream>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+struct ArrayElem { //24B
+ double score;
+ int prev_j;
+ int prev_i;
+ int prev_matrix_no;
+ bool isSplice;
+} ;
+
+struct pre_score { //24B
+ double value; //8
+ int prev_i; //4
+ int prev_j; //4
+ int prev_matrix_no;//4
+ bool issplice;//4
+};
+
+typedef struct pre_score Pre_score;
+
+typedef struct Align_pair { //24B
+ double value ;
+ bool issplice;
+ int previ,prevj ;
+ char est_char;
+ char dna_char;
+};
+
+
+void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions);
+// funktion aendern in void result_align(Pre_score* matrices[], int anzpath,
+
+int check_char(char base);
+
+bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions );
+
+extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix);
+
+double getScore(double *matchmatrix,int mlen, int dnaChar, int estChar);
+
+
+double scoreMatch(int dnaChar, int estChar, double baseScore);
+
+
+#endif // _FILLMATRIX_H__
--- /dev/null
+#include "io.h"
+#include "common.h"
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+FILE* CIO::target=stdout;
+
+CIO::CIO()
+{
+}
+
+void CIO::message(EMessageType prio, const CHAR *fmt, ... )
+{
+ check_target();
+ print_message_prio(prio, target);
+ va_list list;
+ va_start(list,fmt);
+ vfprintf(target,fmt,list);
+ va_end(list);
+ fflush(target);
+}
+
+void CIO::buffered_message(EMessageType prio, const CHAR *fmt, ... )
+{
+ check_target();
+ print_message_prio(prio, target);
+ va_list list;
+ va_start(list,fmt);
+ vfprintf(target,fmt,list);
+ va_end(list);
+}
+
+CHAR* CIO::skip_spaces(CHAR* str)
+{
+ INT i=0;
+
+ if (str)
+ {
+ for (i=0; isspace(str[i]); i++);
+
+ return &str[i];
+ }
+ else
+ return str;
+}
+
+void CIO::set_target(FILE* t)
+{
+ target=t;
+}
+
+void CIO::check_target()
+{
+ if (!target)
+ target=stdout;
+}
+
+void CIO::print_message_prio(EMessageType prio, FILE* target)
+{
+ switch (prio)
+ {
+ case M_DEBUG:
+ fprintf(target, "[DEBUG] ");
+ break;
+ case M_PROGRESS:
+ fprintf(target, "[PROGRESS] ");
+ break;
+ case M_INFO:
+ //fprintf(target, "[INFO] ");
+ break;
+ case M_NOTICE:
+ fprintf(target, "[NOTICE] ");
+ break;
+ case M_WARN:
+ fprintf(target, "[WARN] ");
+ break;
+ case M_ERROR:
+ fprintf(target, "[ERROR] ");
+ break;
+ case M_CRITICAL:
+ fprintf(target, "[CRITICAL] ");
+ break;
+ case M_ALERT:
+ fprintf(target, "[ALERT] ");
+ break;
+ case M_EMERGENCY:
+ fprintf(target, "[EMERGENCY] ");
+ break;
+ case M_MESSAGEONLY:
+ break;
+ default:
+ break;
+ }
+}
--- /dev/null
+#ifndef __CIO_H__
+#define __CIO_H__
+
+#include "common.h"
+
+#include <stdio.h>
+#include <stdarg.h>
+
+class CIO
+{
+public:
+ CIO();
+
+ static void set_target(FILE* target);
+ static void message(EMessageType prio, const CHAR *fmt, ... );
+
+ inline static void not_implemented()
+ {
+ message(M_ERROR, "Sorry, not yet implemented\n");
+ }
+
+ static void buffered_message(EMessageType prio, const CHAR *fmt, ... );
+
+ static CHAR* skip_spaces(CHAR* str);
+
+protected:
+ static void check_target();
+ static void print_message_prio(EMessageType prio, FILE* target);
+
+protected:
+ static FILE* target;
+};
+#endif
--- /dev/null
+#include <assert.h>
+#include "config.h"
+//#include "features/CharFeatures.h"
+//#include "features/StringFeatures.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "io.h"
+
+
+#include "fill_matrix.h"
+#include "penalty_info.h"
+
+void init_penalty_struct(struct penalty_struct &PEN)
+{
+ PEN.limits=NULL ;
+ PEN.penalties=NULL ;
+ PEN.id=-1 ;
+ PEN.next_pen=NULL ;
+ PEN.transform = T_LINEAR ;
+ PEN.name = NULL ;
+ PEN.max_len=0 ;
+ PEN.min_len=0 ;
+ PEN.cache=NULL ;
+ PEN.use_svm=0 ;
+}
+
+void init_penalty_struct_cache(struct penalty_struct &PEN)
+{
+ if (PEN.cache || PEN.use_svm)
+ return ;
+
+ REAL* cache=new REAL[PEN.max_len+1] ;
+ if (cache)
+ {
+ for (INT i=0; i<=PEN.max_len; i++)
+ cache[i] = lookup_penalty(&PEN, i, 0, false) ;
+ PEN.cache = cache ;
+ }
+}
+
+void delete_penalty_struct_palma(struct penalty_struct &PEN)
+{
+ if (PEN.id!=-1)
+ {
+ delete[] PEN.limits ;
+ delete[] PEN.penalties ;
+ delete[] PEN.name ;
+ delete[] PEN.cache ;
+ }
+}
+
+void delete_penalty_struct_array(struct penalty_struct *PEN, INT len)
+{
+ for (int i=0; i<len; i++)
+ delete_penalty_struct_palma(PEN[i]) ;
+ delete[] PEN ;
+}
+
+void copy_penalty_struct(struct penalty_struct *old, struct penalty_struct *newp) {
+ newp->len = old->len;
+ newp->limits = old->limits;
+ newp->penalties = old->penalties;
+ newp->max_len = old->max_len;
+ newp->min_len = old->min_len;
+ newp->cache = old->cache;
+ newp->transform = old->transform;
+ newp->id = old->id;
+ newp->next_pen = old->next_pen;
+ newp->name = old->name;
+ newp->use_svm = old->use_svm;
+}
+
+
+//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, int &P)
+//{
+// P = mxGetN(mx_penalty_info) ;
+//
+// struct penalty_struct * PEN = new struct penalty_struct[P] ;
+// for (INT i=0; i<P; i++)
+// init_penalty_struct(PEN[i]) ;
+//
+// for (INT i=0; i<P; i++)
+// {
+// const mxArray* mx_elem = mxGetCell(mx_penalty_info, i) ;
+// if (mx_elem==NULL || !mxIsStruct(mx_elem))
+// {
+// CIO::message(M_ERROR, "empty cell element\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// } ;
+// const mxArray* mx_id_field = mxGetField(mx_elem, 0, "id") ;
+// if (mx_id_field==NULL || !mxIsNumeric(mx_id_field) ||
+// mxGetN(mx_id_field)!=1 || mxGetM(mx_id_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing id field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_limits_field = mxGetField(mx_elem, 0, "limits") ;
+// if (mx_limits_field==NULL || !mxIsNumeric(mx_limits_field) ||
+// mxGetM(mx_limits_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing limits field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// INT len = mxGetN(mx_limits_field) ;
+//
+// const mxArray* mx_penalties_field = mxGetField(mx_elem, 0, "penalties") ;
+// if (mx_penalties_field==NULL || !mxIsNumeric(mx_penalties_field) ||
+// mxGetM(mx_penalties_field)!=1 || mxGetN(mx_penalties_field)!=len)
+// {
+// CIO::message(M_ERROR, "missing penalties field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_transform_field = mxGetField(mx_elem, 0, "transform") ;
+// if (mx_transform_field==NULL || !mxIsChar(mx_transform_field))
+// {
+// CIO::message(M_ERROR, "missing transform field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_name_field = mxGetField(mx_elem, 0, "name") ;
+// if (mx_name_field==NULL || !mxIsChar(mx_name_field))
+// {
+// CIO::message(M_ERROR, "missing name field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_max_len_field = mxGetField(mx_elem, 0, "max_len") ;
+// if (mx_max_len_field==NULL || !mxIsNumeric(mx_max_len_field) ||
+// mxGetM(mx_max_len_field)!=1 || mxGetN(mx_max_len_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing max_len field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_min_len_field = mxGetField(mx_elem, 0, "min_len") ;
+// if (mx_min_len_field==NULL || !mxIsNumeric(mx_min_len_field) ||
+// mxGetM(mx_min_len_field)!=1 || mxGetN(mx_min_len_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing min_len field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// const mxArray* mx_use_svm_field = mxGetField(mx_elem, 0, "use_svm") ;
+// if (mx_use_svm_field==NULL || !mxIsNumeric(mx_use_svm_field) ||
+// mxGetM(mx_use_svm_field)!=1 || mxGetN(mx_use_svm_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing use_svm field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// INT use_svm = (INT) mxGetScalar(mx_use_svm_field) ;
+// //fprintf(stderr, "use_svm_field=%i\n", use_svm) ;
+//
+// const mxArray* mx_next_id_field = mxGetField(mx_elem, 0, "next_id") ;
+// if (mx_next_id_field==NULL || !mxIsNumeric(mx_next_id_field) ||
+// mxGetM(mx_next_id_field)!=1 || mxGetN(mx_next_id_field)!=1)
+// {
+// CIO::message(M_ERROR, "missing next_id field\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// INT next_id = (INT) mxGetScalar(mx_next_id_field)-1 ;
+//
+// INT id = (INT) mxGetScalar(mx_id_field)-1 ;
+// if (i<0 || i>P-1)
+// {
+// CIO::message(M_ERROR, "id out of range\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// INT max_len = (INT) mxGetScalar(mx_max_len_field) ;
+// if (max_len<0 || max_len>1024*1024*100)
+// {
+// CIO::message(M_ERROR, "max_len out of range\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// PEN[id].max_len = max_len ;
+//
+// INT min_len = (INT) mxGetScalar(mx_min_len_field) ;
+// if (min_len<0 || min_len>1024*1024*100)
+// {
+// CIO::message(M_ERROR, "min_len out of range\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// PEN[id].min_len = min_len ;
+//
+// if (PEN[id].id!=-1)
+// {
+// CIO::message(M_ERROR, "penalty id already used\n") ;
+// delete_penalty_struct_array(PEN,P) ;
+// return NULL ;
+// }
+// PEN[id].id=id ;
+// if (next_id>=0)
+// PEN[id].next_pen=&PEN[next_id] ;
+// //fprintf(stderr,"id=%i, next_id=%i\n", id, next_id) ;
+//
+// assert(next_id!=id) ;
+// PEN[id].use_svm=use_svm ;
+// PEN[id].limits = new REAL[len] ;
+// PEN[id].penalties = new REAL[len] ;
+// double * limits = mxGetPr(mx_limits_field) ;
+// double * penalties = mxGetPr(mx_penalties_field) ;
+//
+// for (INT i=0; i<len; i++)
+// {
+// PEN[id].limits[i]=limits[i] ;
+// PEN[id].penalties[i]=penalties[i] ;
+// }
+// PEN[id].len = len ;
+//
+// char *transform_str = mxArrayToString(mx_transform_field) ;
+// char *name_str = mxArrayToString(mx_name_field) ;
+//
+// if (strcmp(transform_str, "log")==0)
+// PEN[id].transform = T_LOG ;
+// else if (strcmp(transform_str, "log(+1)")==0)
+// PEN[id].transform = T_LOG_PLUS1 ;
+// else if (strcmp(transform_str, "log(+3)")==0)
+// PEN[id].transform = T_LOG_PLUS3 ;
+// else if (strcmp(transform_str, "(+3)")==0)
+// PEN[id].transform = T_LINEAR_PLUS3 ;
+// else if (strcmp(transform_str, "")==0)
+// PEN[id].transform = T_LINEAR ;
+// else
+// {
+// delete_penalty_struct_array(PEN,P) ;
+// mxFree(transform_str) ;
+// return NULL ;
+// }
+// PEN[id].name = new char[strlen(name_str)+1] ;
+// strcpy(PEN[id].name, name_str) ;
+//
+// init_penalty_struct_cache(PEN[id]) ;
+//
+// mxFree(transform_str) ;
+// mxFree(name_str) ;
+// }
+// return PEN ;
+//}
+
+REAL lookup_penalty_svm(const struct penalty_struct *PEN, INT p_value, REAL *d_values)
+{
+ if (PEN==NULL)
+ return 0 ;
+ assert(PEN->use_svm>0) ;
+ REAL d_value=d_values[PEN->use_svm-1] ;
+ //fprintf(stderr,"transform=%i, d_value=%1.2f\n", (INT)PEN->transform, d_value) ;
+
+ switch (PEN->transform)
+ {
+ case T_LINEAR:
+ break ;
+ case T_LOG:
+ d_value = log(d_value) ;
+ break ;
+ case T_LOG_PLUS1:
+ d_value = log(d_value+1) ;
+ break ;
+ case T_LOG_PLUS3:
+ d_value = log(d_value+3) ;
+ break ;
+ case T_LINEAR_PLUS3:
+ d_value = d_value+3 ;
+ break ;
+ default:
+ CIO::message(M_ERROR, "unknown transform\n") ;
+ break ;
+ }
+
+ INT idx = 0 ;
+ REAL ret ;
+ for (INT i=0; i<PEN->len; i++)
+ if (PEN->limits[i]<=d_value)
+ idx++ ;
+
+ if (idx==0)
+ ret=PEN->penalties[0] ;
+ else if (idx==PEN->len)
+ ret=PEN->penalties[PEN->len-1] ;
+ else
+ {
+ ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]*
+ (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ;
+ }
+
+ //fprintf(stderr,"ret=%1.2f\n", ret) ;
+
+ if (PEN->next_pen)
+ ret+=lookup_penalty(PEN->next_pen, p_value, d_values);
+
+ //fprintf(stderr,"ret=%1.2f\n", ret) ;
+
+ return ret ;
+}
+
+REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value,
+ REAL* svm_values, bool follow_next)
+{
+ if (PEN==NULL)
+ return 0 ;
+ if (PEN->use_svm)
+ return lookup_penalty_svm(PEN, p_value, svm_values) ;
+
+ if ((p_value<PEN->min_len) || (p_value>PEN->max_len))
+ return -CMath::INFTY ;
+
+ if (PEN->cache!=NULL && (p_value>=0) && (p_value<=PEN->max_len))
+ {
+ REAL ret=PEN->cache[p_value] ;
+ if (PEN->next_pen && follow_next)
+ ret+=lookup_penalty(PEN->next_pen, p_value, svm_values);
+ return ret ;
+ }
+
+ REAL d_value = (REAL) p_value ;
+ switch (PEN->transform)
+ {
+ case T_LINEAR:
+ break ;
+ case T_LOG:
+ d_value = log(d_value) ;
+ break ;
+ case T_LOG_PLUS1:
+ d_value = log(d_value+1) ;
+ break ;
+ case T_LOG_PLUS3:
+ d_value = log(d_value+3) ;
+ break ;
+ case T_LINEAR_PLUS3:
+ d_value = d_value+3 ;
+ break ;
+ default:
+ CIO::message(M_ERROR, "unknown transform\n") ;
+ break ;
+ }
+
+ INT idx = 0 ;
+ REAL ret ;
+ for (INT i=0; i<PEN->len; i++)
+ if (PEN->limits[i]<=d_value)
+ idx++ ;
+
+ if (idx==0)
+ ret=PEN->penalties[0] ;
+ else if (idx==PEN->len)
+ ret=PEN->penalties[PEN->len-1] ;
+ else
+ {
+ ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]*
+ (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ;
+ }
+ //if (p_value>=30 && p_value<150)
+ //fprintf(stderr, "%s %i(%i) -> %1.2f\n", PEN->name, p_value, idx, ret) ;
+
+ if (PEN->next_pen && follow_next)
+ ret+=lookup_penalty(PEN->next_pen, p_value, svm_values);
+
+ return ret ;
+}
--- /dev/null
+#ifndef __PENALTY_INFO_H__
+#define __PENALTY_INFO_H__
+
+#include "common.h"
+#include "Mathmatics.h"
+
+enum ETransformType
+{
+ T_LINEAR,
+ T_LOG,
+ T_LOG_PLUS1,
+ T_LOG_PLUS3,
+ T_LINEAR_PLUS3
+} ;
+
+struct penalty_struct
+{
+ INT len ;
+ REAL *limits ;
+ REAL *penalties ;
+ INT max_len ;
+ INT min_len ;
+ REAL *cache ;
+ enum ETransformType transform ;
+ INT id ;
+ struct penalty_struct *next_pen ;
+ char * name ;
+ INT use_svm ;
+} ;
+
+void init_penalty_struct(struct penalty_struct &PEN) ;
+void delete_penalty_struct_palma(struct penalty_struct &PEN) ;
+//void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) ;
+void copy_penalty_struct(struct penalty_struct *old, struct penalty_struct *newp);
+
+//#ifdef HAVE_MATLAB
+//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, mwSize &P) ;
+//#endif
+
+REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value,
+ REAL* svm_values, bool follow_next=true) ;
+
+enum mode { NORMAL, USE_QUALITY_SCORES};
+
+#endif
--- /dev/null
+#include "fill_matrix.h"
+using namespace std;
+
+void print_align(Pre_score* matrix, int length_est, int length_dna,Align_pair* vektor, int result_length, int print_matrix)
+
+{
+ //Druck der matrix
+ switch(print_matrix)
+ {
+ case 1:
+ {
+ cout <<"Matrix: \n";
+
+ for (int k = 0; k<length_est; k++)//k Zeile, l Spalte
+ {
+ cout << "Zeile " << k << ":\t";
+ for (int l= 0; l<length_dna; l++)
+ {
+ cout << ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
+ }
+ cout <<"\n";
+ }
+ break;
+ }
+
+ case 2:
+ {
+ cout <<"Matrix: \n";
+
+ for (int k = 0; k<length_est; k++)//k Zeile, l Spalte
+ {
+ cout << "Zeile " << k << ":\t";
+ for (int l= 0; l<length_dna; l++)
+ {
+ if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k-1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1))
+ cout << "\\" <<((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
+ else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1))
+ cout << "<"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
+ else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k -1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l))
+ cout << "|"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t";
+ else { cout << ((Pre_score*)matrix + (k*length_dna) + l)->value << "(";
+ cout << ((Pre_score*)matrix + (k*length_dna) + l)->prev_i << "," << ((Pre_score*)matrix + (k*length_dna) + l)->prev_j << ")\t";
+ }
+
+ }
+ cout <<"\n";
+ }
+
+ cout << "length_est" << length_est << "\n";
+ cout << "length_dna" << length_dna << "\n";
+ cout << "result_length: " << result_length << "\n";
+
+
+ break;
+ }
+
+ case 3:
+ {
+ //Druck des Alignment
+
+ // cout << "Alignment\n" ;
+ int j = 0;
+ // cout << "resultlength: " << result_length <<"\n";
+ while (j<result_length) {
+ int pacman = j;
+ cout << "D:(" <<j<<")\t";
+ for (int i=0; ((j<result_length) && (i<50)); i++)
+ {
+ cout << vektor[j].dna_char << " ";
+ j++;
+ }
+
+ cout << "\n";
+
+ cout << "E:(" <<pacman<<")\t";
+ for (int i=0;((pacman<result_length) && (i<50)); i++)
+ {
+ cout << vektor[pacman].est_char << " ";
+ pacman++;
+ }
+ cout << "\n\n";
+ }
+
+ //Ende Druck Alignment
+ break ;
+ }
+
+ default:
+ {
+ cout <<"";
+ }
+ }
+
+
+
+}
+
--- /dev/null
+#include "qpalma_dp.h"
+#include <cstring>
+
+using namespace std;
+
+/*[splice_align, est_align, weightMatch, alignmentscores, dnaest] = ...
+ myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ...
+ print_matrix) */
+
+Alignment::Alignment(int numQPlifs, int numq, bool use_qscores) {
+ len = 0;
+ limits = 0;
+ penalties = 0;
+ max_len = 0;
+ min_len = 0;
+ cache = 0;
+ enum ETransformType transform = T_LINEAR;
+ id = 0;
+ name = 0;
+ use_svm = 0;
+
+ // set ptrs to zero first
+ splice_align = 0;
+ est_align = 0;
+ mmatrix_param = 0;
+ alignmentscores = 0;
+ qualityScoresAllPaths = 0;
+ mlen = 6; // score matrix: length of 6 for "- A C G T N"
+
+ numQualSuppPoints = numq;
+ numPlifs = numQPlifs;
+ use_quality_scores = use_qscores;
+
+ //printf("number of support points: %d\n",numQualSuppPoints);
+ //printf("number of plifs: %d\n",numPlifs );
+ FA( numQualSuppPoints >= 0 );
+ FA( numPlifs >= 0 );
+}
+
+void Alignment::getDNAEST(){}
+
+void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
+ int est_len_p, double* prb, double* chastity, struct penalty_struct h, double* matchmatrix, int mm_len,
+ double* donor, int d_len, double* acceptor, int a_len, struct penalty_struct* qualityScores,
+ bool remove_duplicate_scores, bool print_matrix) {
+
+ // printf("Entering myalign...\n");
+ dna_len = dna_len_p + 1 ;
+ est_len = est_len_p + 1 ;
+ nr_paths = nr_paths_p;
+
+ /***************************************************************************/
+ // initialize alignment matrices and call fill_matrix()
+ /***************************************************************************/
+
+ mode currentMode;
+ if (use_quality_scores)
+ currentMode = USE_QUALITY_SCORES;
+ else
+ currentMode = NORMAL;
+
+ // dnaest
+ double *DNA_ARRAY = 0;
+ double *EST_ARRAY = 0;
+
+ //possible donor positions
+ int nr_donor_sites = 0 ;
+ for (int ii=0; ii<d_len; ii++) {
+ if(isfinite(donor[ii])) {
+ nr_donor_sites++ ;
+ }
+ }
+
+ int* donor_sites = new int[nr_donor_sites];
+ int donor_idx = 0 ;
+ for (int ii=0; ii<d_len; ii++) {
+ if(isfinite(donor[ii])) {
+ donor_sites[donor_idx] = ii+1 ;
+ donor_idx++ ;
+ }
+ }
+
+ int* max_score_positions = new int[nr_paths*2];
+
+ Pre_score** matrices = new Pre_score*[nr_paths];
+ for (int z=0; z<nr_paths; z++) {
+ matrices[z] = new Pre_score[dna_len * est_len];
+ }
+
+ //printf("qualityScores\n");
+ //for(int qidx=0;qidx<numPlifs;qidx++) {
+ // penalty_struct p;
+ // p = qualityScores[qidx];
+ // printf("%d ",qidx);
+ // for(int pidx=0;pidx<p.len;pidx++)
+ // printf("%f ",pidx,p.penalties[pidx]);
+ // printf("\n");
+ //}
+
+ //printf("calling fill_matrix...\n");
+ fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &h, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions,currentMode);
+ //printf("after call to fill_matrix...\n");
+
+ /***************************************************************************/
+ // return arguments etc.
+ /***************************************************************************/
+ int result_length; //Eine Variable fuer alle Matrizen
+
+ splice_align_size = (dna_len-1)*nr_paths;
+ est_align_size = (est_len-1)*nr_paths;
+
+ int mmatrix_size;
+
+ if (currentMode == USE_QUALITY_SCORES) {
+ mmatrix_param_size = mlen*nr_paths;
+ mmatrix_size = mlen;
+ }
+
+ if (currentMode == NORMAL) {
+ mmatrix_param_size = (mlen*mlen)*nr_paths;
+ mmatrix_size = mlen*mlen;
+ }
+
+ alignmentscores_size = nr_paths; //alignment score for each path/matrix
+ numPathsPlifs = numPlifs*nr_paths; //alignment score for each path/matrix
+
+ splice_align = new int[splice_align_size];
+ est_align = new int[est_align_size];
+ mmatrix_param = new int[mmatrix_param_size];
+ alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
+
+
+ // printf("before memset...\n");
+ memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
+ memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
+ memset((char*)mmatrix_param, 0, mmatrix_size*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
+ memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
+ //printf("after memset...\n");
+
+ qualityScoresAllPaths= new penalty_struct*[nr_paths];
+
+ for (int z=0; z<nr_paths; z++) {
+ result_length = 0 ;
+
+ int* s_align = splice_align + (dna_len-1)*z; //pointer
+ int* e_align = est_align + (est_len-1)*z ; //pointer
+ int* mparam = mmatrix_param + mmatrix_size*z; //pointer
+
+ qualityScoresAllPaths[z] = new penalty_struct[numPlifs];
+
+ int qidx, pidx;
+ for(qidx=0;qidx<numPlifs;qidx++) {
+ penalty_struct p;
+ p.len = numQualSuppPoints;
+ p.limits = (REAL*) calloc(p.len,sizeof(REAL));
+
+ for(pidx=0;pidx<p.len;pidx++)
+ p.limits[pidx] = qualityScores[qidx%numPlifs].limits[pidx];
+
+ p.penalties = (REAL*) calloc(p.len,sizeof(REAL));
+ qualityScoresAllPaths[z][qidx] = p;
+ }
+
+ //penalty_struct* qparam = qualityScoresAllPaths + (numPlifs*z);
+
+ //printf("before call to result_align...\n");
+ bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, prb, chastity, s_align, e_align, mparam, alignmentscores, max_score_positions, qualityScoresAllPaths[z] , currentMode);
+ //printf("after call to result_align...\n");
+
+ //printf("z is %d\n",z);
+ //for(qidx=0;qidx<numPlifs;qidx++) {
+ // penalty_struct p;
+ // p = qualityScoresAllPaths[qidx+numPlifs*z];
+ // printf("%d ",qidx);
+ // for(pidx=0;pidx<p.len;pidx++)
+ // printf("%f ",pidx,p.penalties[pidx]);
+ // printf("\n");
+ //}
+
+ //if(DNA_ARRAY != 0) {
+ // delete[] DNA_ARRAY;
+ // delete[] EST_ARRAY;
+ // }
+
+ //DNA_ARRAY = new double[result_length];
+ //EST_ARRAY = new double[result_length];
+
+ // //backtracking
+ // int i = max_score_positions[2*z] ; //i (est)
+ // int j = max_score_positions[2*z +1] ; //j (dna)
+ // int prev_i = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_i;
+ // int prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j;
+ // int prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no;
+ //
+ // for (int ii=result_length; ii>0; ii--) {
+ // if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
+ //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
+ // }
+ // else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
+ //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ //EST_ARRAY[ii-1] = 0 ; //gap
+ // }
+ // else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
+ //DNA_ARRAY[ii-1] = 0 ; //gap
+ //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
+ // }
+ // else {// splice site
+ //for (j; j > prev_j; j--) {
+ // DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ // EST_ARRAY[ii-1] = 6 ; //intron
+ // ii-- ;
+ //}
+ //ii++ ; // last ii-- too much (because done in for-loop)
+ // }
+ //
+ // i = prev_i;
+ // j = prev_j;
+ // prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i;
+ // prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j;
+ // prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no;
+ // }
+ //
+ } //end of z
+
+ //if(DNA_ARRAY != 0) {
+ // delete[] DNA_ARRAY;
+ // delete[] EST_ARRAY;
+ // }
+
+ for (int i=nr_paths-1;i>=0; i--)
+ delete[] matrices[i];
+}
+
+void Alignment::getAlignmentResults(int* s_align, int* e_align,
+ int* mmatrix_p, double* alignscores, double* qScores) {
+
+ int idx;
+ for(idx=0; idx<splice_align_size; idx++)
+ s_align[idx] = splice_align[idx];
+
+ for(idx=0; idx<est_align_size; idx++)
+ e_align[idx] = est_align[idx];
+
+ for(idx=0; idx<mmatrix_param_size; idx++)
+ mmatrix_p[idx] = mmatrix_param[idx];
+
+ for(idx=0; idx<alignmentscores_size; idx++)
+ alignscores[idx] = alignmentscores[idx];
+
+ if (use_quality_scores) {
+ penalty_struct currentPlif;
+ int ctr=0;
+ for (int z=0; z<nr_paths; z++) {
+ for(int estChar=1;estChar<6;estChar++) {
+ for(int dnaChar=0;dnaChar<6;dnaChar++) {
+ int currentPos = (estChar-1)*6+dnaChar;
+ currentPlif = qualityScoresAllPaths[z][currentPos];
+ for(int pidx=0; pidx<currentPlif.len; pidx++) {
+ qScores[ctr] = currentPlif.penalties[pidx];
+ ctr++;
+ }
+ }}}
+ }
+
+ //for(idx=0; idx<numPathsPlifs; idx++) {
+ // currentPlif = qualityScoresAllPaths[idx];
+ // //printf("Size is %dx%d\n",numPathsPlifs,currentPlif.len);
+ // for(pidx=0; pidx<currentPlif.len; pidx++) {
+ // qScores[ctr] = currentPlif.penalties[pidx];
+ // ctr++;
+ // }
+ //}
+
+ //printf("\nctr is %d\n",ctr);
+ //printf("Leaving getAlignmentResults...\n");
+}
--- /dev/null
+#ifndef _QPALMA_DP_H_
+#define _QPALMA_DP_H_
+
+#include "penalty_info.h"
+
+struct ArrayElem { //24B
+ double score;
+ int prev_j;
+ int prev_i;
+ int prev_matrix_no;
+ bool isSplice;
+} ;
+
+struct pre_score { //24B
+ double value; //8
+ int prev_i; //4
+ int prev_j; //4
+ int prev_matrix_no;//4
+ bool issplice;//4
+};
+
+typedef struct pre_score Pre_score;
+
+typedef struct Align_pair { //24B
+ double value ;
+ bool issplice;
+ int previ,prevj ;
+ char est_char;
+ char dna_char;
+};
+
+void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode);
+
+int check_char(char base);
+void fassert(bool exp,int line, char* file);
+#define FA(expr) (fassert(expr,__LINE__,__FILE__))
+
+bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode);
+
+extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix);
+
+
+/*
+ * The function myalign calculates a new alignment given the scoring scheme
+ * Its input arguments are:
+ *
+ * num_path(id) -> an integer specifying which alignment has to be done
+ * dna ->
+ * est ->
+ * h ->
+ * mmatrix ->
+ * donor ->
+ * acceptor ->
+ * remove_duplicate_scores -> a boolean
+ * print_matrix -> a boolean
+ *
+ * [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(...
+ *
+ * the idea of the qualityScores array is as follows
+ *
+ * consider a matrix of 24 plifs
+ *
+ * -> row major
+ *
+ */
+
+class Alignment {
+
+ private:
+ int* splice_align;
+ int* est_align;
+ int* mmatrix_param;
+ double* alignmentscores;
+ struct penalty_struct** qualityScoresAllPaths;
+
+ int dna_len;
+ int est_len;
+ int mlen;
+ int nr_paths;
+
+ int numQualSuppPoints;
+ int numPlifs;
+ bool use_quality_scores;
+
+ uint splice_align_size ;
+ uint est_align_size ;
+ uint mmatrix_param_size ;
+ uint alignmentscores_size ;
+ uint numPathsPlifs ;
+
+ INT len;
+ REAL *limits;
+ REAL *penalties;
+ INT max_len;
+ INT min_len;
+ REAL *cache;
+ enum ETransformType transform ;
+ INT id;
+ char * name;
+ INT use_svm;
+
+ public:
+ Alignment(int numQPlifs ,int numq, bool use_qscores);
+ ~Alignment() {
+ if(splice_align != 0)
+ delete[] splice_align;
+
+ if(est_align != 0)
+ delete[] est_align;
+
+ if(mmatrix_param != 0)
+ delete[] mmatrix_param;
+
+ if(alignmentscores != 0)
+ delete[] alignmentscores;
+
+ if(qualityScoresAllPaths != 0)
+ delete[] qualityScoresAllPaths;
+ }
+
+ void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
+ int est_len_p, double* prb, double* chastity, struct penalty_struct h,
+ double* matchmatrix, int mm_len, double* donor, int d_len, double* acceptor,
+ int a_len, struct penalty_struct* qualityScores, bool remove_duplicate_scores,
+ bool print_matrix);
+
+ void getDNAEST();
+ void getAlignmentResults(int* s_align, int* e_align,
+ int* mmatrix_p, double* alignscores, double* qScores);
+};
+
+#endif // _QPALMA_DP_H_
+
--- /dev/null
+#include "assert.h"
+#include "fill_matrix.h"
+#include "debug_tools.h"
+
+using namespace std;
+
+void increaseFeatureCount(penalty_struct* qparam, int dnaChar, int estChar, double estprb) {
+
+ FA(estChar > 0 && estChar < 6);
+ FA(dnaChar > -1 && dnaChar < 6);
+
+ int currentPos = (estChar-1)*6+dnaChar;
+
+ penalty_struct currentStruct = qparam[currentPos];
+
+ //printf("Current index %d est/dna: %d %d score %f\n",currentPos,estChar,dnaChar,estprb);
+
+ //printf("before\n");
+ //int p_idx;
+ //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+ // printf("%f ",currentStruct.limits[p_idx]);
+ //}
+ //printf("\n");
+
+ //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+ // printf("%f ",currentStruct.penalties[p_idx]);
+ //}
+ //printf("\n");
+
+ double value = estprb;
+ int Lower = 0;
+ int idx;
+
+ for (idx=0;idx<currentStruct.len;idx++) {
+ if (currentStruct.limits[idx] <= value)
+ Lower++;
+ }
+
+ if (Lower == 0) {
+ currentStruct.penalties[0] += 1;
+ qparam[currentPos] = currentStruct;
+ return;
+ }
+
+ if (Lower == currentStruct.len) {
+ currentStruct.penalties[currentStruct.len-1] += 1;
+ qparam[currentPos] = currentStruct;
+ return;
+ }
+
+ Lower -= 1;
+ int Upper = Lower+1; // x-werte bleiben fest
+
+ double weightup = 1.0*(value - currentStruct.limits[Lower]) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
+ double weightlow = 1.0*(currentStruct.limits[Upper] - value) / (currentStruct.limits[Upper] - currentStruct.limits[Lower]);
+ currentStruct.penalties[Upper] += weightup;
+ currentStruct.penalties[Lower] += weightlow;
+
+ //printf("estprb/Lower/Upper %f %d %d\n",estprb,Lower,Upper);
+ //printf("after\n");
+ //for(p_idx=0;p_idx<currentStruct.len;p_idx++) {
+ // printf("%f ",currentStruct.limits[p_idx]);
+ //}
+ //printf("\n");
+ //for(p_idx=0;p_idx<currentStruct.len;p_idx++)
+ // printf("%f ",currentStruct.penalties[p_idx]);
+ //printf("\n");
+ qparam[currentPos] = currentStruct;
+}
+
+bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode)
+//Gibt 1 zurueck, wenn kein weiterer Pfad existiert, sonst 0
+
+{
+ const int mlen=6; // length of matchmatrix
+ int startend[4] ; // [i_start, j_start, i_end, j_end] ;
+ startend[3] = max_score_positions[2*z] ; //i (est)
+ startend[4] = max_score_positions[2*z +1] ; //j (dna)
+ //printf("in result_align: z=%d:(%d, %d)\n", z, startend[3], startend[4]) ;
+ int dnanum ; //using check_char: letter -> number
+ int estnum ; //using check_char: letter -> number
+
+ double prbnum ;
+ double chastitynum ;
+
+ int dna_pos ;
+ int est_pos ;
+ int splice_state ;
+ int est_state ;
+ for (dna_pos=dna_len-1; dna_pos>startend[4]; dna_pos--) {
+ s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
+ }
+ for (est_pos=est_len-1; est_pos>startend[3]; est_pos--) {
+ e_align[est_pos-1] = 4 ;
+ }
+ //splice_align: 0: exon, 1: donor, 2: acceptor, 3: intro,. 4: dangling end
+
+ /* starting point for backtracking */
+ int i = startend[3] ;
+ int j = startend[4] ;
+ int act_m_no = z ;
+ alignmentscores[z] = ((Pre_score*)matrices[z] + i*dna_len +j)->value;
+ //printf("alignment_score for z=%d: %3.8f\n", z, alignmentscores[z]) ;
+
+ int prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_i;
+ int prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_j;
+ int prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no;
+
+ //printf("Current z is %d\n",z);
+
+ if(!(isfinite(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)))
+ {
+ cout << "No additional path left (inf)!\n";
+ return 1;
+ }
+
+ if(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value==0) {
+ cout << "No additional path left (0)!\n";
+ return 1;
+ }
+
+ assert(dna_pos == j) ;
+ assert(est_pos == i) ;
+
+ splice_state = 0 ; //exon
+ est_state = 1 ; //[112222111 ... for exons of length 2,4,3 ...
+
+ //compute length of alignment (between max(m,n) and m+n)
+ //local_alignment: backtracking to first Zero: look at actual value
+ while(!(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)==0.0) {
+
+ //just some info
+ //if ((z>0) && (!(z==prev_m_no))) {
+ // printf("(%d,%d): next step in next matrix\n", i,j) ;
+ //}
+
+ if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
+ (*result_length_ptr)= (*result_length_ptr) + 1;
+
+ s_align[dna_pos-1] = splice_state;
+ dna_pos-- ;
+ e_align[est_pos-1] = est_state ; //1 or 2, depended
+ est_pos-- ;
+
+ dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0
+ estnum = check_char(est[i-1]) ; //-1 because index starts with 0
+
+ prbnum = prb[i-1];
+ chastitynum = chastity[i-1];
+
+ //printf("Position %d,%d est,dna: %d,%d\n",i,j,estnum,dnanum);
+
+ if(currentMode == USE_QUALITY_SCORES)
+ increaseFeatureCount(qparam,dnanum,estnum,prbnum);
+ else
+ mparam[mlen*dnanum+estnum] ++ ;
+
+ }
+ else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST
+ (*result_length_ptr)= (*result_length_ptr) + 1;
+
+ s_align[dna_pos-1] = splice_state;
+ dna_pos-- ;
+
+ dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0
+ estnum = 0 ; //gap
+
+ if(currentMode == USE_QUALITY_SCORES)
+ mparam[dnanum] ++ ;
+ else
+ mparam[mlen*dnanum +estnum] ++ ;
+
+ }
+ else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA
+ (*result_length_ptr)= (*result_length_ptr) + 1;
+
+ e_align[est_pos-1] = est_state ; //1 or 2, depended
+ est_pos-- ;
+
+ dnanum = 0 ; //gap
+ estnum = check_char(est[i-1]) ; //-1 because index starts with 0
+
+ prbnum = prb[i-1];
+ chastitynum = chastity[i-1];
+
+ //printf("Position %d,%d est,dna: %d,%d\n",i,j,estnum,dnanum);
+
+ if(currentMode == USE_QUALITY_SCORES)
+ increaseFeatureCount(qparam,dnanum,estnum,prbnum);
+ else
+ mparam[mlen*dnanum +estnum] ++ ;
+ }
+ else {// splice site
+ (*result_length_ptr) = (*result_length_ptr) + (j - prev_j);
+ if (((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice==0) {
+ printf("matrices[act_m_no](i,j): %3.15f\n",((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value) ;
+ printf("matrices[prev_m_no](i,j): %3.15f\n",((Pre_score*)matrices[prev_m_no] + i*dna_len +j)->value) ;
+ printf("issplice(act): %d\n",((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice) ;
+ printf("issplice(prev): %d\n",((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->issplice) ;
+ printf("act_m_no: %d, prev_m_no: %d\n", act_m_no, prev_m_no) ;
+ printf("j: %d -> prev_j:%d\n", j, prev_j) ;
+ printf("i: %d -> prev_i:%d\n", i, prev_i) ;
+ }
+ if (est_state==1) { //was exon labeled "1"
+ est_state = 2 ; //new exon is labeled "2"
+ }
+ else {
+ est_state = 1 ; //last exon was labeled "2", new exon is labeled "1"
+ }
+ for (j; j > prev_j; j--) {
+ if (splice_state == 0) { //coming from exon
+ splice_state = 2 ; //first intron_state: acceptor
+ } else {
+ splice_state = 3 ; //intron
+ }
+ if (j-1 == prev_j) { //last intron_state: donor
+ splice_state = 1 ;//donor
+ }
+ s_align[dna_pos-1] = splice_state;
+ dna_pos-- ;
+ }
+ splice_state = 0 ; //exon again
+
+ }
+
+ startend[1] = i;
+ startend[2] = j;
+ i = prev_i;
+ j = prev_j;
+ act_m_no = prev_m_no ;
+
+ prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_i;
+ prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_j;
+ prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no;
+ }
+ for (dna_pos; dna_pos>0; dna_pos--) {
+ s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
+ }
+ for (est_pos; est_pos>0; est_pos--) {
+ e_align[est_pos-1] = 4 ; // not aligned (-1 because array starts with index 0)
+ }
+
+ //printf("(%d, %d) --> (%d, %d)\n", startend[1], startend[2], startend[3], startend[4]) ;
+ //printf("result_length: %d\n", *result_length_ptr) ;
+ return 0;
+}
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# The QPalma project aims at extending the Palma project
-# to be able to use Solexa reads toegether with their
-# quality scores.
-#
-# This file represents the conversion of the main matlab
-# training loop for Palma to python.
-#
-# Author: Fabio De Bona
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-import qpalma
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-from qpalma.generateEvaluationData import *
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.export_param import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.tools.splicesites import getDonAccScores
-
-from qpalma.Configuration import *
-
-class para:
- pass
-
-class QPalma:
- """
- A training method for the QPalma project
- """
-
- def __init__(self):
- self.ARGS = Param()
- self.logfh = open('qpalma.log','w+')
-
- #gen_file= '%s/genome.config' % self.ARGS.basedir
- #ginfo_filename = 'genome_info.pickle'
- #self.genome_info = fetch_genome_info(ginfo_filename)
- #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
- #self.ARGS.train_with_splicesitescoreinformation = False
-
- # Load the whole dataset
- if Conf.mode == 'normal':
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- self.use_quality_scores = False
-
- elif Conf.mode == 'using_quality_scores':
-
- filename = 'real_dset_%s'% 'recent'
- if True:# not os.path.exists(filename):
- Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
-
- end = 1000
- self.Sequences = Sequences[:end]
- self.Exons = Exons[:end]
- self.Ests = Ests[:end]
- self.Qualities = Qualities[:end]
- self.Acceptors = Acceptors[:end]
- self.Donors = Donors[:end]
-
- self.Donors, self.Acceptors = getDonAccScores(Sequences)
- #dset = Dataset()
- else:
- dset = cPickle.load(open(filename))
- Sequences = dset.Sequences
- Acceptors = dset.Acceptors
- Donors = dset.Donors
- Exons = dset.Exons
- Ests = dset.Ests
- Qualities = dset.Qualities
-
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- #Qualities = []
- #for i in range(len(Ests)):
- # Qualities.append([40]*len(Ests[i]))
- self.use_quality_scores = True
- else:
- assert(False)
-
- # number of training instances
- N = len(Sequences)
- self.numExamples = N
- assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
- and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
- self.plog('Number of training examples: %d\n'% N)
- print 'Number of features: %d\n'% Conf.numFeatures
-
- def plog(self,string):
- self.logfh.write(string)
- self.logfh.flush()
-
- def train(self):
- beg = Conf.training_begin
- end = Conf.training_end
-
- Sequences = self.Sequences[beg:end]
- Exons = self.Exons[beg:end]
- Ests = self.Ests[beg:end]
- Qualities = self.Qualities[beg:end]
- Acceptors = self.Acceptors[beg:end]
- Donors = self.Donors[beg:end]
-
- iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
- remove_duplicate_scores = Conf.remove_duplicate_scores
- print_matrix = Conf.print_matrix
- anzpath = Conf.anzpath
-
- # Initialize parameter vector / param = numpy.matlib.rand(126,1)
- param = Conf.fixedParam
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- # delete splicesite-score-information
- #if not self.ARGS.train_with_splicesitescoreinformation:
- # for i in range(len(Acceptors)):
- # if Acceptors[i] > -20:
- # Acceptors[i] = 1
- # if Donors[i] >-20:
- # Donors[i] = 1
-
- # Initialize solver
- if Conf.USE_OPT:
- self.plog('Initializing problem...\n')
- solver = SIQPSolver(Conf.numFeatures,self.numExamples,Conf.C,self.logfh)
-
- # stores the number of alignments done for each example (best path, second-best path etc.)
- num_path = [anzpath]*self.numExamples
- # stores the gap for each example
- gap = [0.0]*self.numExamples
-
- #############################################################################################
- # Training
- #############################################################################################
- self.plog('Starting training...\n')
-
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- numq = Conf.numQualSuppPoints
- totalQualSP = Conf.totalQualSuppPoints
-
- currentPhi = zeros((Conf.numFeatures,1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- iteration_nr = 0
- param_idx = 0
- const_added_ctr = 0
- while True:
- if iteration_nr == iteration_steps:
- break
-
- for exampleIdx in range(self.numExamples):
- if (exampleIdx%100) == 0:
- print 'Current example nr %d' % exampleIdx
-
- dna = Sequences[exampleIdx]
- est = Ests[exampleIdx]
-
- if Conf.mode == 'normal':
- quality = [40]*len(est)
-
- if Conf.mode == 'using_quality_scores':
- quality = Qualities[exampleIdx]
-
- exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
- don_supp = Donors[exampleIdx]
- acc_supp = Acceptors[exampleIdx]
-
- # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- #print 'trueWeights'
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- #pdb.set_trace()
- if Conf.mode == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # num_path[exampleIdx] other alignments
- allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
-
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- #myalign wants the acceptor site on the g of the ag
- acceptor = acceptor[1:]
- acceptor.append(-inf)
-
- # for now we don't use donor/acceptor scores
-
- #donor = [-inf] * len(donor)
- #acceptor = [-inf] * len(donor)
-
- dna = str(dna)
- est = str(est)
- dna_len = len(dna)
- est_len = len(est)
-
- ps = h.convert2SWIG()
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
- #print 'Calling myalign...'
- # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
- currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
-
- #print 'After calling myalign...'
- #print 'Calling getAlignmentResults...'
-
- c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
- c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
- c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
- c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
-
- c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
-
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
- c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
- #print 'After calling getAlignmentResults...'
-
- newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
- newEstAlign = zeros((est_len*num_path[exampleIdx],1))
- newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
- newDPScores = zeros((num_path[exampleIdx],1))
- newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
-
- #print 'newSpliceAlign'
- for i in range(dna_len*num_path[exampleIdx]):
- newSpliceAlign[i] = c_SpliceAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'newEstAlign'
- for i in range(est_len*num_path[exampleIdx]):
- newEstAlign[i] = c_EstAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'weightMatch'
- for i in range(mm_len*num_path[exampleIdx]):
- newWeightMatch[i] = c_WeightMatch[i]
- # print '%f' % (weightMatch[i])
-
- #print 'ViterbiScores'
- for i in range(num_path[exampleIdx]):
- newDPScores[i] = c_DPScores[i]
-
-
- if self.use_quality_scores:
- for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
- newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
- # equals palma up to here
-
- #print "Calling destructors"
- del c_SpliceAlign
- del c_EstAlign
- del c_WeightMatch
- del c_DPScores
- del c_qualityPlifsFeatures
- del currentAlignment
-
- newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
- newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
- # Calculate weights of the respective alignments Note that we are
- # calculating n-best alignments without hamming loss, so we
- # have to keep track which of the n-best alignments correspond to
- # the true one in order not to incorporate a true alignment in the
- # constraints. To keep track of the true and false alignments we
- # define an array true_map with a boolean indicating the
- # equivalence to the true alignment for each decoded alignment.
- true_map = [0]*(num_path[exampleIdx]+1)
- true_map[0] = 1
- path_loss = [0]*(num_path[exampleIdx])
-
- for pathNr in range(num_path[exampleIdx]):
- #print 'decodedWeights'
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
- h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
- acc_supp)
-
- decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
- for qidx in range(Conf.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
-
- #pdb.set_trace()
-
- path_loss[pathNr] = 0
- # sum up positionwise loss between alignments
- for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
- if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
- path_loss[pathNr] += 1
-
- #pdb.set_trace()
-
- # Gewichte in restliche Zeilen der Matrix speichern
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- # Check wether scalar product + loss equals viterbi score
- #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
-
- distinct_scores = False
- if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
- distinct_scores = True
-
- #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
- if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
- pdb.set_trace()
-
- # # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
-
- # the true label sequence should not have a larger score than the maximal one WHYYYYY?
- # this means that all n-best paths are to close to each other
- # we have to extend the n-best search to a (n+1)-best
- if len([elem for elem in true_map if elem == 1]) == len(true_map):
- num_path[exampleIdx] = num_path[exampleIdx]+1
-
- # Choose true and first false alignment for extending A
- firstFalseIdx = -1
- for map_idx,elem in enumerate(true_map):
- if elem == 0:
- firstFalseIdx = map_idx
- break
-
- # if there is at least one useful false alignment add the
- # corresponding constraints to the optimization problem
- if firstFalseIdx != -1:
- trueWeights = allWeights[:,0]
- firstFalseWeights = allWeights[:,firstFalseIdx]
- differenceVector = trueWeights - firstFalseWeights
- #pdb.set_trace()
-
- if Conf.USE_OPT:
- const_added = solver.addConstraint(differenceVector, exampleIdx)
- const_added_ctr += 1
- #
- # end of one example processing
- #
-
- # call solver every nth example //added constraint
- if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
- objValue,w,self.slacks = solver.solve()
-
- print "objValue is %f" % objValue
-
- sum_xis = 0
- for elem in self.slacks:
- sum_xis += elem
-
- for i in range(len(param)):
- param[i] = w[i]
-
- #pdb.set_trace()
- cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
- param_idx += 1
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- #
- # end of one iteration through all examples
- #
- iteration_nr += 1
-
- #
- # end of optimization
- #
- print 'Training completed'
-
- pa = para()
- pa.h = h
- pa.d = d
- pa.a = a
- pa.mmatrix = mmatrix
- pa.qualityPlifs = qualityPlifs
-
- cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
- #cPickle.dump(pa,open('elegans.param','w+'))
- self.logfh.close()
-
-
-
- def predict(self):
- beg = Conf.prediction_begin
- end = Conf.prediction_end
-
- Sequences = self.Sequences[beg:end]
- Exons = self.Exons[beg:end]
- Ests = self.Ests[beg:end]
- Qualities = self.Qualities[beg:end]
- Acceptors = self.Acceptors[beg:end]
- Donors = self.Donors[beg:end]
-
- remove_duplicate_scores = Conf.remove_duplicate_scores
- print_matrix = Conf.print_matrix
- anzpath = Conf.anzpath
-
- param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
- param = load_param(param_filename)
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- #############################################################################################
- # Prediction
- #############################################################################################
- self.plog('Starting prediction...\n')
-
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- numq = Conf.numQualSuppPoints
- totalQualSP = Conf.totalQualSuppPoints
-
- currentPhi = zeros((Conf.numFeatures,1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- total_up_off = []
- total_down_off = []
-
- for exampleIdx in range(self.numExamples):
- dna = Sequences[exampleIdx]
- est = Ests[exampleIdx]
- currentSplitPos = SplitPos[exampleIdx]
-
- if Conf.mode == 'normal':
- quality = [40]*len(est)
-
- if Conf.mode == 'using_quality_scores':
- quality = Qualities[exampleIdx]
-
- exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
- don_supp = Donors[exampleIdx]
- acc_supp = Acceptors[exampleIdx]
-
- if exons[-1,1] > len(dna):
- continue
-
- # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- if Conf.mode == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # 1 other alignments
- allWeights = zeros((Conf.numFeatures,1+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(1+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
-
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- #myalign wants the acceptor site on the g of the ag
- acceptor = acceptor[1:]
- acceptor.append(-inf)
-
- dna = str(dna)
- est = str(est)
- dna_len = len(dna)
- est_len = len(est)
-
- ps = h.convert2SWIG()
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
- # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
- currentAlignment.myalign(1, dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
-
- c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
- c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
- c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
- c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
- c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
- c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
- newSpliceAlign = zeros((dna_len,1))
- newEstAlign = zeros((est_len,1))
- newWeightMatch = zeros((mm_len,1))
- newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
- #print 'newSpliceAlign'
- for i in range(dna_len):
- newSpliceAlign[i] = c_SpliceAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'newEstAlign'
- for i in range(est_len):
- newEstAlign[i] = c_EstAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'weightMatch'
- for i in range(mm_len):
- newWeightMatch[i] = c_WeightMatch[i]
- # print '%f' % (weightMatch[i])
-
- newDPScores = c_DPScores[0]
-
- for i in range(Conf.totalQualSuppPoints):
- newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
- #print "Calling destructors"
- del c_SpliceAlign
- del c_EstAlign
- del c_WeightMatch
- del c_DPScores
- del c_qualityPlifsFeatures
- del currentAlignment
-
- newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
- newWeightMatch = newWeightMatch.reshape(1,mm_len)
- # Calculate weights of the respective alignments Note that we are
- # calculating n-best alignments without hamming loss, so we
- # have to keep track which of the n-best alignments correspond to
- # the true one in order not to incorporate a true alignment in the
- # constraints. To keep track of the true and false alignments we
- # define an array true_map with a boolean indicating the
- # equivalence to the true alignment for each decoded alignment.
- true_map = [0]*2
- true_map[0] = 1
- pathNr = 0
-
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
- decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
- for qidx in range(Conf.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
- # Gewichte in restliche Zeilen der Matrix speichern
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #print up_off,down_off
-
- if up_off > -1:
- total_up_off.append(up_off)
- total_down_off.append(down_off)
-
- total_up = 0
- total_down = 0
- for idx in range(len(total_up_off)):
- total_up += total_up_off[idx]
- total_down += total_down_off[idx]
-
- total_up /= len(total_up_off)
- total_down /= len(total_down_off)
-
- print 'Mean up_off is %f' % total_up
- print 'Mean down_off is %f' % total_down
- print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
- print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
-
- print 'Prediction completed'
- self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
- if not os.path.exists(ginfo_filename):
- cmd = ['']*4
- cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
- cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
- cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
- cmd[3] = 'save genome_info.mat genome_info'
- full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
- obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- assert err == '', 'An error occured!\n%s'%err
-
- ginfo = scipy.io.loadmat('genome_info.mat')
- cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
- return ginfo['genome_info']
-
- else:
- return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
-
-
- param = zeros((Conf.numFeatures,1))
- param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- for idx in range(len(qualityPlifs)):
- currentPlif = qualityPlifs[idx]
- begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
- end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
- param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
- return param
-
-def load_param(filename):
- param = None
- #try:
- # para = cPickle.load(open(filename))
- #except:
- # print 'Error: Could not open parameter file!'
- # sys.exit(1)
- #
- #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
- param = cPickle.load(open(filename))
- return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
- newExons = []
- oldElem = -1
- SpliceAlign = SpliceAlign.flatten().tolist()[0]
- SpliceAlign.append(-1)
- for pos,elem in enumerate(SpliceAlign):
- if pos == 0:
- oldElem = -1
- else:
- oldElem = SpliceAlign[pos-1]
-
- if oldElem != 0 and elem == 0: # start of exon
- newExons.append(pos-1)
-
- if oldElem == 0 and elem != 0: # end of exon
- newExons.append(pos)
-
- up_off = -1
- down_off = -1
-
- if len(newExons) != 4:
- acc = 0.0
- else:
- e1_begin,e1_end = newExons[0],newExons[1]
- e2_begin,e2_end = newExons[2],newExons[3]
-
- up_off = int(math.fabs(e1_end - exons[0,1]))
- down_off = int(math.fabs(e2_begin - exons[1,0]))
-
- return up_off,down_off
-
-
-if __name__ == '__main__':
-
- qpalma = QPalma()
- if len(sys.argv) != 2:
- print 'You have to choose between training or prediction mode:'
- print 'python qpalma. py (train|predict)'
-
- mode = sys.argv[1]
- assert mode in ['train','predict']
-
- if mode == 'train':
- qpalma.train()
-
- if mode == 'predict':
- qpalma.predict()
-
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+# The QPalma project aims at extending the Palma project
+# to be able to use Solexa reads toegether with their
+# quality scores.
+#
+# This file represents the conversion of the main matlab
+# training loop for Palma to python.
+#
+# Author: Fabio De Bona
+#
+###########################################################
+
+import sys
+import subprocess
+import scipy.io
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+import QPalmaDP
+import qpalma
+from qpalma.SIQP_CPX import SIQPSolver
+from qpalma.DataProc import *
+from qpalma.generateEvaluationData import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.export_param import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf
+
+from qpalma.tools.splicesites import getDonAccScores
+
+from qpalma.Configuration import *
+
+class para:
+ pass
+
+class QPalma:
+ """
+ A training method for the QPalma project
+ """
+
+ def __init__(self):
+ self.ARGS = Param()
+ self.logfh = open('_qpalma.log','w+')
+
+ #gen_file= '%s/genome.config' % self.ARGS.basedir
+ #ginfo_filename = 'genome_info.pickle'
+ #self.genome_info = fetch_genome_info(ginfo_filename)
+ #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+
+ #self.ARGS.train_with_splicesitescoreinformation = False
+
+ # Load the whole dataset
+ if Conf.mode == 'normal':
+ #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ self.use_quality_scores = False
+
+ elif Conf.mode == 'using_quality_scores':
+
+ filename = 'real_dset_%s'% 'recent'
+ if True:# not os.path.exists(filename):
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+
+ end = 1000
+ self.Sequences = Sequences[:end]
+ self.Exons = Exons[:end]
+ self.Ests = Ests[:end]
+ self.Qualities = Qualities[:end]
+ self.Acceptors = Acceptors[:end]
+ self.Donors = Donors[:end]
+
+ self.Donors, self.Acceptors = getDonAccScores(Sequences)
+ #dset = Dataset()
+ else:
+ dset = cPickle.load(open(filename))
+ Sequences = dset.Sequences
+ Acceptors = dset.Acceptors
+ Donors = dset.Donors
+ Exons = dset.Exons
+ Ests = dset.Ests
+ Qualities = dset.Qualities
+
+ #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+ #Qualities = []
+ #for i in range(len(Ests)):
+ # Qualities.append([40]*len(Ests[i]))
+ self.use_quality_scores = True
+ else:
+ assert(False)
+
+ # number of training instances
+ N = len(Sequences)
+ self.numExamples = N
+ assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
+ and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+ self.plog('Number of training examples: %d\n'% N)
+ print 'Number of features: %d\n'% Conf.numFeatures
+
+ def plog(self,string):
+ self.logfh.write(string)
+ self.logfh.flush()
+
+ def train(self):
+ beg = Conf.training_begin
+ end = Conf.training_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
+ iteration_steps = Conf.iter_steps ; #upper bound on iteration steps
+ remove_duplicate_scores = Conf.remove_duplicate_scores
+ print_matrix = Conf.print_matrix
+ anzpath = Conf.anzpath
+
+ # Initialize parameter vector / param = numpy.matlib.rand(126,1)
+ param = Conf.fixedParam
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ # delete splicesite-score-information
+ #if not self.ARGS.train_with_splicesitescoreinformation:
+ # for i in range(len(Acceptors)):
+ # if Acceptors[i] > -20:
+ # Acceptors[i] = 1
+ # if Donors[i] >-20:
+ # Donors[i] = 1
+
+ # Initialize solver
+ if Conf.USE_OPT:
+ self.plog('Initializing problem...\n')
+ solver = SIQPSolver(Conf.numFeatures,self.numExamples,Conf.C,self.logfh)
+
+ # stores the number of alignments done for each example (best path, second-best path etc.)
+ num_path = [anzpath]*self.numExamples
+ # stores the gap for each example
+ gap = [0.0]*self.numExamples
+
+ #############################################################################################
+ # Training
+ #############################################################################################
+ self.plog('Starting training...\n')
+
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ numq = Conf.numQualSuppPoints
+ totalQualSP = Conf.totalQualSuppPoints
+
+ currentPhi = zeros((Conf.numFeatures,1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ iteration_nr = 0
+ param_idx = 0
+ const_added_ctr = 0
+ while True:
+ if iteration_nr == iteration_steps:
+ break
+
+ for exampleIdx in range(self.numExamples):
+ if (exampleIdx%100) == 0:
+ print 'Current example nr %d' % exampleIdx
+
+ dna = Sequences[exampleIdx]
+ est = Ests[exampleIdx]
+
+ if Conf.mode == 'normal':
+ quality = [40]*len(est)
+
+ if Conf.mode == 'using_quality_scores':
+ quality = Qualities[exampleIdx]
+
+ exons = Exons[exampleIdx]
+ # NoiseMatrix = Noises[exampleIdx]
+ don_supp = Donors[exampleIdx]
+ acc_supp = Acceptors[exampleIdx]
+
+ # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+ #print 'trueWeights'
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ #pdb.set_trace()
+ if Conf.mode == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # num_path[exampleIdx] other alignments
+ allWeights = zeros((Conf.numFeatures,num_path[exampleIdx]+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ acceptor = acceptor[1:]
+ acceptor.append(-inf)
+
+ # for now we don't use donor/acceptor scores
+
+ #donor = [-inf] * len(donor)
+ #acceptor = [-inf] * len(donor)
+
+ dna = str(dna)
+ est = str(est)
+ dna_len = len(dna)
+ est_len = len(est)
+
+ ps = h.convert2SWIG()
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+ #print 'Calling myalign...'
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+ currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+ print_matrix)
+
+ #print 'After calling myalign...'
+ #print 'Calling getAlignmentResults...'
+
+ c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+ c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+ c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+ c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+ c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints*num_path[exampleIdx]))
+
+ currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+ #print 'After calling getAlignmentResults...'
+
+ newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+ newEstAlign = zeros((est_len*num_path[exampleIdx],1))
+ newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+ newDPScores = zeros((num_path[exampleIdx],1))
+ newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints*num_path[exampleIdx],1))
+
+ #print 'newSpliceAlign'
+ for i in range(dna_len*num_path[exampleIdx]):
+ newSpliceAlign[i] = c_SpliceAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'newEstAlign'
+ for i in range(est_len*num_path[exampleIdx]):
+ newEstAlign[i] = c_EstAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'weightMatch'
+ for i in range(mm_len*num_path[exampleIdx]):
+ newWeightMatch[i] = c_WeightMatch[i]
+ # print '%f' % (weightMatch[i])
+
+ #print 'ViterbiScores'
+ for i in range(num_path[exampleIdx]):
+ newDPScores[i] = c_DPScores[i]
+
+
+ if self.use_quality_scores:
+ for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
+ newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+ # equals palma up to here
+
+ #print "Calling destructors"
+ del c_SpliceAlign
+ del c_EstAlign
+ del c_WeightMatch
+ del c_DPScores
+ del c_qualityPlifsFeatures
+ del currentAlignment
+
+ newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+ newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*(num_path[exampleIdx]+1)
+ true_map[0] = 1
+ path_loss = [0]*(num_path[exampleIdx])
+
+ for pathNr in range(num_path[exampleIdx]):
+ #print 'decodedWeights'
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+ h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+ acc_supp)
+
+ decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+ for qidx in range(Conf.totalQualSuppPoints):
+ decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Conf.totalQualSuppPoints)+qidx]
+
+ #pdb.set_trace()
+
+ path_loss[pathNr] = 0
+ # sum up positionwise loss between alignments
+ for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
+ if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+ path_loss[pathNr] += 1
+
+ #pdb.set_trace()
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
+ allWeights[:,pathNr+1] = wp
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ # Check wether scalar product + loss equals viterbi score
+ #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
+
+ distinct_scores = False
+ if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+ distinct_scores = True
+
+ #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
+ if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+ pdb.set_trace()
+
+ # # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+
+ # the true label sequence should not have a larger score than the maximal one WHYYYYY?
+ # this means that all n-best paths are to close to each other
+ # we have to extend the n-best search to a (n+1)-best
+ if len([elem for elem in true_map if elem == 1]) == len(true_map):
+ num_path[exampleIdx] = num_path[exampleIdx]+1
+
+ # Choose true and first false alignment for extending A
+ firstFalseIdx = -1
+ for map_idx,elem in enumerate(true_map):
+ if elem == 0:
+ firstFalseIdx = map_idx
+ break
+
+ # if there is at least one useful false alignment add the
+ # corresponding constraints to the optimization problem
+ if firstFalseIdx != -1:
+ trueWeights = allWeights[:,0]
+ firstFalseWeights = allWeights[:,firstFalseIdx]
+ differenceVector = trueWeights - firstFalseWeights
+ #pdb.set_trace()
+
+ if Conf.USE_OPT:
+ const_added = solver.addConstraint(differenceVector, exampleIdx)
+ const_added_ctr += 1
+ #
+ # end of one example processing
+ #
+
+ # call solver every nth example //added constraint
+ if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
+ objValue,w,self.slacks = solver.solve()
+
+ print "objValue is %f" % objValue
+
+ sum_xis = 0
+ for elem in self.slacks:
+ sum_xis += elem
+
+ for i in range(len(param)):
+ param[i] = w[i]
+
+ #pdb.set_trace()
+ cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+ param_idx += 1
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ #
+ # end of one iteration through all examples
+ #
+ iteration_nr += 1
+
+ #
+ # end of optimization
+ #
+ print 'Training completed'
+
+ pa = para()
+ pa.h = h
+ pa.d = d
+ pa.a = a
+ pa.mmatrix = mmatrix
+ pa.qualityPlifs = qualityPlifs
+
+ cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+ #cPickle.dump(pa,open('elegans.param','w+'))
+ self.logfh.close()
+
+
+
+ def predict(self):
+ beg = Conf.prediction_begin
+ end = Conf.prediction_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
+ remove_duplicate_scores = Conf.remove_duplicate_scores
+ print_matrix = Conf.print_matrix
+ anzpath = Conf.anzpath
+
+ param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
+ param = load_param(param_filename)
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ #############################################################################################
+ # Prediction
+ #############################################################################################
+ self.plog('Starting prediction...\n')
+
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ numq = Conf.numQualSuppPoints
+ totalQualSP = Conf.totalQualSuppPoints
+
+ currentPhi = zeros((Conf.numFeatures,1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ total_up_off = []
+ total_down_off = []
+
+ for exampleIdx in range(self.numExamples):
+ dna = Sequences[exampleIdx]
+ est = Ests[exampleIdx]
+ currentSplitPos = SplitPos[exampleIdx]
+
+ if Conf.mode == 'normal':
+ quality = [40]*len(est)
+
+ if Conf.mode == 'using_quality_scores':
+ quality = Qualities[exampleIdx]
+
+ exons = Exons[exampleIdx]
+ # NoiseMatrix = Noises[exampleIdx]
+ don_supp = Donors[exampleIdx]
+ acc_supp = Acceptors[exampleIdx]
+
+ if exons[-1,1] > len(dna):
+ continue
+
+ # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ if Conf.mode == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # 1 other alignments
+ allWeights = zeros((Conf.numFeatures,1+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(1+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ acceptor = acceptor[1:]
+ acceptor.append(-inf)
+
+ dna = str(dna)
+ est = str(est)
+ dna_len = len(dna)
+ est_len = len(est)
+
+ ps = h.convert2SWIG()
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+ currentAlignment.myalign(1, dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+ print_matrix)
+
+ c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+ c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+ c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+ c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+ c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+ currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+ newSpliceAlign = zeros((dna_len,1))
+ newEstAlign = zeros((est_len,1))
+ newWeightMatch = zeros((mm_len,1))
+ newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+ #print 'newSpliceAlign'
+ for i in range(dna_len):
+ newSpliceAlign[i] = c_SpliceAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'newEstAlign'
+ for i in range(est_len):
+ newEstAlign[i] = c_EstAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'weightMatch'
+ for i in range(mm_len):
+ newWeightMatch[i] = c_WeightMatch[i]
+ # print '%f' % (weightMatch[i])
+
+ newDPScores = c_DPScores[0]
+
+ for i in range(Conf.totalQualSuppPoints):
+ newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+ #print "Calling destructors"
+ del c_SpliceAlign
+ del c_EstAlign
+ del c_WeightMatch
+ del c_DPScores
+ del c_qualityPlifsFeatures
+ del currentAlignment
+
+ newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+ newWeightMatch = newWeightMatch.reshape(1,mm_len)
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*2
+ true_map[0] = 1
+ pathNr = 0
+
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+ decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+ for qidx in range(Conf.totalQualSuppPoints):
+ decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+ allWeights[:,pathNr+1] = wp
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #print up_off,down_off
+
+ if up_off > -1:
+ total_up_off.append(up_off)
+ total_down_off.append(down_off)
+
+ total_up = 0
+ total_down = 0
+ for idx in range(len(total_up_off)):
+ total_up += total_up_off[idx]
+ total_down += total_down_off[idx]
+
+ total_up /= len(total_up_off)
+ total_down /= len(total_down_off)
+
+ print 'Mean up_off is %f' % total_up
+ print 'Mean down_off is %f' % total_down
+ print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
+ print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+
+ print 'Prediction completed'
+ self.logfh.close()
+
+def fetch_genome_info(ginfo_filename):
+ if not os.path.exists(ginfo_filename):
+ cmd = ['']*4
+ cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+ cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+ cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+ cmd[3] = 'save genome_info.mat genome_info'
+ full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+ obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out,err = obj.communicate()
+ assert err == '', 'An error occured!\n%s'%err
+
+ ginfo = scipy.io.loadmat('genome_info.mat')
+ cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+ return ginfo['genome_info']
+
+ else:
+ return cPickle.load(open(ginfo_filename))
+
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+
+
+ param = zeros((Conf.numFeatures,1))
+ param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ for idx in range(len(qualityPlifs)):
+ currentPlif = qualityPlifs[idx]
+ begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+ end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+ param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+ return param
+
+def load_param(filename):
+ param = None
+ #try:
+ # para = cPickle.load(open(filename))
+ #except:
+ # print 'Error: Could not open parameter file!'
+ # sys.exit(1)
+ #
+ #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+ param = cPickle.load(open(filename))
+ return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
+ newExons = []
+ oldElem = -1
+ SpliceAlign = SpliceAlign.flatten().tolist()[0]
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos-1)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ up_off = -1
+ down_off = -1
+
+ if len(newExons) != 4:
+ acc = 0.0
+ else:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
+
+ up_off = int(math.fabs(e1_end - exons[0,1]))
+ down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+ return up_off,down_off
+
+
+if __name__ == '__main__':
+
+ qpalma = QPalma()
+ if len(sys.argv) != 2:
+ print 'You have to choose between training or prediction mode:'
+ print 'python qpalma. py (train|predict)'
+
+ mode = sys.argv[1]
+ assert mode in ['train','predict']
+
+ if mode == 'train':
+ qpalma.train()
+
+ if mode == 'predict':
+ qpalma.predict()
+