From 2e7d55a1ae0e0d12a40f30ed7ee4ec725475e55c Mon Sep 17 00:00:00 2001 From: fabio Date: Fri, 30 Nov 2007 15:53:57 +0000 Subject: [PATCH] + added c code from the palma project + added own version of the swig interface TODO - think about wrapper for structs / cells user either SWIG constrcuts or native python object git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@6902 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- QPalmaDP/Makefile | 24 ++ QPalmaDP/Mathmatics.cpp | 349 +++++++++++++++++++++ QPalmaDP/Mathmatics.h | 546 +++++++++++++++++++++++++++++++++ QPalmaDP/QPalmaDP.i | 46 +++ QPalmaDP/common.h | 160 ++++++++++ QPalmaDP/config.h | 19 ++ QPalmaDP/fill_matrix.cpp | 421 +++++++++++++++++++++++++ QPalmaDP/fill_matrix.h | 51 +++ QPalmaDP/io.cpp | 96 ++++++ QPalmaDP/io.h | 33 ++ QPalmaDP/penalty_info.cpp | 354 +++++++++++++++++++++ QPalmaDP/penalty_info.h | 42 +++ QPalmaDP/print_align.cpp | 97 ++++++ QPalmaDP/qpalma_dp.cpp | 321 +++++++++++++++++++ QPalmaDP/qpalma_dp.h | 83 +++++ QPalmaDP/result_align.cpp | 156 ++++++++++ python/.qpalma.py.swp | Bin 53248 -> 57344 bytes python/computeSpliceWeights.py | 64 ++-- python/qpalma.py | 49 +-- 19 files changed, 2858 insertions(+), 53 deletions(-) create mode 100644 QPalmaDP/Makefile create mode 100644 QPalmaDP/Mathmatics.cpp create mode 100644 QPalmaDP/Mathmatics.h create mode 100644 QPalmaDP/QPalmaDP.i create mode 100644 QPalmaDP/common.h create mode 100644 QPalmaDP/config.h create mode 100644 QPalmaDP/fill_matrix.cpp create mode 100644 QPalmaDP/fill_matrix.h create mode 100644 QPalmaDP/io.cpp create mode 100644 QPalmaDP/io.h create mode 100644 QPalmaDP/penalty_info.cpp create mode 100644 QPalmaDP/penalty_info.h create mode 100644 QPalmaDP/print_align.cpp create mode 100644 QPalmaDP/qpalma_dp.cpp create mode 100644 QPalmaDP/qpalma_dp.h create mode 100644 QPalmaDP/result_align.cpp diff --git a/QPalmaDP/Makefile b/QPalmaDP/Makefile new file mode 100644 index 0000000..42574ea --- /dev/null +++ b/QPalmaDP/Makefile @@ -0,0 +1,24 @@ +SRCS= Mathmatics.cpp\ + fill_matrix.cpp\ + result_align.cpp\ + penalty_info.cpp\ + print_align.cpp\ + io.cpp + +OBJS = $(SRCS:%.cpp=%.o) + +#CXXFLAGS=-O3 -fPIC +#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs +CXXFLAGS=-O3 -fPIC -ggdb + +IF=QPalmaDP + +all: $(OBJS) + swig -c++ -python ${IF}.i + g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o + g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so + python -c "import ${IF}" + +clean: + rm *.o *.so *.cxx ${IF}.py ${IF}.pyc + diff --git a/QPalmaDP/Mathmatics.cpp b/QPalmaDP/Mathmatics.cpp new file mode 100644 index 0000000..61298da --- /dev/null +++ b/QPalmaDP/Mathmatics.cpp @@ -0,0 +1,349 @@ +// Math.cpp: implementation of the CMath class. +// +////////////////////////////////////////////////////////////////////// + + +#include "Mathmatics.h" +#include "io.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +#ifdef USE_LOGCACHE +//gene/math specific constants +#ifdef USE_HMMDEBUG +#define MAX_LOG_TABLE_SIZE 10*1024*1024 +#define LOG_TABLE_PRECISION 1e-6 +#else +#define MAX_LOG_TABLE_SIZE 123*1024*1024 +#define LOG_TABLE_PRECISION 1e-15 +#endif + +INT CMath::LOGACCURACY = 0; // 100000 steps per integer +#endif + +INT CMath::LOGRANGE = 0; // range for logtable: log(1+exp(x)) -25 <= x <= 0 + +#ifdef USE_PATHDEBUG +const REAL CMath::INFTY = 1e11; // infinity +#else +const REAL CMath::INFTY = -log(0.0); // infinity +#endif +const REAL CMath::ALMOST_NEG_INFTY = -1000; + +CHAR CMath::rand_state[256]; + +CMath::CMath() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + UINT seed=(UINT) (4223517*getpid()*tv.tv_sec*tv.tv_usec); + initstate(seed, CMath::rand_state, sizeof(CMath::rand_state)); + CIO::message(M_INFO, "seeding random number generator with %u\n", seed); + +#ifdef USE_LOGCACHE + LOGRANGE=CMath::determine_logrange(); + LOGACCURACY=CMath::determine_logaccuracy(LOGRANGE); + CIO::message(M_INFO, "Initializing log-table (size=%i*%i*%i=%2.1fMB) ...",LOGRANGE,LOGACCURACY,sizeof(REAL),LOGRANGE*LOGACCURACY*sizeof(REAL)/(1024.0*1024.0)) ; + + CMath::logtable=new REAL[LOGRANGE*LOGACCURACY]; + init_log_table(); + CIO::message(M_INFO, "Done.\n") ; +#else + INT i=0; + while ((REAL)log(1+((REAL)exp(-REAL(i))))) + i++; + CIO::message(M_INFO, "determined range for x in log(1+exp(-x)) is:%d\n", i); + LOGRANGE=i; +#endif +} + +CMath::~CMath() +{ +#ifdef USE_LOGCACHE + delete[] logtable; +#endif +} + +#ifdef USE_LOGCACHE +INT CMath::determine_logrange() +{ + INT i; + REAL acc=0; + for (i=0; i<50; i++) + { + acc=((REAL)log(1+((REAL)exp(-REAL(i))))); + if (acc<=(REAL)LOG_TABLE_PRECISION) + break; + } + + CIO::message(M_INFO, "determined range for x in table log(1+exp(-x)) is:%d (error:%G)\n",i,acc); + return i; +} + +INT CMath::determine_logaccuracy(INT range) +{ + range=MAX_LOG_TABLE_SIZE/range/((int)sizeof(REAL)); + CIO::message(M_INFO, "determined accuracy for x in table log(1+exp(-x)) is:%d (error:%G)\n",range,1.0/(double) range); + return range; +} + +//init log table of form log(1+exp(x)) +void CMath::init_log_table() +{ + for (INT i=0; i< LOGACCURACY*LOGRANGE; i++) + logtable[i]=log(1+exp(REAL(-i)/REAL(LOGACCURACY))); +} +#endif + +void CMath::sort(INT *a, INT cols, INT sort_col) +{ + INT changed=1; + if (a[0]==-1) return ; + while (changed) + { + changed=0; INT i=0 ; + while ((a[(i+1)*cols]!=-1) && (a[(i+1)*cols+1]!=-1)) // to be sure + { + if (a[i*cols+sort_col]>a[(i+1)*cols+sort_col]) + { + for (INT j=0; ja[i+1]) + { + swap(a[i],a[i+1]) ; + swap(idx[i],idx[i+1]) ; + changed=1 ; + } ; + } ; + } ; + +} + + + +//plot x- axis false positives (fp) 1-Specificity +//plot y- axis true positives (tp) Sensitivity +INT CMath::calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile) +{ + INT left=0; + INT right=size-1; + INT i; + + for (i=0; i 0) && (left0) + maximum=max(maximum,negout[negsize-1]); + if (possize>0) + maximum=max(maximum,posout[possize-1]); + + REAL treshhold=minimum; + REAL old_treshhold=treshhold; + + //clear array. + for (i=0; i=possize && negidx=negsize && posidx negsize*fp[iteration]/size+(1-tp[iteration])*possize/size ) + { + minerr=negsize*fp[iteration]/size+(1-tp[iteration])*possize/size; + tresh=(old_treshhold+treshhold)/2; + returnidx=iteration; + } + + iteration++; + } + + // set new size + size=iteration; + + if (rocfile) + { + const CHAR id[]="ROC"; + fwrite(id, sizeof(char), sizeof(id), rocfile); + fwrite(fp, sizeof(REAL), size, rocfile); + fwrite(tp, sizeof(REAL), size, rocfile); + } + + return returnidx; +} + +UINT CMath::crc32(BYTE *data, INT len) +{ + UINT result; + INT i,j; + BYTE octet; + + result = 0-1; + + for (i=0; i> 7) ^ (result >> 31)) + { + result = (result << 1) ^ 0x04c11db7; + } + else + { + result = (result << 1); + } + octet <<= 1; + } + } + + return ~result; +} + +double CMath::mutual_info(REAL* p1, REAL* p2, INT len) +{ + double e=0; + + for (INT i=0; i +#include + +//define finite for win32 +#ifdef _WIN32 +#include +#define finite _finite +#define isnan _isnan +#endif + +#ifndef NAN +#include +#define NAN (strtod("NAN",NULL)) +#endif + +#ifdef SUNOS +extern "C" int finite(double); +#endif + +/** Mathematical Functions. + * Class which collects generic mathematical functions + */ +class CMath +{ +public: + /**@name Constructor/Destructor. + */ + //@{ + ///Constructor - initializes log-table + CMath(); + + ///Destructor - frees logtable + virtual ~CMath(); + //@} + + /**@name min/max/abs functions. + */ + //@{ + + ///return the minimum of two integers + // + template + static inline T min(T a, T b) + { + return ((a)<=(b))?(a):(b); + } + + ///return the maximum of two integers + template + static inline T max(T a, T b) + { + return ((a)>=(b))?(a):(b); + } + + ///return the maximum of two integers + template + static inline T abs(T a) + { + return ((a)>=(0))?(a):(-a); + } + //@} + + /**@name misc functions */ + //@{ + + /// crc32 + static UINT crc32(BYTE *data, INT len); + + static inline REAL round(REAL d) + { + return floor(d+0.5); + } + + /// signum of type T variable a + template + static inline REAL sign(REAL a) + { + if (a==0) + return 0; + else return (a<0) ? (-1) : (+1); + } + + /// swap floats a and b + template + static inline void swap(T & a,T &b) + { + T c=a ; + a=b; b=c ; + } + + /// x^2 + template + static inline T sq(T x) + { + return x*x; + } + + static inline LONG factorial(INT n) + { + LONG res=1 ; + for (int i=2; i<=n; i++) + res*=i ; + return res ; + } + + static inline LONG nchoosek(INT n, INT k) + { + long res=1 ; + + for (INT i=n-k+1; i<=n; i++) + res*=i ; + + return res/factorial(k) ; + } + + /** performs a bubblesort on a given matrix a. + * it is sorted from in ascending order from top to bottom + * and left to right */ + static void sort(INT *a, INT cols, INT sort_col=0) ; + static void sort(REAL *a, INT*idx, INT N) ; + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending (for type T) */ + template + static void qsort(T* output, INT size) ; + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending order + * (for type T1) and returns the index (type T2) + * matlab alike [sorted,index]=sort(output) + */ + template + static void qsort(T1* output, T2* index, INT size); + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending order + * (for type T1) and returns the index (type T2) + * matlab alike [sorted,index]=sort(output) + */ + template + static void qsort_backward(T1* output, T2* index, INT size); + + /* finds the smallest element in output and puts that element as the + first element */ + template + static void min(REAL* output, T* index, INT size); + + /* finds the n smallest elements in output and puts these elements as the + first n elements */ + template + static void nmin(REAL* output, T* index, INT size, INT n); + + /* performs a inplace unique of a vector of type T using quicksort + * returns the new number of elements */ + template + static INT unique(T* output, INT size) + { + qsort(output, size) ; + INT i,j=0 ; + for (i=0; ielem || output[end]elem) + { + end = middle ; + middle=start+(end-start)/2 ; + } ; + if (output[middle]elem) + return -1 ; + if (output[end]<=elem) + return size-1 ; + + while (1) + { + if (output[middle]>elem) + { + end = middle ; + middle=start+(end-start)/2 ; + } ; + if (output[middle]=elem) + return middle ; + if (end-start<=1) + { + if (output[start]<=elem && output[start+1]>=elem) + return start ; + return end ; + } + } + } + + /** calculates ROC into (fp,tp) + * from output and label of length size + * returns index with smallest error=fp+fn + */ + static INT calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile); + //@} + + /// returns the mutual information of p which is given in logspace + /// where p,q are given in logspace + static double mutual_info(REAL* p1, REAL* p2, INT len); + + /// returns the relative entropy H(P||Q), + /// where p,q are given in logspace + static double relative_entropy(REAL* p, REAL* q, INT len); + + /// returns entropy of p which is given in logspace + static double entropy(REAL* p, INT len); + + /**@name summing functions */ + //@{ + /** sum logarithmic probabilities. + * Probability measures are summed up but are now given in logspace where + * direct summation of exp(operand) is not possible due to numerical problems, i.e. eg. exp(-1000)=0. Therefore + * we do log( exp(a) + exp(b)) = a + log (1 + exp (b-a)) where a is max(p,q) and b min(p,q). */ +#ifdef USE_LOGCACHE + static inline REAL logarithmic_sum(REAL p, REAL q) + { + if (finite(p)) + { + if (finite(q)) + { + + register REAL diff=p-q; + + if (diff>0) //p>q + { + if (diff > LOGRANGE) + return p; + else + return p + logtable[(int)(diff*LOGACCURACY)]; + } + else //p<=q + { + if (-diff > LOGRANGE) + return q; + else + return q + logtable[(int)(-diff*LOGACCURACY)]; + } + } + CIO::message("INVALID second operand to logsum(%f,%f) expect undefined results\n", p, q); + return NAN; + } + else + return q; + } + + ///init log table of form log(1+exp(x)) + static void init_log_table(); + + /// determine INT x for that log(1+exp(-x)) == 0 + static INT determine_logrange(); + + /// determine accuracy, such that the thing fits into MAX_LOG_TABLE_SIZE, needs logrange as argument + static INT determine_logaccuracy(INT range); +#else + /* + inline REAL logarithmic_sum(REAL p, REAL q) + { + double result=comp_logarithmic_sum(p,q); + + printf("diff:%f <-> %f\n",p-q, result); + return result; + }*/ + + static inline REAL logarithmic_sum(REAL p, REAL q) + { + if (finite(p)) + { + if (finite(q)) + { + + register REAL diff=p-q; + if (diff>0) //p>q + { + if (diff > LOGRANGE) + return p; + else + return p + log(1+exp(-diff)); + } + else //p<=q + { + if (-diff > LOGRANGE) + return q; + else + return q + log(1+exp(diff)); + } + } + return p; + } + else + return q; + } +#endif +#ifdef LOG_SUM_ARRAY + /** sum up a whole array of values in logspace. + * This function addresses the numeric instabiliy caused by simply summing up N elements by adding + * each of the elements to some variable. Instead array neighbours are summed up until one element remains. + * Whilst the number of additions remains the same, the error is only in the order of log(N) instead N. + */ + static inline REAL logarithmic_sum_array(REAL *p, INT len) + { + if (len<=2) + { + if (len==2) + return logarithmic_sum(p[0],p[1]) ; + if (len==1) + return p[0]; + return -INFTY ; + } + else + { + register REAL *pp=p ; + if (len%2==1) pp++ ; + for (register INT j=0; j < len>>1; j++) + pp[j]=logarithmic_sum(pp[j<<1], pp[1+(j<<1)]) ; + } + return logarithmic_sum_array(p,len%2+len>>1) ; + } +#endif + //@} +public: + /**@name constants*/ + //@{ + /// infinity + static const REAL INFTY; + + /// almost neg (log) infinity + static const REAL ALMOST_NEG_INFTY; + + /// range for logtable: log(1+exp(x)) -LOGRANGE <= x <= 0 + static INT LOGRANGE; + +#ifdef USE_LOGCACHE + /// number of steps per integer + static INT LOGACCURACY; + //@} +protected: + ///table with log-values + static REAL* logtable; +#endif + static CHAR rand_state[256]; +}; + + +//implementations of template functions +template +void CMath::qsort(T* output, INT size) +{ + if (size==2) + { + if (output[0] > output [1]) + swap(output[0],output[1]); + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] < split) + left++; + while (output[right] > split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,right+1); + + if (size-left> 1) + qsort(&output[left],size-left); + } +} + +template +void CMath::qsort(T1* output, T2* index, INT size) +{ + if (size==2) + { + if (output[0] > output [1]){ + swap(output[0],output[1]); + swap(index[0],index[1]); + } + + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] < split) + left++; + while (output[right] > split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + swap(index[left],index[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,index,right+1); + + if (size-left> 1) + qsort(&output[left],&index[left], size-left); + } +} + +template +void CMath::qsort_backward(T1* output, T2* index, INT size) +{ + if (size==2) + { + if (output[0] < output [1]){ + swap(output[0],output[1]); + swap(index[0],index[1]); + } + + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] > split) + left++; + while (output[right] < split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + swap(index[left],index[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,index,right+1); + + if (size-left> 1) + qsort(&output[left],&index[left], size-left); + } +} + +template +void CMath::nmin(REAL* output, T* index, INT size, INT n) +{ + if (6*n*size<13*size*log((double)size)) + for (INT i=0; i +void CMath::min(REAL* output, T* index, INT size) +{ + if (size<=0) + return ; + REAL min_elem = output[0] ; + INT min_index = 0 ; + for (INT i=1; i +#include +#include "config.h" + +#ifdef SUNOS +#define bool int +#define false 0 +#define true 1 +#endif + +#ifndef LINUX +#define RANDOM_MAX 2147483647 +#else +#define RANDOM_MAX RAND_MAX +#endif + +/**@name Standard Types + * Definition of Platform independent Types +*/ +//@{ + +/// Type CHAR +typedef char CHAR; +typedef CHAR* P_CHAR; + +/// Type BYTE +typedef unsigned char BYTE; +typedef BYTE* P_BYTE; + +/// Type SHORT is 2 bytes in size +typedef short int SHORT; +typedef SHORT* P_SHORT; + +/// Type WORD is 2 bytes in size +typedef unsigned short int WORD; +typedef WORD* P_WORD; + +/// Type INT is 4 bytes in size +typedef int INT; +typedef INT* P_INT; + +/// Type INT is 4 bytes in size +typedef unsigned int UINT; +typedef UINT* P_UINT; + +/// Type LONG is 8 bytes in size +typedef long LONG; +typedef LONG* P_LONG; + +/// Type SHORTREAL is 4 bytes in size +typedef float SHORTREAL; +typedef SHORTREAL* P_SHORTREAL; + +/// Type REAL is 8 bytes in size +typedef double REAL; +typedef REAL* P_REAL; + +/// Type LONGREAL is 16 bytes in size +//typedef long double LONGREAL; +//typedef LONGREAL* P_LONGREAL; + +#ifdef USE_SHORTREAL_KERNELCACHE + typedef SHORTREAL KERNELCACHE_ELEM; +#else + typedef REAL KERNELCACHE_ELEM; +#endif + +typedef KERNELCACHE_ELEM P_KERNELCACHE_ELEM; + +typedef LONG KERNELCACHE_IDX; + +/// The io libs output [DEBUG] etc in front of every CIO::message +/// 'higher' messages filter output depending on the loglevel, i.e. CRITICAL messages +/// will print all M_CRITICAL TO M_EMERGENCY messages to +enum EMessageType +{ + M_DEBUG, + M_INFO, + M_NOTICE, + M_WARN, + M_ERROR, + M_CRITICAL, + M_ALERT, + M_EMERGENCY, + M_MESSAGEONLY, + M_PROGRESS +}; + +enum EKernelType +{ + K_UNKNOWN = 0, + K_OPTIMIZABLE = 4096, + K_LINEAR = 10 | K_OPTIMIZABLE, + K_POLY = 20, + K_GAUSSIAN = 30, + K_HISTOGRAM = 40, + K_SALZBERG = 41, + K_LOCALITYIMPROVED = 50, + K_SIMPLELOCALITYIMPROVED = 60, + K_FIXEDDEGREE = 70, + K_WEIGHTEDDEGREE = 80 | K_OPTIMIZABLE, + K_WEIGHTEDDEGREEPOS = 81 | K_OPTIMIZABLE, + K_WEIGHTEDDEGREEPOLYA = 82, + K_WD = 83, + K_COMMWORD = 90 | K_OPTIMIZABLE , + K_POLYMATCH = 100, + K_ALIGNMENT = 110, + K_COMMWORDSTRING = 120 | K_OPTIMIZABLE, + K_SPARSENORMSQUARED = 130, + K_COMBINED = 140 | K_OPTIMIZABLE +}; + +enum EFeatureType +{ + F_UNKNOWN = 0, + F_REAL = 10, + F_SHORT = 20, + F_CHAR = 30, + F_INT = 40, + F_BYTE = 50, + F_WORD = 60 +}; + +enum EFeatureClass +{ + C_UNKNOWN = 0, + C_SIMPLE = 10, + C_SPARSE = 20, + C_STRING = 30, + C_COMBINED = 40 +}; + +/// Alphabet of charfeatures/observations +enum E_ALPHABET +{ + /// DNA - letters A,C,G,T,*,N,n + DNA=0, + + /// PROTEIN - letters a-z + PROTEIN=1, + + /// ALPHANUM - [0-9a-z] + ALPHANUM=2, + + /// CUBE - [1-6] + CUBE=3, + + /// NONE - type has no alphabet + NONE=4 +}; + +//@} + +#define TMP_DIR "/tmp/" +//#define TMP_DIR "/short/x46/tmp/" + +#endif diff --git a/QPalmaDP/config.h b/QPalmaDP/config.h new file mode 100644 index 0000000..88b448b --- /dev/null +++ b/QPalmaDP/config.h @@ -0,0 +1,19 @@ + +#define HAVE_MATLAB 1 + +#define HAVE_LARGEFILE 1 + + + + + + +#define USE_BIGSTATES 1 + + +#define USE_HMMCACHE 1 + + + + + diff --git a/QPalmaDP/fill_matrix.cpp b/QPalmaDP/fill_matrix.cpp new file mode 100644 index 0000000..e36af54 --- /dev/null +++ b/QPalmaDP/fill_matrix.cpp @@ -0,0 +1,421 @@ +/*********************************************************************************************/ +// Version of fill_matrix with fills several alignment matrices (best, second_best ...) +// nr_paths det. number of these matrices +// +// est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i) +// dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j) +// +// !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm +// for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!! +/*********************************************************************************************/ + +/* + matchmatrix: columnwise + - A C G T N (dna) + - x x x x x x + A x x x x x x + C x x x x x x + G x x x x x x + T x x x x x x + N x x x x x x +(est) +*/ + +/* + alignment matrix: columnwise + |-EST . . . + -+------------ + -|00 ... + D|0. + N|. . + A|. . + .|. + .| + .| +*/ + + +/* TODO + - max_len = 10000 for elegans! + - do not use double + +*/ + + +#include "fill_matrix.h" + +using namespace std; + +int check_char(char base) +{ + switch(base) + { + case 'A': + case 'a': return 1; + case 'C': + case 'c': return 2; + case 'G': + case 'g': return 3; + case 'T': + case 't': return 4; + case 'N': + case 'n': return 5; + default : return 0; + } + +} + + + +void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions) +{ + + /*********************************************************************************************/ + // variables + /*********************************************************************************************/ + const int mlen = 6; // length of matchmatrix + int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n' + double prevValue ; + double tempValue; + int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h + //printf("max_len: %d\n", max_len) ; + + int num_elem ; // counter variable + + const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem + double *scores=new double[max_num_elem] ; //are sorted later + struct ArrayElem *array=new ArrayElem[max_num_elem] ; + //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ; + + int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len + int splice_pos ; //splice position get from donor_sites + int start_splice_pos ; //where to start looking in donor_sites + + double max_scores[nr_paths] ; + + /*********************************************************************************************/ + // Initialisation of all alignment matrices, columnwise + /*********************************************************************************************/ + for (int z=0; zvalue = log(0.0); // -inf + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0; + } + } + } + + + /*********************************************************************************************/ + // Element (0,0) for first/best matrice (z=0) + /*********************************************************************************************/ + ((Pre_score*)matrices[0])->value = 0; + ((Pre_score*)matrices[0])->prev_i = 0; + ((Pre_score*)matrices[0])->prev_j = 0; + ((Pre_score*)matrices[0])->prev_matrix_no = 0; + ((Pre_score*)matrices[0])->issplice = 0; + + + /*********************************************************************************************/ + // Line Zero for first/best matrice (z=0) + /*********************************************************************************************/ + for (int i=1; ivalue = 0 ; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0; + ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0; + } + + + /*********************************************************************************************/ + // Column Zero for first/best matrice (z=0) + /*********************************************************************************************/ + for (int j=1; jvalue = 0 ; + ((Pre_score*)matrices[0] +j)->prev_i = 0 ; + ((Pre_score*)matrices[0] +j)->prev_j = j-1 ; + ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ; + ((Pre_score*)matrices[0] +j)->issplice = 0; + } + + + /*********************************************************************************************/ + // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!! + /*********************************************************************************************/ + + + + /*********************************************************************************************/ + // all other entries for all matrices, last column and row treated seperately + /*********************************************************************************************/ + + /* columnwise, 0.column and 0.line already filled */ + for (int i=1; i max_len) + { + //storing highest ss_score (the position) and NOT -INF + int new_ss_pos = j-max_len ; + double donor_score = donor[new_ss_pos-1] ; + if (isfinite(donor_score)) //splice site! + { + if (max_ss_element[nr_paths-1] != 0) + { + for (int zz=0; zzvalue ; //could be -INF + if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;} + for (int pos=0; posvalue ; + if (new_intron_score >= act_intron_score) + { + new_intron_score = act_intron_score ; + pos_with_minscore = pos ; + } //else nothing happens, new intron score still the worst one + } + if (pos_with_minscore != nr_paths) + { + max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing + } + } + } + else + { + for (int pos=0; pos (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */ + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */ + if (isfinite(tempValue)) + { + assert(num_elem (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */ + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST + if (isfinite(tempValue)) + { + assert(num_elem (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ; + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */ + if (isfinite(tempValue)) + { + assert(num_elem0) + { + assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ; + splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt + prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt + if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1)) + { + int dist = j -(splice_pos-1); //_gt...ag: dist from g to g + assert(dist>max_len) ; + tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue; + if (isfinite(tempValue)) + { + assert(num_elem j-4) { break ;} + prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt + + if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1)) + { + int dist = j -(splice_pos-1); //_gt...ag: dist from g to g + assert(dist<=max_len) ; + tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue; + if (isfinite(tempValue)) + { + assert(num_elem(scores, array, num_elem, nr_paths); + + /* the nr_paths many highest scores are written in the nr_paths many matrices*/ + for (int z=0; z=num_elem) { + //nothing happens, entry in matrices[z] at pos. (i,j) still -inf + } + else { + ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no; + + if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet + max_scores[z] = -scores[z] ; + max_score_positions[2*z] = i; // pos i + max_score_positions[2*z +1] = j; // pos j + } + } + } //end of z + + } //end of j + } // end of i + + + + //free memory + delete[] array; + delete[] scores; + + return; +} + + diff --git a/QPalmaDP/fill_matrix.h b/QPalmaDP/fill_matrix.h new file mode 100644 index 0000000..b0f169f --- /dev/null +++ b/QPalmaDP/fill_matrix.h @@ -0,0 +1,51 @@ +#ifndef _FILLMATRIX_H__ +#define _FILLMATRIX_H__ + +#include "Mathmatics.h" +#include "penalty_info.h" +#include //Eigentlich in Header +#include +#include +#include +#include +#include +#include + + +struct ArrayElem { //24B + double score; + int prev_j; + int prev_i; + int prev_matrix_no; + bool isSplice; +} ; + +struct pre_score { //24B + double value; //8 + int prev_i; //4 + int prev_j; //4 + int prev_matrix_no;//4 + bool issplice;//4 +}; + +typedef struct pre_score Pre_score; + +typedef struct Align_pair { //24B + double value ; + bool issplice; + int previ,prevj ; + char est_char; + char dna_char; +}; + + +void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, penalty_struct* functions, double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions); +// funktion aendern in void result_align(Pre_score* matrices[], int anzpath, + +int check_char(char base); + +bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions ); + +extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix); + +#endif // _FILLMATRIX_H__ diff --git a/QPalmaDP/io.cpp b/QPalmaDP/io.cpp new file mode 100644 index 0000000..5ae3e1c --- /dev/null +++ b/QPalmaDP/io.cpp @@ -0,0 +1,96 @@ +#include "io.h" +#include "common.h" + +#include +#include +#include + +FILE* CIO::target=stdout; + +CIO::CIO() +{ +} + +void CIO::message(EMessageType prio, const CHAR *fmt, ... ) +{ + check_target(); + print_message_prio(prio, target); + va_list list; + va_start(list,fmt); + vfprintf(target,fmt,list); + va_end(list); + fflush(target); +} + +void CIO::buffered_message(EMessageType prio, const CHAR *fmt, ... ) +{ + check_target(); + print_message_prio(prio, target); + va_list list; + va_start(list,fmt); + vfprintf(target,fmt,list); + va_end(list); +} + +CHAR* CIO::skip_spaces(CHAR* str) +{ + INT i=0; + + if (str) + { + for (i=0; isspace(str[i]); i++); + + return &str[i]; + } + else + return str; +} + +void CIO::set_target(FILE* t) +{ + target=t; +} + +void CIO::check_target() +{ + if (!target) + target=stdout; +} + +void CIO::print_message_prio(EMessageType prio, FILE* target) +{ + switch (prio) + { + case M_DEBUG: + fprintf(target, "[DEBUG] "); + break; + case M_PROGRESS: + fprintf(target, "[PROGRESS] "); + break; + case M_INFO: + //fprintf(target, "[INFO] "); + break; + case M_NOTICE: + fprintf(target, "[NOTICE] "); + break; + case M_WARN: + fprintf(target, "[WARN] "); + break; + case M_ERROR: + fprintf(target, "[ERROR] "); + break; + case M_CRITICAL: + fprintf(target, "[CRITICAL] "); + break; + case M_ALERT: + fprintf(target, "[ALERT] "); + break; + case M_EMERGENCY: + fprintf(target, "[EMERGENCY] "); + break; + case M_MESSAGEONLY: + break; + default: + break; + } +} diff --git a/QPalmaDP/io.h b/QPalmaDP/io.h new file mode 100644 index 0000000..c127f92 --- /dev/null +++ b/QPalmaDP/io.h @@ -0,0 +1,33 @@ +#ifndef __CIO_H__ +#define __CIO_H__ + +#include "common.h" + +#include +#include + +class CIO +{ +public: + CIO(); + + static void set_target(FILE* target); + static void message(EMessageType prio, const CHAR *fmt, ... ); + + inline static void not_implemented() + { + message(M_ERROR, "Sorry, not yet implemented\n"); + } + + static void buffered_message(EMessageType prio, const CHAR *fmt, ... ); + + static CHAR* skip_spaces(CHAR* str); + +protected: + static void check_target(); + static void print_message_prio(EMessageType prio, FILE* target); + +protected: + static FILE* target; +}; +#endif diff --git a/QPalmaDP/penalty_info.cpp b/QPalmaDP/penalty_info.cpp new file mode 100644 index 0000000..8095ca1 --- /dev/null +++ b/QPalmaDP/penalty_info.cpp @@ -0,0 +1,354 @@ +#include +#include "config.h" +//#include "features/CharFeatures.h" +//#include "features/StringFeatures.h" + +#include +#include + +#include "io.h" + + +#include "fill_matrix.h" +#include "penalty_info.h" + +void init_penalty_struct(struct penalty_struct &PEN) +{ + PEN.limits=NULL ; + PEN.penalties=NULL ; + PEN.id=-1 ; + PEN.next_pen=NULL ; + PEN.transform = T_LINEAR ; + PEN.name = NULL ; + PEN.max_len=0 ; + PEN.min_len=0 ; + PEN.cache=NULL ; + PEN.use_svm=0 ; +} + +void init_penalty_struct_cache(struct penalty_struct &PEN) +{ + if (PEN.cache || PEN.use_svm) + return ; + + REAL* cache=new REAL[PEN.max_len+1] ; + if (cache) + { + for (INT i=0; i<=PEN.max_len; i++) + cache[i] = lookup_penalty(&PEN, i, 0, false) ; + PEN.cache = cache ; + } +} + +void delete_penalty_struct(struct penalty_struct &PEN) +{ + if (PEN.id!=-1) + { + delete[] PEN.limits ; + delete[] PEN.penalties ; + delete[] PEN.name ; + delete[] PEN.cache ; + } +} + +void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) +{ + for (int i=0; iP-1) +// { +// CIO::message(M_ERROR, "id out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// INT max_len = (INT) mxGetScalar(mx_max_len_field) ; +// if (max_len<0 || max_len>1024*1024*100) +// { +// CIO::message(M_ERROR, "max_len out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].max_len = max_len ; +// +// INT min_len = (INT) mxGetScalar(mx_min_len_field) ; +// if (min_len<0 || min_len>1024*1024*100) +// { +// CIO::message(M_ERROR, "min_len out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].min_len = min_len ; +// +// if (PEN[id].id!=-1) +// { +// CIO::message(M_ERROR, "penalty id already used\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].id=id ; +// if (next_id>=0) +// PEN[id].next_pen=&PEN[next_id] ; +// //fprintf(stderr,"id=%i, next_id=%i\n", id, next_id) ; +// +// assert(next_id!=id) ; +// PEN[id].use_svm=use_svm ; +// PEN[id].limits = new REAL[len] ; +// PEN[id].penalties = new REAL[len] ; +// double * limits = mxGetPr(mx_limits_field) ; +// double * penalties = mxGetPr(mx_penalties_field) ; +// +// for (INT i=0; iuse_svm>0) ; + REAL d_value=d_values[PEN->use_svm-1] ; + //fprintf(stderr,"transform=%i, d_value=%1.2f\n", (INT)PEN->transform, d_value) ; + + switch (PEN->transform) + { + case T_LINEAR: + break ; + case T_LOG: + d_value = log(d_value) ; + break ; + case T_LOG_PLUS1: + d_value = log(d_value+1) ; + break ; + case T_LOG_PLUS3: + d_value = log(d_value+3) ; + break ; + case T_LINEAR_PLUS3: + d_value = d_value+3 ; + break ; + default: + CIO::message(M_ERROR, "unknown transform\n") ; + break ; + } + + INT idx = 0 ; + REAL ret ; + for (INT i=0; ilen; i++) + if (PEN->limits[i]<=d_value) + idx++ ; + + if (idx==0) + ret=PEN->penalties[0] ; + else if (idx==PEN->len) + ret=PEN->penalties[PEN->len-1] ; + else + { + ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]* + (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ; + } + + //fprintf(stderr,"ret=%1.2f\n", ret) ; + + if (PEN->next_pen) + ret+=lookup_penalty(PEN->next_pen, p_value, d_values); + + //fprintf(stderr,"ret=%1.2f\n", ret) ; + + return ret ; +} + +REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value, + REAL* svm_values, bool follow_next) +{ + if (PEN==NULL) + return 0 ; + if (PEN->use_svm) + return lookup_penalty_svm(PEN, p_value, svm_values) ; + + if ((p_valuemin_len) || (p_value>PEN->max_len)) + return -CMath::INFTY ; + + if (PEN->cache!=NULL && (p_value>=0) && (p_value<=PEN->max_len)) + { + REAL ret=PEN->cache[p_value] ; + if (PEN->next_pen && follow_next) + ret+=lookup_penalty(PEN->next_pen, p_value, svm_values); + return ret ; + } + + REAL d_value = (REAL) p_value ; + switch (PEN->transform) + { + case T_LINEAR: + break ; + case T_LOG: + d_value = log(d_value) ; + break ; + case T_LOG_PLUS1: + d_value = log(d_value+1) ; + break ; + case T_LOG_PLUS3: + d_value = log(d_value+3) ; + break ; + case T_LINEAR_PLUS3: + d_value = d_value+3 ; + break ; + default: + CIO::message(M_ERROR, "unknown transform\n") ; + break ; + } + + INT idx = 0 ; + REAL ret ; + for (INT i=0; ilen; i++) + if (PEN->limits[i]<=d_value) + idx++ ; + + if (idx==0) + ret=PEN->penalties[0] ; + else if (idx==PEN->len) + ret=PEN->penalties[PEN->len-1] ; + else + { + ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]* + (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ; + } + //if (p_value>=30 && p_value<150) + //fprintf(stderr, "%s %i(%i) -> %1.2f\n", PEN->name, p_value, idx, ret) ; + + if (PEN->next_pen && follow_next) + ret+=lookup_penalty(PEN->next_pen, p_value, svm_values); + + return ret ; +} diff --git a/QPalmaDP/penalty_info.h b/QPalmaDP/penalty_info.h new file mode 100644 index 0000000..2ed9ee1 --- /dev/null +++ b/QPalmaDP/penalty_info.h @@ -0,0 +1,42 @@ +#ifndef __PENALTY_INFO_H__ +#define __PENALTY_INFO_H__ + +#include "common.h" +#include "Mathmatics.h" + +enum ETransformType +{ + T_LINEAR, + T_LOG, + T_LOG_PLUS1, + T_LOG_PLUS3, + T_LINEAR_PLUS3 +} ; + +struct penalty_struct +{ + INT len ; + REAL *limits ; + REAL *penalties ; + INT max_len ; + INT min_len ; + REAL *cache ; + enum ETransformType transform ; + INT id ; + struct penalty_struct *next_pen ; + char * name ; + INT use_svm ; +} ; + +void init_penalty_struct(struct penalty_struct &PEN) ; +void delete_penalty_struct(struct penalty_struct &PEN) ; +void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) ; + +//#ifdef HAVE_MATLAB +//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, mwSize &P) ; +//#endif + +REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value, + REAL* svm_values, bool follow_next=true) ; + +#endif diff --git a/QPalmaDP/print_align.cpp b/QPalmaDP/print_align.cpp new file mode 100644 index 0000000..312cabc --- /dev/null +++ b/QPalmaDP/print_align.cpp @@ -0,0 +1,97 @@ +#include "fill_matrix.h" +using namespace std; + +void print_align(Pre_score* matrix, int length_est, int length_dna,Align_pair* vektor, int result_length, int print_matrix) + +{ + //Druck der matrix + switch(print_matrix) + { + case 1: + { + cout <<"Matrix: \n"; + + for (int k = 0; kvalue <<"\t"; + } + cout <<"\n"; + } + break; + } + + case 2: + { + cout <<"Matrix: \n"; + + for (int k = 0; kprev_i) == k-1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1)) + cout << "\\" <<((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1)) + cout << "<"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k -1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l)) + cout << "|"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else { cout << ((Pre_score*)matrix + (k*length_dna) + l)->value << "("; + cout << ((Pre_score*)matrix + (k*length_dna) + l)->prev_i << "," << ((Pre_score*)matrix + (k*length_dna) + l)->prev_j << ")\t"; + } + + } + cout <<"\n"; + } + + cout << "length_est" << length_est << "\n"; + cout << "length_dna" << length_dna << "\n"; + cout << "result_length: " << result_length << "\n"; + + + break; + } + + case 3: + { + //Druck des Alignment + + // cout << "Alignment\n" ; + int j = 0; + // cout << "resultlength: " << result_length <<"\n"; + while (j 1 + + + /***************************************************************************/ + // 1. dna + /***************************************************************************/ + int dna_len = mxGetN(prhs[arg]) + 1; // mxGetN: number of columns (int) + char* dna = (char*)mxCalloc(dna_len, sizeof(char)); + + failure = mxGetString(prhs[arg], dna, dna_len); // NULL at end of string + if (failure) { // mxGetString returns 0 if success, 1 on failure + mexPrintf("couldn't open dna"); + return; + } + arg++; // arg -> 2 + + + /***************************************************************************/ + // 2. est + /***************************************************************************/ + int est_len = mxGetN(prhs[arg]) + 1; + char* est = (char*)mxCalloc(est_len, sizeof(char)); + + failure = mxGetString(prhs[arg], est, est_len); + if (failure) { + mexPrintf("couldn't open est"); + return; + } + arg++; // arg -> 3 + + + /***************************************************************************/ + // 3. h (cell array) + /***************************************************************************/ + penalty_struct* functions; + mwSize anz_func = 1; + + functions = read_penalty_struct_from_cell(prhs[arg], anz_func); + //std::cout << " bla " << functions->max_len << " \n"; + //std::cout << "lookup_penalty(5) = " << lookup_penalty(functions, 5, NULL, 0)< 4 + + + /***************************************************************************/ + // 4. match matrix (columnwise) + /***************************************************************************/ + int nr_entries = mxGetN(prhs[arg]) * mxGetM(prhs[arg]); + //mexPrintf("\nNumber of entries in matchmatrix: %d\n", nr_entries) ; + double *matchmatrix; + matchmatrix = mxGetPr(prhs[arg]); + //mexPrintf("matchmatrix\n") ; + //for (int i=0; i< nr_entries; i++) { + // mexPrintf("entry %d: <%f>\n", i, matchmatrix[i]); + //} + arg++; // arg -> 5 + + + /***************************************************************************/ + // 5. donor + /***************************************************************************/ + int donor_length = mxGetN(prhs[arg]); + double *donor; + donor = mxGetPr(prhs[arg]); + //mexPrintf("Donor eingelesen: <%f>\n", donor[0]); + arg++; // arg -> 6 + + + /***************************************************************************/ + // 6. acceptor + /***************************************************************************/ + int acceptor_length = mxGetN(prhs[arg]); + double *acceptor; + acceptor = mxGetPr(prhs[arg]); + //mexPrintf("Acceptor eingelesen: <%f>\n", acceptor[0]); + arg++; // arg -> 7 + + + /***************************************************************************/ + // 7.remove_duplicate_scores + /***************************************************************************/ + bool remove_duplicate_scores = (bool)*mxGetPr(prhs[arg]); + arg++; // arg -> 8 + + + /***************************************************************************/ + // 8. print_matrix + /***************************************************************************/ + int print_matrix = (int)*mxGetPr(prhs[arg]); + arg++; // arg -> 9 + + + /***************************************************************************/ + // initialize alignment matrices and call fill_matrix() + /***************************************************************************/ + + //possible donor positions + int nr_donor_sites = 0 ; + for (int ii=0; iiprev_i; + mwSize prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j; + mwSize prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no; + + for (int ii=result_length; ii>0; ii--) { + if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = check_char(est[i-1]) ; + } + else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = 0 ; //gap + } + else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA + DNA[ii-1] = 0 ; //gap + EST[ii-1] = check_char(est[i-1]) ; + } + else {// splice site + for (j; j > prev_j; j--) { + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = 6 ; //intron + ii-- ; + } + ii++ ; // last ii-- too much (because done in for-loop) + } + + i = prev_i; + j = prev_j; + prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i; + prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j; + prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no; + } + + } //end of z + + + + //mexPrintf("DEBUG:Returning arguments\n"); + int index = 0 ; + + //splice_align (mxArray): + const mwSize dims0[] = {(dna_len-1), nr_paths}; + //const int dims0[] = {(dna_len-1), nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims0, mxINT32_CLASS, mxREAL); + mwSize* start0 = (mwSize*)(mxGetData(plhs[index])); + memcpy(start0, splice_align, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->1 + + //est_align (mxArray): + const mwSize dims1[] = {(est_len-1), nr_paths}; + //const int dims1[] = {(est_len-1), nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims1, mxINT32_CLASS, mxREAL); + mwSize* start1 = (mwSize*)(mxGetData(plhs[index])); + memcpy(start1, est_align, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->2 + + //weightmatch (mxArray) + const mwSize dims2[] = {mlen*mlen, nr_paths}; + //const int dims2[] = {mlen*mlen, nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims2, mxINT32_CLASS, mxREAL); + mwSize* start2= (mwSize*)(mxGetData(plhs[index])); + memcpy(start2, mmatrix_param, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->3 + + //alignmentscore + plhs[index] = mxCreateDoubleMatrix(1,nr_paths,mxREAL); + if (plhs[index] == NULL) + { + printf("%s : Out of memory on line %d\n", __FILE__, __LINE__); + printf("Unable to create mxArray.\n"); + exit(1); + } + double* start3 = (double*)(mxGetData(plhs[index])); + memcpy(start3, alignmentscores, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->4 + + assert(nlhs==index+1) ; + + //Free Memory + //mexPrintf("DEBUG:Free Memory\n"); + mxFree(donor_sites); + mxFree(max_score_positions); + for (int i=nr_paths-1;i>=0; i--) + { + //delete[] matrices[i]; + mxFree(matrices[i]); + } + mxFree(matrices); + mxFree(splice_align); + mxFree(est_align); + mxFree(mmatrix_param); + mxFree(dna); + mxFree(est); + + return; + +} + diff --git a/QPalmaDP/qpalma_dp.h b/QPalmaDP/qpalma_dp.h new file mode 100644 index 0000000..c019466 --- /dev/null +++ b/QPalmaDP/qpalma_dp.h @@ -0,0 +1,83 @@ +#ifndef _QPALMA_DP_H_ +#define _QPALMA_DP_H_ + +#include "penalty_info.h" + +struct ArrayElem { //24B + double score; + int prev_j; + int prev_i; + int prev_matrix_no; + bool isSplice; +} ; + +struct pre_score { //24B + double value; //8 + int prev_i; //4 + int prev_j; //4 + int prev_matrix_no;//4 + bool issplice;//4 +}; + +typedef struct pre_score Pre_score; + +typedef struct Align_pair { //24B + double value ; + bool issplice; + int previ,prevj ; + char est_char; + char dna_char; +}; + + +void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, penalty_struct* functions, double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions); + +int check_char(char base); + +bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions ); + +extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix); + + +/* + * The function myalign calculates a new alignment given the scoring scheme + * Its input arguments are: + * + * num_path(id) -> an integer specifying which alignment has to be done + * dna -> + * est -> + * h -> + * mmatrix -> + * donor -> + * acceptor -> + * remove_duplicate_scores -> a boolean + * print_matrix -> a boolean + * + * [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(... + */ + +class Alignment { + + private: + SpliceAlign + EstAlign; + WeightMatch; + TotalScores; + + public: + Alignment(); + ~Alignment(); + + void myalign(int nr_paths, dna, est, h, matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, bool print_matrix); + getSpliceAlign() + getEstAlign(); + getWeightMatch(); + getTotalScores(); + getDNAEST(); + + +}; + + +#endif // _QPALMA_DP_H_ + diff --git a/QPalmaDP/result_align.cpp b/QPalmaDP/result_align.cpp new file mode 100644 index 0000000..2a49f68 --- /dev/null +++ b/QPalmaDP/result_align.cpp @@ -0,0 +1,156 @@ +#include "assert.h" +#include "fill_matrix.h" + +using namespace std; + +bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions) +//Gibt 1 zurueck, wenn kein weiterer Pfad existiert, sonst 0 + +{ + const int mlen=6; // length of matchmatrix + int startend[4] ; // [i_start, j_start, i_end, j_end] ; + startend[3] = max_score_positions[2*z] ; //i (est) + startend[4] = max_score_positions[2*z +1] ; //j (dna) + //printf("in result_align: z=%d:(%d, %d)\n", z, startend[3], startend[4]) ; + int dnanum ; //using check_char: letter -> number + int estnum ; //using check_char: letter -> number + + int dna_pos ; + int est_pos ; + int splice_state ; + int est_state ; + for (dna_pos=dna_len-1; dna_pos>startend[4]; dna_pos--) { + s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + for (est_pos=est_len-1; est_pos>startend[3]; est_pos--) { + e_align[est_pos-1] = 4 ; + } + //splice_align: 0: exon, 1: donor, 2: acceptor, 3: intro,. 4: dangling end + + + /* starting point for backtracking */ + int i = startend[3] ; + int j = startend[4] ; + int act_m_no = z ; + alignmentscores[z] = ((Pre_score*)matrices[z] + i*dna_len +j)->value; + //printf("alignment_score for z=%d: %3.8f\n", z, alignmentscores[z]) ; + + int prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_i; + int prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_j; + int prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no; + + if(!(isfinite(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value))) + { + cout << "Kein weiterer Pfad vorhanden\n"; + return 1; + } + + if(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value==0) { + cout << "Kein weiterer Pfad vorhanden\n"; + return 1; + } + + assert(dna_pos == j) ; + assert(est_pos == i) ; + splice_state = 0 ; //exon + est_state = 1 ; //[112222111 ... for exons of length 2,4,3 ... + + //compute length of alignment (between max(m,n) and m+n) + //local_alignment: backtracking to first Zero: look at actual value + while(!(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)==0.0) { + + //just some info + //if ((z>0) && (!(z==prev_m_no))) { + // printf("(%d,%d): next step in next matrix\n", i,j) ; + //} + + if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch + (*result_length_ptr)= (*result_length_ptr) + 1; + + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + e_align[est_pos-1] = est_state ; //1 or 2, depended + est_pos-- ; + + dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0 + estnum = check_char(est[i-1]) ; //-1 because index starts with 0 + mparam[mlen*dnanum +estnum] ++ ; + } + else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST + (*result_length_ptr)= (*result_length_ptr) + 1; + + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + + dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0 + estnum = 0 ; //gap + mparam[mlen*dnanum +estnum] ++ ; + } + else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA + (*result_length_ptr)= (*result_length_ptr) + 1; + + e_align[est_pos-1] = est_state ; //1 or 2, depended + est_pos-- ; + + dnanum = 0 ; //gap + estnum = check_char(est[i-1]) ; //-1 because index starts with 0 + mparam[mlen*dnanum +estnum] ++ ; + } + else {// splice site + (*result_length_ptr) = (*result_length_ptr) + (j - prev_j); + if (((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice==0) { + printf("matrices[act_m_no](i,j): %3.15f\n",((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value) ; + printf("matrices[prev_m_no](i,j): %3.15f\n",((Pre_score*)matrices[prev_m_no] + i*dna_len +j)->value) ; + printf("issplice(act): %d\n",((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice) ; + printf("issplice(prev): %d\n",((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->issplice) ; + printf("act_m_no: %d, prev_m_no: %d\n", act_m_no, prev_m_no) ; + printf("j: %d -> prev_j:%d\n", j, prev_j) ; + printf("i: %d -> prev_i:%d\n", i, prev_i) ; + } + if (est_state==1) { //was exon labeled "1" + est_state = 2 ; //new exon is labeled "2" + } + else { + est_state = 1 ; //last exon was labeled "2", new exon is labeled "1" + } + for (j; j > prev_j; j--) { + if (splice_state == 0) { //coming from exon + splice_state = 2 ; //first intron_state: acceptor + } else { + splice_state = 3 ; //intron + } + if (j-1 == prev_j) { //last intron_state: donor + splice_state = 1 ;//donor + } + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + } + splice_state = 0 ; //exon again + + } + + startend[1] = i; + startend[2] = j; + i = prev_i; + j = prev_j; + act_m_no = prev_m_no ; + + prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_i; + prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_j; + prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no; + } + for (dna_pos; dna_pos>0; dna_pos--) { + s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + for (est_pos; est_pos>0; est_pos--) { + e_align[est_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + + //printf("(%d, %d) --> (%d, %d)\n", startend[1], startend[2], startend[3], startend[4]) ; + //printf("result_length: %d\n", *result_length_ptr) ; + + + return 0; +} + + diff --git a/python/.qpalma.py.swp b/python/.qpalma.py.swp index 7e345a8b1e080381e52e3101ed5140c80704d583..4f1759fab2ff70cf854f4bee78ea640764ea958e 100644 GIT binary patch delta 2803 zcmaLYdrTBZ9Ki8CUW#zQDS{|?>s{p~FCRrw9)ebcM5^L5{$YSS;1oQ19KC>Tk2Yzv zMx%98Td`HCJsbaMBU)oxW2;rP_1Px0NwvW?wkD05#(wA6Euf~G`($?JH#0Xo zGdtV*eqig#z?jOcWhD#L=^0rHSCpdEr7SFded^JezR1XM#YuJEO9g{ zw#3+9B<0X;rKE6)`dqY4bOg~?<#B>5JZwnl7MgNzjsigFgR zd;V3OJ~k*)Q3COp((iEv=WsT{SE*J|MhzAt2e*t&-f`xs3vOg1 z6Bb+wSCr%E!hYmK#cyGX(u*rNjU8Bz9E`&y=Itwdfg{+Aa+Km>D1G8EvXK@VuP7IW zD~iBduwV?jnTlh>e1@R55HYQJvDji69<8cswbP|KY~1)SN3|*nnu}#n+-V*&Vn}iB zpisonklrFKgckjs%+YJ>H7Tbk2bSjs`c1BP>i{T8?NwW)0U{lS$N@Fh54_<(NiC>J_x1 zQI<2|iLxiMjGkI$!TzV=E;e>Zg@KIhUzm!b#k}U(+PdVMDr}097py2NL;ANluhQ(l zM_(bg5gQ;QE3n*`P&hMK6c%NPoAb&7=~tL*m-y0(uO=IDK^~(c#)0=jObf@gL626U6>DZ2(QjP z9_njMLOddIK1fks!*Zm;j9A>|xH*c0*bWz#UpvUG!((tTc3bXY~%l!-p$zAuAC&-53P{0m}L6>VWu)v6B3R5|M#n%s8r$;WY} z1bMObtHD;8(P($CcDpL7?8zwu0k^BkzTEDpsddkFu1iw~vV~PuT2|OvvP9u5b;6+j zWWDl+I!Begu+CAVHxz%iZ1#EAzZq@JWex8`u4ttw8!TyWJwpW^zzwqN#yl_r`ip9 zWQRk3G&&ivJd#c5W|ZZYEFPr=- zy5WI*g?icO*I~ybSaFff-j0{h04FL@fIc?+Y3xKDF0jW>!Uq>Dc*H}>eSaFC<83Sr zGWS3ITr%j!nmWKP|+Kv?sM^xeM z2-aSJHDSKTT?W-F&zu-T{h3SFZLGM9+Dpw!_iIAVm-&rjAx%40#DHw}a zS*s;T#6PUjN7#TCOvG5+Vts7#E7=kvE|)c;+3L^2>D0)A>px8td{Hb{DT;X$!Z)P z{}+8KrWep}iMUplRH7F=)%UXl&n}fCrbjtB@S0x`_F3rdWBP zUEDuV8YE9yQP61-t2@*AbXmog&LVjlr}}%asxG^`$+fOgb$gtu&Dm60XIJYSjqXO} MzkjfbxEYiG1ppjXoB#j- delta 2765 zcmb`}4@{J090%~{;4uEkpGy$`1UxEqoE|42b|fCkiINqd(h`{)9e3}+EqL#|_W)7! zw53*?qx`ipQ3((v#Y1j)vhQ;ov6xwHzT4;h^SsaV zd!Ofd-}^nc?nvmmcSFZkWj?>Km>Uh53|kyyX{tpjED%{G5A+#T4LVV&$n=O^yV1Gu7ZeiX;`W%-N|(h`qg2*9t(MwKx6AFc@~S32uSSxqd70BG^HRB* z`VA#CYx;5T02eCEz3uZeU)nKra--&zOy?a0=E#5{!p_=*+H6Qg@}&CQmVa;mKDA3gha>Qf~cLYV1y>o!!&uMEB!# ztGk3c8hCh`M^Zd^9Gz~+pk2GOD5tSh#mLZ;r%I!B7fiILCx^c6Nu;|ynPhmc7-jQm za??1}DQMDD{GME#9cU^-?Nb9;)ZVvLjgB+Q+&huF`pUya*94mMOgw$McQGaQmQYP^ zGFol!t)%O)c;8%HZQVBo`*kH+y1Y+M<9zwF%vVNzzGWzF?^{lCUk(B;^NL{Hq9~Hv zlWwW4O}ELGbcbzarNwF;I_78VGO~0gleSu`)A79j(qe`#BU`kv47UgdU7free!5|Gau^ogR5GHJM-6oy~86$ z;}L@tT}J6pmvJh(Jfg>GiW6VbU_ggcR=CwETP3&bz;EAEBXPx5>#$0S10RN%xxv{R z9K4I0yu83_m0TXDTUqU}tqIQV(39}wLZ>Y5q2=Iacex$1$DXcDv++iY!j%^tE|+A} z^D4<|saGV9@?tZ0+IgK+=X5s(D#?;$lWfant=?hN50+P=5;kuj>ca{;*hTFBIpByY z6%S=3x;(}#HpX5=#;GClfjq_-BNJQ2i58dsZn4U%OtoI#R24F};vmoI)l2@ht2f3+JJ1Cax+?}?AkFJ3-kVq^eulHK z9a^Co>Yy6zFb`583GQJD`2)U#^KcG6hcj>xwn8(MLkYygH7qm7pdSeAFb}dvvlR9# zx+B)0e%J}?!3Bk2io)zTBPqYN-2d$M*&(!PXDdC{HuJf_JZN1Ox1~<2=h_;vvx%KW z?CR<)9=BtqI4uynD(z4X;fkd}N)NQNJCHFfb&{SN3P%cPvdVqd0ld}wF9pMSRfH-FCf|pd`bAuCFz$#{AMDk?W_#58p9vo=0=Y^2 Of6R_${;2)8QvL*i3)-2)=='a')) ; - assert(all(dna(idx_acc(idx_acc>2)-1)=='g')) ; - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % True Alignment and Comparison with wrong ones %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -365,9 +375,7 @@ def run(): end ; end fprintf('\n') ; - - - + ################################################################################# const_added = solver.addConstraint(deltas, loss, pos, marginRescaling) objValue,w,self.slacks = solver.solve() @@ -377,14 +385,8 @@ def run(): # maxgap=max(gap) [mean(xis>=1) mean(gap>=1) mean(xis>=1e-3) mean(gap>=1e-3)] - [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation) ; - - save(paramfilename, ... - 'param', 'h', 'd', 'a', 'mmatrix', 'A', 'b') ; - if max(gap)<=column_eps: break - assert(all(maxgap<=column_eps)) assert(length(take_idx)==N) @@ -395,7 +397,6 @@ if __name__ == '__main__': run() """ - def __init__(self): numAcceptorParams = 0 numDonorParams = 0 -- 2.20.1