From: fabio Date: Fri, 30 Nov 2007 15:53:57 +0000 (+0000) Subject: + added c code from the palma project X-Git-Url: http://git.tuebingen.mpg.de/?p=qpalma.git;a=commitdiff_plain;h=2e7d55a1ae0e0d12a40f30ed7ee4ec725475e55c + added c code from the palma project + added own version of the swig interface TODO - think about wrapper for structs / cells user either SWIG constrcuts or native python object git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@6902 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- diff --git a/QPalmaDP/Makefile b/QPalmaDP/Makefile new file mode 100644 index 0000000..42574ea --- /dev/null +++ b/QPalmaDP/Makefile @@ -0,0 +1,24 @@ +SRCS= Mathmatics.cpp\ + fill_matrix.cpp\ + result_align.cpp\ + penalty_info.cpp\ + print_align.cpp\ + io.cpp + +OBJS = $(SRCS:%.cpp=%.o) + +#CXXFLAGS=-O3 -fPIC +#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs +CXXFLAGS=-O3 -fPIC -ggdb + +IF=QPalmaDP + +all: $(OBJS) + swig -c++ -python ${IF}.i + g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${IF}_wrap.cxx -o ${IF}_wrap.o + g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${IF}_wrap.o -o _${IF}.so + python -c "import ${IF}" + +clean: + rm *.o *.so *.cxx ${IF}.py ${IF}.pyc + diff --git a/QPalmaDP/Mathmatics.cpp b/QPalmaDP/Mathmatics.cpp new file mode 100644 index 0000000..61298da --- /dev/null +++ b/QPalmaDP/Mathmatics.cpp @@ -0,0 +1,349 @@ +// Math.cpp: implementation of the CMath class. +// +////////////////////////////////////////////////////////////////////// + + +#include "Mathmatics.h" +#include "io.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +#ifdef USE_LOGCACHE +//gene/math specific constants +#ifdef USE_HMMDEBUG +#define MAX_LOG_TABLE_SIZE 10*1024*1024 +#define LOG_TABLE_PRECISION 1e-6 +#else +#define MAX_LOG_TABLE_SIZE 123*1024*1024 +#define LOG_TABLE_PRECISION 1e-15 +#endif + +INT CMath::LOGACCURACY = 0; // 100000 steps per integer +#endif + +INT CMath::LOGRANGE = 0; // range for logtable: log(1+exp(x)) -25 <= x <= 0 + +#ifdef USE_PATHDEBUG +const REAL CMath::INFTY = 1e11; // infinity +#else +const REAL CMath::INFTY = -log(0.0); // infinity +#endif +const REAL CMath::ALMOST_NEG_INFTY = -1000; + +CHAR CMath::rand_state[256]; + +CMath::CMath() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + UINT seed=(UINT) (4223517*getpid()*tv.tv_sec*tv.tv_usec); + initstate(seed, CMath::rand_state, sizeof(CMath::rand_state)); + CIO::message(M_INFO, "seeding random number generator with %u\n", seed); + +#ifdef USE_LOGCACHE + LOGRANGE=CMath::determine_logrange(); + LOGACCURACY=CMath::determine_logaccuracy(LOGRANGE); + CIO::message(M_INFO, "Initializing log-table (size=%i*%i*%i=%2.1fMB) ...",LOGRANGE,LOGACCURACY,sizeof(REAL),LOGRANGE*LOGACCURACY*sizeof(REAL)/(1024.0*1024.0)) ; + + CMath::logtable=new REAL[LOGRANGE*LOGACCURACY]; + init_log_table(); + CIO::message(M_INFO, "Done.\n") ; +#else + INT i=0; + while ((REAL)log(1+((REAL)exp(-REAL(i))))) + i++; + CIO::message(M_INFO, "determined range for x in log(1+exp(-x)) is:%d\n", i); + LOGRANGE=i; +#endif +} + +CMath::~CMath() +{ +#ifdef USE_LOGCACHE + delete[] logtable; +#endif +} + +#ifdef USE_LOGCACHE +INT CMath::determine_logrange() +{ + INT i; + REAL acc=0; + for (i=0; i<50; i++) + { + acc=((REAL)log(1+((REAL)exp(-REAL(i))))); + if (acc<=(REAL)LOG_TABLE_PRECISION) + break; + } + + CIO::message(M_INFO, "determined range for x in table log(1+exp(-x)) is:%d (error:%G)\n",i,acc); + return i; +} + +INT CMath::determine_logaccuracy(INT range) +{ + range=MAX_LOG_TABLE_SIZE/range/((int)sizeof(REAL)); + CIO::message(M_INFO, "determined accuracy for x in table log(1+exp(-x)) is:%d (error:%G)\n",range,1.0/(double) range); + return range; +} + +//init log table of form log(1+exp(x)) +void CMath::init_log_table() +{ + for (INT i=0; i< LOGACCURACY*LOGRANGE; i++) + logtable[i]=log(1+exp(REAL(-i)/REAL(LOGACCURACY))); +} +#endif + +void CMath::sort(INT *a, INT cols, INT sort_col) +{ + INT changed=1; + if (a[0]==-1) return ; + while (changed) + { + changed=0; INT i=0 ; + while ((a[(i+1)*cols]!=-1) && (a[(i+1)*cols+1]!=-1)) // to be sure + { + if (a[i*cols+sort_col]>a[(i+1)*cols+sort_col]) + { + for (INT j=0; ja[i+1]) + { + swap(a[i],a[i+1]) ; + swap(idx[i],idx[i+1]) ; + changed=1 ; + } ; + } ; + } ; + +} + + + +//plot x- axis false positives (fp) 1-Specificity +//plot y- axis true positives (tp) Sensitivity +INT CMath::calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile) +{ + INT left=0; + INT right=size-1; + INT i; + + for (i=0; i 0) && (left0) + maximum=max(maximum,negout[negsize-1]); + if (possize>0) + maximum=max(maximum,posout[possize-1]); + + REAL treshhold=minimum; + REAL old_treshhold=treshhold; + + //clear array. + for (i=0; i=possize && negidx=negsize && posidx negsize*fp[iteration]/size+(1-tp[iteration])*possize/size ) + { + minerr=negsize*fp[iteration]/size+(1-tp[iteration])*possize/size; + tresh=(old_treshhold+treshhold)/2; + returnidx=iteration; + } + + iteration++; + } + + // set new size + size=iteration; + + if (rocfile) + { + const CHAR id[]="ROC"; + fwrite(id, sizeof(char), sizeof(id), rocfile); + fwrite(fp, sizeof(REAL), size, rocfile); + fwrite(tp, sizeof(REAL), size, rocfile); + } + + return returnidx; +} + +UINT CMath::crc32(BYTE *data, INT len) +{ + UINT result; + INT i,j; + BYTE octet; + + result = 0-1; + + for (i=0; i> 7) ^ (result >> 31)) + { + result = (result << 1) ^ 0x04c11db7; + } + else + { + result = (result << 1); + } + octet <<= 1; + } + } + + return ~result; +} + +double CMath::mutual_info(REAL* p1, REAL* p2, INT len) +{ + double e=0; + + for (INT i=0; i +#include + +//define finite for win32 +#ifdef _WIN32 +#include +#define finite _finite +#define isnan _isnan +#endif + +#ifndef NAN +#include +#define NAN (strtod("NAN",NULL)) +#endif + +#ifdef SUNOS +extern "C" int finite(double); +#endif + +/** Mathematical Functions. + * Class which collects generic mathematical functions + */ +class CMath +{ +public: + /**@name Constructor/Destructor. + */ + //@{ + ///Constructor - initializes log-table + CMath(); + + ///Destructor - frees logtable + virtual ~CMath(); + //@} + + /**@name min/max/abs functions. + */ + //@{ + + ///return the minimum of two integers + // + template + static inline T min(T a, T b) + { + return ((a)<=(b))?(a):(b); + } + + ///return the maximum of two integers + template + static inline T max(T a, T b) + { + return ((a)>=(b))?(a):(b); + } + + ///return the maximum of two integers + template + static inline T abs(T a) + { + return ((a)>=(0))?(a):(-a); + } + //@} + + /**@name misc functions */ + //@{ + + /// crc32 + static UINT crc32(BYTE *data, INT len); + + static inline REAL round(REAL d) + { + return floor(d+0.5); + } + + /// signum of type T variable a + template + static inline REAL sign(REAL a) + { + if (a==0) + return 0; + else return (a<0) ? (-1) : (+1); + } + + /// swap floats a and b + template + static inline void swap(T & a,T &b) + { + T c=a ; + a=b; b=c ; + } + + /// x^2 + template + static inline T sq(T x) + { + return x*x; + } + + static inline LONG factorial(INT n) + { + LONG res=1 ; + for (int i=2; i<=n; i++) + res*=i ; + return res ; + } + + static inline LONG nchoosek(INT n, INT k) + { + long res=1 ; + + for (INT i=n-k+1; i<=n; i++) + res*=i ; + + return res/factorial(k) ; + } + + /** performs a bubblesort on a given matrix a. + * it is sorted from in ascending order from top to bottom + * and left to right */ + static void sort(INT *a, INT cols, INT sort_col=0) ; + static void sort(REAL *a, INT*idx, INT N) ; + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending (for type T) */ + template + static void qsort(T* output, INT size) ; + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending order + * (for type T1) and returns the index (type T2) + * matlab alike [sorted,index]=sort(output) + */ + template + static void qsort(T1* output, T2* index, INT size); + + /** performs a quicksort on an array output of length size + * it is sorted from in ascending order + * (for type T1) and returns the index (type T2) + * matlab alike [sorted,index]=sort(output) + */ + template + static void qsort_backward(T1* output, T2* index, INT size); + + /* finds the smallest element in output and puts that element as the + first element */ + template + static void min(REAL* output, T* index, INT size); + + /* finds the n smallest elements in output and puts these elements as the + first n elements */ + template + static void nmin(REAL* output, T* index, INT size, INT n); + + /* performs a inplace unique of a vector of type T using quicksort + * returns the new number of elements */ + template + static INT unique(T* output, INT size) + { + qsort(output, size) ; + INT i,j=0 ; + for (i=0; ielem || output[end]elem) + { + end = middle ; + middle=start+(end-start)/2 ; + } ; + if (output[middle]elem) + return -1 ; + if (output[end]<=elem) + return size-1 ; + + while (1) + { + if (output[middle]>elem) + { + end = middle ; + middle=start+(end-start)/2 ; + } ; + if (output[middle]=elem) + return middle ; + if (end-start<=1) + { + if (output[start]<=elem && output[start+1]>=elem) + return start ; + return end ; + } + } + } + + /** calculates ROC into (fp,tp) + * from output and label of length size + * returns index with smallest error=fp+fn + */ + static INT calcroc(REAL* fp, REAL* tp, REAL* output, INT* label, INT& size, INT& possize, INT& negsize, REAL& tresh, FILE* rocfile); + //@} + + /// returns the mutual information of p which is given in logspace + /// where p,q are given in logspace + static double mutual_info(REAL* p1, REAL* p2, INT len); + + /// returns the relative entropy H(P||Q), + /// where p,q are given in logspace + static double relative_entropy(REAL* p, REAL* q, INT len); + + /// returns entropy of p which is given in logspace + static double entropy(REAL* p, INT len); + + /**@name summing functions */ + //@{ + /** sum logarithmic probabilities. + * Probability measures are summed up but are now given in logspace where + * direct summation of exp(operand) is not possible due to numerical problems, i.e. eg. exp(-1000)=0. Therefore + * we do log( exp(a) + exp(b)) = a + log (1 + exp (b-a)) where a is max(p,q) and b min(p,q). */ +#ifdef USE_LOGCACHE + static inline REAL logarithmic_sum(REAL p, REAL q) + { + if (finite(p)) + { + if (finite(q)) + { + + register REAL diff=p-q; + + if (diff>0) //p>q + { + if (diff > LOGRANGE) + return p; + else + return p + logtable[(int)(diff*LOGACCURACY)]; + } + else //p<=q + { + if (-diff > LOGRANGE) + return q; + else + return q + logtable[(int)(-diff*LOGACCURACY)]; + } + } + CIO::message("INVALID second operand to logsum(%f,%f) expect undefined results\n", p, q); + return NAN; + } + else + return q; + } + + ///init log table of form log(1+exp(x)) + static void init_log_table(); + + /// determine INT x for that log(1+exp(-x)) == 0 + static INT determine_logrange(); + + /// determine accuracy, such that the thing fits into MAX_LOG_TABLE_SIZE, needs logrange as argument + static INT determine_logaccuracy(INT range); +#else + /* + inline REAL logarithmic_sum(REAL p, REAL q) + { + double result=comp_logarithmic_sum(p,q); + + printf("diff:%f <-> %f\n",p-q, result); + return result; + }*/ + + static inline REAL logarithmic_sum(REAL p, REAL q) + { + if (finite(p)) + { + if (finite(q)) + { + + register REAL diff=p-q; + if (diff>0) //p>q + { + if (diff > LOGRANGE) + return p; + else + return p + log(1+exp(-diff)); + } + else //p<=q + { + if (-diff > LOGRANGE) + return q; + else + return q + log(1+exp(diff)); + } + } + return p; + } + else + return q; + } +#endif +#ifdef LOG_SUM_ARRAY + /** sum up a whole array of values in logspace. + * This function addresses the numeric instabiliy caused by simply summing up N elements by adding + * each of the elements to some variable. Instead array neighbours are summed up until one element remains. + * Whilst the number of additions remains the same, the error is only in the order of log(N) instead N. + */ + static inline REAL logarithmic_sum_array(REAL *p, INT len) + { + if (len<=2) + { + if (len==2) + return logarithmic_sum(p[0],p[1]) ; + if (len==1) + return p[0]; + return -INFTY ; + } + else + { + register REAL *pp=p ; + if (len%2==1) pp++ ; + for (register INT j=0; j < len>>1; j++) + pp[j]=logarithmic_sum(pp[j<<1], pp[1+(j<<1)]) ; + } + return logarithmic_sum_array(p,len%2+len>>1) ; + } +#endif + //@} +public: + /**@name constants*/ + //@{ + /// infinity + static const REAL INFTY; + + /// almost neg (log) infinity + static const REAL ALMOST_NEG_INFTY; + + /// range for logtable: log(1+exp(x)) -LOGRANGE <= x <= 0 + static INT LOGRANGE; + +#ifdef USE_LOGCACHE + /// number of steps per integer + static INT LOGACCURACY; + //@} +protected: + ///table with log-values + static REAL* logtable; +#endif + static CHAR rand_state[256]; +}; + + +//implementations of template functions +template +void CMath::qsort(T* output, INT size) +{ + if (size==2) + { + if (output[0] > output [1]) + swap(output[0],output[1]); + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] < split) + left++; + while (output[right] > split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,right+1); + + if (size-left> 1) + qsort(&output[left],size-left); + } +} + +template +void CMath::qsort(T1* output, T2* index, INT size) +{ + if (size==2) + { + if (output[0] > output [1]){ + swap(output[0],output[1]); + swap(index[0],index[1]); + } + + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] < split) + left++; + while (output[right] > split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + swap(index[left],index[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,index,right+1); + + if (size-left> 1) + qsort(&output[left],&index[left], size-left); + } +} + +template +void CMath::qsort_backward(T1* output, T2* index, INT size) +{ + if (size==2) + { + if (output[0] < output [1]){ + swap(output[0],output[1]); + swap(index[0],index[1]); + } + + } + else + { + REAL split=output[(size*rand())/(RAND_MAX+1)]; + //REAL split=output[size/2]; + + INT left=0; + INT right=size-1; + + while (left<=right) + { + while (output[left] > split) + left++; + while (output[right] < split) + right--; + + if (left<=right) + { + swap(output[left],output[right]); + swap(index[left],index[right]); + left++; + right--; + } + } + + if (right+1> 1) + qsort(output,index,right+1); + + if (size-left> 1) + qsort(&output[left],&index[left], size-left); + } +} + +template +void CMath::nmin(REAL* output, T* index, INT size, INT n) +{ + if (6*n*size<13*size*log((double)size)) + for (INT i=0; i +void CMath::min(REAL* output, T* index, INT size) +{ + if (size<=0) + return ; + REAL min_elem = output[0] ; + INT min_index = 0 ; + for (INT i=1; i +#include +#include "config.h" + +#ifdef SUNOS +#define bool int +#define false 0 +#define true 1 +#endif + +#ifndef LINUX +#define RANDOM_MAX 2147483647 +#else +#define RANDOM_MAX RAND_MAX +#endif + +/**@name Standard Types + * Definition of Platform independent Types +*/ +//@{ + +/// Type CHAR +typedef char CHAR; +typedef CHAR* P_CHAR; + +/// Type BYTE +typedef unsigned char BYTE; +typedef BYTE* P_BYTE; + +/// Type SHORT is 2 bytes in size +typedef short int SHORT; +typedef SHORT* P_SHORT; + +/// Type WORD is 2 bytes in size +typedef unsigned short int WORD; +typedef WORD* P_WORD; + +/// Type INT is 4 bytes in size +typedef int INT; +typedef INT* P_INT; + +/// Type INT is 4 bytes in size +typedef unsigned int UINT; +typedef UINT* P_UINT; + +/// Type LONG is 8 bytes in size +typedef long LONG; +typedef LONG* P_LONG; + +/// Type SHORTREAL is 4 bytes in size +typedef float SHORTREAL; +typedef SHORTREAL* P_SHORTREAL; + +/// Type REAL is 8 bytes in size +typedef double REAL; +typedef REAL* P_REAL; + +/// Type LONGREAL is 16 bytes in size +//typedef long double LONGREAL; +//typedef LONGREAL* P_LONGREAL; + +#ifdef USE_SHORTREAL_KERNELCACHE + typedef SHORTREAL KERNELCACHE_ELEM; +#else + typedef REAL KERNELCACHE_ELEM; +#endif + +typedef KERNELCACHE_ELEM P_KERNELCACHE_ELEM; + +typedef LONG KERNELCACHE_IDX; + +/// The io libs output [DEBUG] etc in front of every CIO::message +/// 'higher' messages filter output depending on the loglevel, i.e. CRITICAL messages +/// will print all M_CRITICAL TO M_EMERGENCY messages to +enum EMessageType +{ + M_DEBUG, + M_INFO, + M_NOTICE, + M_WARN, + M_ERROR, + M_CRITICAL, + M_ALERT, + M_EMERGENCY, + M_MESSAGEONLY, + M_PROGRESS +}; + +enum EKernelType +{ + K_UNKNOWN = 0, + K_OPTIMIZABLE = 4096, + K_LINEAR = 10 | K_OPTIMIZABLE, + K_POLY = 20, + K_GAUSSIAN = 30, + K_HISTOGRAM = 40, + K_SALZBERG = 41, + K_LOCALITYIMPROVED = 50, + K_SIMPLELOCALITYIMPROVED = 60, + K_FIXEDDEGREE = 70, + K_WEIGHTEDDEGREE = 80 | K_OPTIMIZABLE, + K_WEIGHTEDDEGREEPOS = 81 | K_OPTIMIZABLE, + K_WEIGHTEDDEGREEPOLYA = 82, + K_WD = 83, + K_COMMWORD = 90 | K_OPTIMIZABLE , + K_POLYMATCH = 100, + K_ALIGNMENT = 110, + K_COMMWORDSTRING = 120 | K_OPTIMIZABLE, + K_SPARSENORMSQUARED = 130, + K_COMBINED = 140 | K_OPTIMIZABLE +}; + +enum EFeatureType +{ + F_UNKNOWN = 0, + F_REAL = 10, + F_SHORT = 20, + F_CHAR = 30, + F_INT = 40, + F_BYTE = 50, + F_WORD = 60 +}; + +enum EFeatureClass +{ + C_UNKNOWN = 0, + C_SIMPLE = 10, + C_SPARSE = 20, + C_STRING = 30, + C_COMBINED = 40 +}; + +/// Alphabet of charfeatures/observations +enum E_ALPHABET +{ + /// DNA - letters A,C,G,T,*,N,n + DNA=0, + + /// PROTEIN - letters a-z + PROTEIN=1, + + /// ALPHANUM - [0-9a-z] + ALPHANUM=2, + + /// CUBE - [1-6] + CUBE=3, + + /// NONE - type has no alphabet + NONE=4 +}; + +//@} + +#define TMP_DIR "/tmp/" +//#define TMP_DIR "/short/x46/tmp/" + +#endif diff --git a/QPalmaDP/config.h b/QPalmaDP/config.h new file mode 100644 index 0000000..88b448b --- /dev/null +++ b/QPalmaDP/config.h @@ -0,0 +1,19 @@ + +#define HAVE_MATLAB 1 + +#define HAVE_LARGEFILE 1 + + + + + + +#define USE_BIGSTATES 1 + + +#define USE_HMMCACHE 1 + + + + + diff --git a/QPalmaDP/fill_matrix.cpp b/QPalmaDP/fill_matrix.cpp new file mode 100644 index 0000000..e36af54 --- /dev/null +++ b/QPalmaDP/fill_matrix.cpp @@ -0,0 +1,421 @@ +/*********************************************************************************************/ +// Version of fill_matrix with fills several alignment matrices (best, second_best ...) +// nr_paths det. number of these matrices +// +// est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i) +// dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j) +// +// !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm +// for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!! +/*********************************************************************************************/ + +/* + matchmatrix: columnwise + - A C G T N (dna) + - x x x x x x + A x x x x x x + C x x x x x x + G x x x x x x + T x x x x x x + N x x x x x x +(est) +*/ + +/* + alignment matrix: columnwise + |-EST . . . + -+------------ + -|00 ... + D|0. + N|. . + A|. . + .|. + .| + .| +*/ + + +/* TODO + - max_len = 10000 for elegans! + - do not use double + +*/ + + +#include "fill_matrix.h" + +using namespace std; + +int check_char(char base) +{ + switch(base) + { + case 'A': + case 'a': return 1; + case 'C': + case 'c': return 2; + case 'G': + case 'g': return 3; + case 'T': + case 't': return 4; + case 'N': + case 'n': return 5; + default : return 0; + } + +} + + + +void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions) +{ + + /*********************************************************************************************/ + // variables + /*********************************************************************************************/ + const int mlen = 6; // length of matchmatrix + int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n' + double prevValue ; + double tempValue; + int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h + //printf("max_len: %d\n", max_len) ; + + int num_elem ; // counter variable + + const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem + double *scores=new double[max_num_elem] ; //are sorted later + struct ArrayElem *array=new ArrayElem[max_num_elem] ; + //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ; + + int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len + int splice_pos ; //splice position get from donor_sites + int start_splice_pos ; //where to start looking in donor_sites + + double max_scores[nr_paths] ; + + /*********************************************************************************************/ + // Initialisation of all alignment matrices, columnwise + /*********************************************************************************************/ + for (int z=0; zvalue = log(0.0); // -inf + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0; + ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0; + } + } + } + + + /*********************************************************************************************/ + // Element (0,0) for first/best matrice (z=0) + /*********************************************************************************************/ + ((Pre_score*)matrices[0])->value = 0; + ((Pre_score*)matrices[0])->prev_i = 0; + ((Pre_score*)matrices[0])->prev_j = 0; + ((Pre_score*)matrices[0])->prev_matrix_no = 0; + ((Pre_score*)matrices[0])->issplice = 0; + + + /*********************************************************************************************/ + // Line Zero for first/best matrice (z=0) + /*********************************************************************************************/ + for (int i=1; ivalue = 0 ; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0; + ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0; + ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0; + } + + + /*********************************************************************************************/ + // Column Zero for first/best matrice (z=0) + /*********************************************************************************************/ + for (int j=1; jvalue = 0 ; + ((Pre_score*)matrices[0] +j)->prev_i = 0 ; + ((Pre_score*)matrices[0] +j)->prev_j = j-1 ; + ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ; + ((Pre_score*)matrices[0] +j)->issplice = 0; + } + + + /*********************************************************************************************/ + // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!! + /*********************************************************************************************/ + + + + /*********************************************************************************************/ + // all other entries for all matrices, last column and row treated seperately + /*********************************************************************************************/ + + /* columnwise, 0.column and 0.line already filled */ + for (int i=1; i max_len) + { + //storing highest ss_score (the position) and NOT -INF + int new_ss_pos = j-max_len ; + double donor_score = donor[new_ss_pos-1] ; + if (isfinite(donor_score)) //splice site! + { + if (max_ss_element[nr_paths-1] != 0) + { + for (int zz=0; zzvalue ; //could be -INF + if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;} + for (int pos=0; posvalue ; + if (new_intron_score >= act_intron_score) + { + new_intron_score = act_intron_score ; + pos_with_minscore = pos ; + } //else nothing happens, new intron score still the worst one + } + if (pos_with_minscore != nr_paths) + { + max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing + } + } + } + else + { + for (int pos=0; pos (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */ + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */ + if (isfinite(tempValue)) + { + assert(num_elem (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */ + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST + if (isfinite(tempValue)) + { + assert(num_elem (j,i) + /***************************************/ + prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ; + if (isfinite(prevValue)) + { + tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */ + if (isfinite(tempValue)) + { + assert(num_elem0) + { + assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ; + splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt + prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt + if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1)) + { + int dist = j -(splice_pos-1); //_gt...ag: dist from g to g + assert(dist>max_len) ; + tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue; + if (isfinite(tempValue)) + { + assert(num_elem j-4) { break ;} + prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt + + if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1)) + { + int dist = j -(splice_pos-1); //_gt...ag: dist from g to g + assert(dist<=max_len) ; + tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue; + if (isfinite(tempValue)) + { + assert(num_elem(scores, array, num_elem, nr_paths); + + /* the nr_paths many highest scores are written in the nr_paths many matrices*/ + for (int z=0; z=num_elem) { + //nothing happens, entry in matrices[z] at pos. (i,j) still -inf + } + else { + ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ; + ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no; + + if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet + max_scores[z] = -scores[z] ; + max_score_positions[2*z] = i; // pos i + max_score_positions[2*z +1] = j; // pos j + } + } + } //end of z + + } //end of j + } // end of i + + + + //free memory + delete[] array; + delete[] scores; + + return; +} + + diff --git a/QPalmaDP/fill_matrix.h b/QPalmaDP/fill_matrix.h new file mode 100644 index 0000000..b0f169f --- /dev/null +++ b/QPalmaDP/fill_matrix.h @@ -0,0 +1,51 @@ +#ifndef _FILLMATRIX_H__ +#define _FILLMATRIX_H__ + +#include "Mathmatics.h" +#include "penalty_info.h" +#include //Eigentlich in Header +#include +#include +#include +#include +#include +#include + + +struct ArrayElem { //24B + double score; + int prev_j; + int prev_i; + int prev_matrix_no; + bool isSplice; +} ; + +struct pre_score { //24B + double value; //8 + int prev_i; //4 + int prev_j; //4 + int prev_matrix_no;//4 + bool issplice;//4 +}; + +typedef struct pre_score Pre_score; + +typedef struct Align_pair { //24B + double value ; + bool issplice; + int previ,prevj ; + char est_char; + char dna_char; +}; + + +void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, penalty_struct* functions, double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions); +// funktion aendern in void result_align(Pre_score* matrices[], int anzpath, + +int check_char(char base); + +bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions ); + +extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix); + +#endif // _FILLMATRIX_H__ diff --git a/QPalmaDP/io.cpp b/QPalmaDP/io.cpp new file mode 100644 index 0000000..5ae3e1c --- /dev/null +++ b/QPalmaDP/io.cpp @@ -0,0 +1,96 @@ +#include "io.h" +#include "common.h" + +#include +#include +#include + +FILE* CIO::target=stdout; + +CIO::CIO() +{ +} + +void CIO::message(EMessageType prio, const CHAR *fmt, ... ) +{ + check_target(); + print_message_prio(prio, target); + va_list list; + va_start(list,fmt); + vfprintf(target,fmt,list); + va_end(list); + fflush(target); +} + +void CIO::buffered_message(EMessageType prio, const CHAR *fmt, ... ) +{ + check_target(); + print_message_prio(prio, target); + va_list list; + va_start(list,fmt); + vfprintf(target,fmt,list); + va_end(list); +} + +CHAR* CIO::skip_spaces(CHAR* str) +{ + INT i=0; + + if (str) + { + for (i=0; isspace(str[i]); i++); + + return &str[i]; + } + else + return str; +} + +void CIO::set_target(FILE* t) +{ + target=t; +} + +void CIO::check_target() +{ + if (!target) + target=stdout; +} + +void CIO::print_message_prio(EMessageType prio, FILE* target) +{ + switch (prio) + { + case M_DEBUG: + fprintf(target, "[DEBUG] "); + break; + case M_PROGRESS: + fprintf(target, "[PROGRESS] "); + break; + case M_INFO: + //fprintf(target, "[INFO] "); + break; + case M_NOTICE: + fprintf(target, "[NOTICE] "); + break; + case M_WARN: + fprintf(target, "[WARN] "); + break; + case M_ERROR: + fprintf(target, "[ERROR] "); + break; + case M_CRITICAL: + fprintf(target, "[CRITICAL] "); + break; + case M_ALERT: + fprintf(target, "[ALERT] "); + break; + case M_EMERGENCY: + fprintf(target, "[EMERGENCY] "); + break; + case M_MESSAGEONLY: + break; + default: + break; + } +} diff --git a/QPalmaDP/io.h b/QPalmaDP/io.h new file mode 100644 index 0000000..c127f92 --- /dev/null +++ b/QPalmaDP/io.h @@ -0,0 +1,33 @@ +#ifndef __CIO_H__ +#define __CIO_H__ + +#include "common.h" + +#include +#include + +class CIO +{ +public: + CIO(); + + static void set_target(FILE* target); + static void message(EMessageType prio, const CHAR *fmt, ... ); + + inline static void not_implemented() + { + message(M_ERROR, "Sorry, not yet implemented\n"); + } + + static void buffered_message(EMessageType prio, const CHAR *fmt, ... ); + + static CHAR* skip_spaces(CHAR* str); + +protected: + static void check_target(); + static void print_message_prio(EMessageType prio, FILE* target); + +protected: + static FILE* target; +}; +#endif diff --git a/QPalmaDP/penalty_info.cpp b/QPalmaDP/penalty_info.cpp new file mode 100644 index 0000000..8095ca1 --- /dev/null +++ b/QPalmaDP/penalty_info.cpp @@ -0,0 +1,354 @@ +#include +#include "config.h" +//#include "features/CharFeatures.h" +//#include "features/StringFeatures.h" + +#include +#include + +#include "io.h" + + +#include "fill_matrix.h" +#include "penalty_info.h" + +void init_penalty_struct(struct penalty_struct &PEN) +{ + PEN.limits=NULL ; + PEN.penalties=NULL ; + PEN.id=-1 ; + PEN.next_pen=NULL ; + PEN.transform = T_LINEAR ; + PEN.name = NULL ; + PEN.max_len=0 ; + PEN.min_len=0 ; + PEN.cache=NULL ; + PEN.use_svm=0 ; +} + +void init_penalty_struct_cache(struct penalty_struct &PEN) +{ + if (PEN.cache || PEN.use_svm) + return ; + + REAL* cache=new REAL[PEN.max_len+1] ; + if (cache) + { + for (INT i=0; i<=PEN.max_len; i++) + cache[i] = lookup_penalty(&PEN, i, 0, false) ; + PEN.cache = cache ; + } +} + +void delete_penalty_struct(struct penalty_struct &PEN) +{ + if (PEN.id!=-1) + { + delete[] PEN.limits ; + delete[] PEN.penalties ; + delete[] PEN.name ; + delete[] PEN.cache ; + } +} + +void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) +{ + for (int i=0; iP-1) +// { +// CIO::message(M_ERROR, "id out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// INT max_len = (INT) mxGetScalar(mx_max_len_field) ; +// if (max_len<0 || max_len>1024*1024*100) +// { +// CIO::message(M_ERROR, "max_len out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].max_len = max_len ; +// +// INT min_len = (INT) mxGetScalar(mx_min_len_field) ; +// if (min_len<0 || min_len>1024*1024*100) +// { +// CIO::message(M_ERROR, "min_len out of range\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].min_len = min_len ; +// +// if (PEN[id].id!=-1) +// { +// CIO::message(M_ERROR, "penalty id already used\n") ; +// delete_penalty_struct_array(PEN,P) ; +// return NULL ; +// } +// PEN[id].id=id ; +// if (next_id>=0) +// PEN[id].next_pen=&PEN[next_id] ; +// //fprintf(stderr,"id=%i, next_id=%i\n", id, next_id) ; +// +// assert(next_id!=id) ; +// PEN[id].use_svm=use_svm ; +// PEN[id].limits = new REAL[len] ; +// PEN[id].penalties = new REAL[len] ; +// double * limits = mxGetPr(mx_limits_field) ; +// double * penalties = mxGetPr(mx_penalties_field) ; +// +// for (INT i=0; iuse_svm>0) ; + REAL d_value=d_values[PEN->use_svm-1] ; + //fprintf(stderr,"transform=%i, d_value=%1.2f\n", (INT)PEN->transform, d_value) ; + + switch (PEN->transform) + { + case T_LINEAR: + break ; + case T_LOG: + d_value = log(d_value) ; + break ; + case T_LOG_PLUS1: + d_value = log(d_value+1) ; + break ; + case T_LOG_PLUS3: + d_value = log(d_value+3) ; + break ; + case T_LINEAR_PLUS3: + d_value = d_value+3 ; + break ; + default: + CIO::message(M_ERROR, "unknown transform\n") ; + break ; + } + + INT idx = 0 ; + REAL ret ; + for (INT i=0; ilen; i++) + if (PEN->limits[i]<=d_value) + idx++ ; + + if (idx==0) + ret=PEN->penalties[0] ; + else if (idx==PEN->len) + ret=PEN->penalties[PEN->len-1] ; + else + { + ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]* + (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ; + } + + //fprintf(stderr,"ret=%1.2f\n", ret) ; + + if (PEN->next_pen) + ret+=lookup_penalty(PEN->next_pen, p_value, d_values); + + //fprintf(stderr,"ret=%1.2f\n", ret) ; + + return ret ; +} + +REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value, + REAL* svm_values, bool follow_next) +{ + if (PEN==NULL) + return 0 ; + if (PEN->use_svm) + return lookup_penalty_svm(PEN, p_value, svm_values) ; + + if ((p_valuemin_len) || (p_value>PEN->max_len)) + return -CMath::INFTY ; + + if (PEN->cache!=NULL && (p_value>=0) && (p_value<=PEN->max_len)) + { + REAL ret=PEN->cache[p_value] ; + if (PEN->next_pen && follow_next) + ret+=lookup_penalty(PEN->next_pen, p_value, svm_values); + return ret ; + } + + REAL d_value = (REAL) p_value ; + switch (PEN->transform) + { + case T_LINEAR: + break ; + case T_LOG: + d_value = log(d_value) ; + break ; + case T_LOG_PLUS1: + d_value = log(d_value+1) ; + break ; + case T_LOG_PLUS3: + d_value = log(d_value+3) ; + break ; + case T_LINEAR_PLUS3: + d_value = d_value+3 ; + break ; + default: + CIO::message(M_ERROR, "unknown transform\n") ; + break ; + } + + INT idx = 0 ; + REAL ret ; + for (INT i=0; ilen; i++) + if (PEN->limits[i]<=d_value) + idx++ ; + + if (idx==0) + ret=PEN->penalties[0] ; + else if (idx==PEN->len) + ret=PEN->penalties[PEN->len-1] ; + else + { + ret = (PEN->penalties[idx]*(d_value-PEN->limits[idx-1]) + PEN->penalties[idx-1]* + (PEN->limits[idx]-d_value)) / (PEN->limits[idx]-PEN->limits[idx-1]) ; + } + //if (p_value>=30 && p_value<150) + //fprintf(stderr, "%s %i(%i) -> %1.2f\n", PEN->name, p_value, idx, ret) ; + + if (PEN->next_pen && follow_next) + ret+=lookup_penalty(PEN->next_pen, p_value, svm_values); + + return ret ; +} diff --git a/QPalmaDP/penalty_info.h b/QPalmaDP/penalty_info.h new file mode 100644 index 0000000..2ed9ee1 --- /dev/null +++ b/QPalmaDP/penalty_info.h @@ -0,0 +1,42 @@ +#ifndef __PENALTY_INFO_H__ +#define __PENALTY_INFO_H__ + +#include "common.h" +#include "Mathmatics.h" + +enum ETransformType +{ + T_LINEAR, + T_LOG, + T_LOG_PLUS1, + T_LOG_PLUS3, + T_LINEAR_PLUS3 +} ; + +struct penalty_struct +{ + INT len ; + REAL *limits ; + REAL *penalties ; + INT max_len ; + INT min_len ; + REAL *cache ; + enum ETransformType transform ; + INT id ; + struct penalty_struct *next_pen ; + char * name ; + INT use_svm ; +} ; + +void init_penalty_struct(struct penalty_struct &PEN) ; +void delete_penalty_struct(struct penalty_struct &PEN) ; +void delete_penalty_struct_array(struct penalty_struct *PEN, INT len) ; + +//#ifdef HAVE_MATLAB +//struct penalty_struct * read_penalty_struct_from_cell(const mxArray * mx_penalty_info, mwSize &P) ; +//#endif + +REAL lookup_penalty(const struct penalty_struct *PEN, INT p_value, + REAL* svm_values, bool follow_next=true) ; + +#endif diff --git a/QPalmaDP/print_align.cpp b/QPalmaDP/print_align.cpp new file mode 100644 index 0000000..312cabc --- /dev/null +++ b/QPalmaDP/print_align.cpp @@ -0,0 +1,97 @@ +#include "fill_matrix.h" +using namespace std; + +void print_align(Pre_score* matrix, int length_est, int length_dna,Align_pair* vektor, int result_length, int print_matrix) + +{ + //Druck der matrix + switch(print_matrix) + { + case 1: + { + cout <<"Matrix: \n"; + + for (int k = 0; kvalue <<"\t"; + } + cout <<"\n"; + } + break; + } + + case 2: + { + cout <<"Matrix: \n"; + + for (int k = 0; kprev_i) == k-1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1)) + cout << "\\" <<((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l-1)) + cout << "<"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else if (((((Pre_score*)matrix + (k*length_dna) + l)->prev_i) == k -1) && ((((Pre_score*)matrix + (k*length_dna) + l)->prev_j) == l)) + cout << "|"<< ((Pre_score*)matrix + (k*length_dna) + l)->value <<"\t"; + else { cout << ((Pre_score*)matrix + (k*length_dna) + l)->value << "("; + cout << ((Pre_score*)matrix + (k*length_dna) + l)->prev_i << "," << ((Pre_score*)matrix + (k*length_dna) + l)->prev_j << ")\t"; + } + + } + cout <<"\n"; + } + + cout << "length_est" << length_est << "\n"; + cout << "length_dna" << length_dna << "\n"; + cout << "result_length: " << result_length << "\n"; + + + break; + } + + case 3: + { + //Druck des Alignment + + // cout << "Alignment\n" ; + int j = 0; + // cout << "resultlength: " << result_length <<"\n"; + while (j 1 + + + /***************************************************************************/ + // 1. dna + /***************************************************************************/ + int dna_len = mxGetN(prhs[arg]) + 1; // mxGetN: number of columns (int) + char* dna = (char*)mxCalloc(dna_len, sizeof(char)); + + failure = mxGetString(prhs[arg], dna, dna_len); // NULL at end of string + if (failure) { // mxGetString returns 0 if success, 1 on failure + mexPrintf("couldn't open dna"); + return; + } + arg++; // arg -> 2 + + + /***************************************************************************/ + // 2. est + /***************************************************************************/ + int est_len = mxGetN(prhs[arg]) + 1; + char* est = (char*)mxCalloc(est_len, sizeof(char)); + + failure = mxGetString(prhs[arg], est, est_len); + if (failure) { + mexPrintf("couldn't open est"); + return; + } + arg++; // arg -> 3 + + + /***************************************************************************/ + // 3. h (cell array) + /***************************************************************************/ + penalty_struct* functions; + mwSize anz_func = 1; + + functions = read_penalty_struct_from_cell(prhs[arg], anz_func); + //std::cout << " bla " << functions->max_len << " \n"; + //std::cout << "lookup_penalty(5) = " << lookup_penalty(functions, 5, NULL, 0)< 4 + + + /***************************************************************************/ + // 4. match matrix (columnwise) + /***************************************************************************/ + int nr_entries = mxGetN(prhs[arg]) * mxGetM(prhs[arg]); + //mexPrintf("\nNumber of entries in matchmatrix: %d\n", nr_entries) ; + double *matchmatrix; + matchmatrix = mxGetPr(prhs[arg]); + //mexPrintf("matchmatrix\n") ; + //for (int i=0; i< nr_entries; i++) { + // mexPrintf("entry %d: <%f>\n", i, matchmatrix[i]); + //} + arg++; // arg -> 5 + + + /***************************************************************************/ + // 5. donor + /***************************************************************************/ + int donor_length = mxGetN(prhs[arg]); + double *donor; + donor = mxGetPr(prhs[arg]); + //mexPrintf("Donor eingelesen: <%f>\n", donor[0]); + arg++; // arg -> 6 + + + /***************************************************************************/ + // 6. acceptor + /***************************************************************************/ + int acceptor_length = mxGetN(prhs[arg]); + double *acceptor; + acceptor = mxGetPr(prhs[arg]); + //mexPrintf("Acceptor eingelesen: <%f>\n", acceptor[0]); + arg++; // arg -> 7 + + + /***************************************************************************/ + // 7.remove_duplicate_scores + /***************************************************************************/ + bool remove_duplicate_scores = (bool)*mxGetPr(prhs[arg]); + arg++; // arg -> 8 + + + /***************************************************************************/ + // 8. print_matrix + /***************************************************************************/ + int print_matrix = (int)*mxGetPr(prhs[arg]); + arg++; // arg -> 9 + + + /***************************************************************************/ + // initialize alignment matrices and call fill_matrix() + /***************************************************************************/ + + //possible donor positions + int nr_donor_sites = 0 ; + for (int ii=0; iiprev_i; + mwSize prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j; + mwSize prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no; + + for (int ii=result_length; ii>0; ii--) { + if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = check_char(est[i-1]) ; + } + else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = 0 ; //gap + } + else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA + DNA[ii-1] = 0 ; //gap + EST[ii-1] = check_char(est[i-1]) ; + } + else {// splice site + for (j; j > prev_j; j--) { + DNA[ii-1] = check_char(dna[j-1]) ; + EST[ii-1] = 6 ; //intron + ii-- ; + } + ii++ ; // last ii-- too much (because done in for-loop) + } + + i = prev_i; + j = prev_j; + prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i; + prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j; + prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no; + } + + } //end of z + + + + //mexPrintf("DEBUG:Returning arguments\n"); + int index = 0 ; + + //splice_align (mxArray): + const mwSize dims0[] = {(dna_len-1), nr_paths}; + //const int dims0[] = {(dna_len-1), nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims0, mxINT32_CLASS, mxREAL); + mwSize* start0 = (mwSize*)(mxGetData(plhs[index])); + memcpy(start0, splice_align, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->1 + + //est_align (mxArray): + const mwSize dims1[] = {(est_len-1), nr_paths}; + //const int dims1[] = {(est_len-1), nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims1, mxINT32_CLASS, mxREAL); + mwSize* start1 = (mwSize*)(mxGetData(plhs[index])); + memcpy(start1, est_align, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->2 + + //weightmatch (mxArray) + const mwSize dims2[] = {mlen*mlen, nr_paths}; + //const int dims2[] = {mlen*mlen, nr_paths}; + plhs[index] = mxCreateNumericArray(2, dims2, mxINT32_CLASS, mxREAL); + mwSize* start2= (mwSize*)(mxGetData(plhs[index])); + memcpy(start2, mmatrix_param, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->3 + + //alignmentscore + plhs[index] = mxCreateDoubleMatrix(1,nr_paths,mxREAL); + if (plhs[index] == NULL) + { + printf("%s : Out of memory on line %d\n", __FILE__, __LINE__); + printf("Unable to create mxArray.\n"); + exit(1); + } + double* start3 = (double*)(mxGetData(plhs[index])); + memcpy(start3, alignmentscores, mxGetM(plhs[index])*mxGetN(plhs[index])*mxGetElementSize(plhs[index])); + index++ ; // ->4 + + assert(nlhs==index+1) ; + + //Free Memory + //mexPrintf("DEBUG:Free Memory\n"); + mxFree(donor_sites); + mxFree(max_score_positions); + for (int i=nr_paths-1;i>=0; i--) + { + //delete[] matrices[i]; + mxFree(matrices[i]); + } + mxFree(matrices); + mxFree(splice_align); + mxFree(est_align); + mxFree(mmatrix_param); + mxFree(dna); + mxFree(est); + + return; + +} + diff --git a/QPalmaDP/qpalma_dp.h b/QPalmaDP/qpalma_dp.h new file mode 100644 index 0000000..c019466 --- /dev/null +++ b/QPalmaDP/qpalma_dp.h @@ -0,0 +1,83 @@ +#ifndef _QPALMA_DP_H_ +#define _QPALMA_DP_H_ + +#include "penalty_info.h" + +struct ArrayElem { //24B + double score; + int prev_j; + int prev_i; + int prev_matrix_no; + bool isSplice; +} ; + +struct pre_score { //24B + double value; //8 + int prev_i; //4 + int prev_j; //4 + int prev_matrix_no;//4 + bool issplice;//4 +}; + +typedef struct pre_score Pre_score; + +typedef struct Align_pair { //24B + double value ; + bool issplice; + int previ,prevj ; + char est_char; + char dna_char; +}; + + +void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, penalty_struct* functions, double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions); + +int check_char(char base); + +bool result_align (Pre_score* matrices[], int matrixnr, int i, int j, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions ); + +extern void print_align(Pre_score* matrix, int length_est, int length_dna, Align_pair* vektor, int result_length, int print_matrix); + + +/* + * The function myalign calculates a new alignment given the scoring scheme + * Its input arguments are: + * + * num_path(id) -> an integer specifying which alignment has to be done + * dna -> + * est -> + * h -> + * mmatrix -> + * donor -> + * acceptor -> + * remove_duplicate_scores -> a boolean + * print_matrix -> a boolean + * + * [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = myalign_local(... + */ + +class Alignment { + + private: + SpliceAlign + EstAlign; + WeightMatch; + TotalScores; + + public: + Alignment(); + ~Alignment(); + + void myalign(int nr_paths, dna, est, h, matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, bool print_matrix); + getSpliceAlign() + getEstAlign(); + getWeightMatch(); + getTotalScores(); + getDNAEST(); + + +}; + + +#endif // _QPALMA_DP_H_ + diff --git a/QPalmaDP/result_align.cpp b/QPalmaDP/result_align.cpp new file mode 100644 index 0000000..2a49f68 --- /dev/null +++ b/QPalmaDP/result_align.cpp @@ -0,0 +1,156 @@ +#include "assert.h" +#include "fill_matrix.h" + +using namespace std; + +bool result_align(Pre_score* matrices[], int z, int est_len, int dna_len, int* result_length_ptr, char* est, char* dna, int* s_align, int* e_align, int* mparam, double* alignmentscores, int* max_score_positions) +//Gibt 1 zurueck, wenn kein weiterer Pfad existiert, sonst 0 + +{ + const int mlen=6; // length of matchmatrix + int startend[4] ; // [i_start, j_start, i_end, j_end] ; + startend[3] = max_score_positions[2*z] ; //i (est) + startend[4] = max_score_positions[2*z +1] ; //j (dna) + //printf("in result_align: z=%d:(%d, %d)\n", z, startend[3], startend[4]) ; + int dnanum ; //using check_char: letter -> number + int estnum ; //using check_char: letter -> number + + int dna_pos ; + int est_pos ; + int splice_state ; + int est_state ; + for (dna_pos=dna_len-1; dna_pos>startend[4]; dna_pos--) { + s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + for (est_pos=est_len-1; est_pos>startend[3]; est_pos--) { + e_align[est_pos-1] = 4 ; + } + //splice_align: 0: exon, 1: donor, 2: acceptor, 3: intro,. 4: dangling end + + + /* starting point for backtracking */ + int i = startend[3] ; + int j = startend[4] ; + int act_m_no = z ; + alignmentscores[z] = ((Pre_score*)matrices[z] + i*dna_len +j)->value; + //printf("alignment_score for z=%d: %3.8f\n", z, alignmentscores[z]) ; + + int prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_i; + int prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_j; + int prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no; + + if(!(isfinite(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value))) + { + cout << "Kein weiterer Pfad vorhanden\n"; + return 1; + } + + if(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value==0) { + cout << "Kein weiterer Pfad vorhanden\n"; + return 1; + } + + assert(dna_pos == j) ; + assert(est_pos == i) ; + splice_state = 0 ; //exon + est_state = 1 ; //[112222111 ... for exons of length 2,4,3 ... + + //compute length of alignment (between max(m,n) and m+n) + //local_alignment: backtracking to first Zero: look at actual value + while(!(((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value)==0.0) { + + //just some info + //if ((z>0) && (!(z==prev_m_no))) { + // printf("(%d,%d): next step in next matrix\n", i,j) ; + //} + + if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch + (*result_length_ptr)= (*result_length_ptr) + 1; + + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + e_align[est_pos-1] = est_state ; //1 or 2, depended + est_pos-- ; + + dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0 + estnum = check_char(est[i-1]) ; //-1 because index starts with 0 + mparam[mlen*dnanum +estnum] ++ ; + } + else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST + (*result_length_ptr)= (*result_length_ptr) + 1; + + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + + dnanum = check_char(dna[j-1]) ; //-1 because index starts with 0 + estnum = 0 ; //gap + mparam[mlen*dnanum +estnum] ++ ; + } + else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA + (*result_length_ptr)= (*result_length_ptr) + 1; + + e_align[est_pos-1] = est_state ; //1 or 2, depended + est_pos-- ; + + dnanum = 0 ; //gap + estnum = check_char(est[i-1]) ; //-1 because index starts with 0 + mparam[mlen*dnanum +estnum] ++ ; + } + else {// splice site + (*result_length_ptr) = (*result_length_ptr) + (j - prev_j); + if (((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice==0) { + printf("matrices[act_m_no](i,j): %3.15f\n",((Pre_score*)matrices[act_m_no] + i*dna_len +j)->value) ; + printf("matrices[prev_m_no](i,j): %3.15f\n",((Pre_score*)matrices[prev_m_no] + i*dna_len +j)->value) ; + printf("issplice(act): %d\n",((Pre_score*)matrices[act_m_no] + i*dna_len + j)->issplice) ; + printf("issplice(prev): %d\n",((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->issplice) ; + printf("act_m_no: %d, prev_m_no: %d\n", act_m_no, prev_m_no) ; + printf("j: %d -> prev_j:%d\n", j, prev_j) ; + printf("i: %d -> prev_i:%d\n", i, prev_i) ; + } + if (est_state==1) { //was exon labeled "1" + est_state = 2 ; //new exon is labeled "2" + } + else { + est_state = 1 ; //last exon was labeled "2", new exon is labeled "1" + } + for (j; j > prev_j; j--) { + if (splice_state == 0) { //coming from exon + splice_state = 2 ; //first intron_state: acceptor + } else { + splice_state = 3 ; //intron + } + if (j-1 == prev_j) { //last intron_state: donor + splice_state = 1 ;//donor + } + s_align[dna_pos-1] = splice_state; + dna_pos-- ; + } + splice_state = 0 ; //exon again + + } + + startend[1] = i; + startend[2] = j; + i = prev_i; + j = prev_j; + act_m_no = prev_m_no ; + + prev_i = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_i; + prev_j = ((Pre_score*)matrices[act_m_no] + i*dna_len + j)->prev_j; + prev_m_no = ((Pre_score*)matrices[act_m_no] + i*dna_len +j)->prev_matrix_no; + } + for (dna_pos; dna_pos>0; dna_pos--) { + s_align[dna_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + for (est_pos; est_pos>0; est_pos--) { + e_align[est_pos-1] = 4 ; // not aligned (-1 because array starts with index 0) + } + + //printf("(%d, %d) --> (%d, %d)\n", startend[1], startend[2], startend[3], startend[4]) ; + //printf("result_length: %d\n", *result_length_ptr) ; + + + return 0; +} + + diff --git a/python/.qpalma.py.swp b/python/.qpalma.py.swp index 7e345a8..4f1759f 100644 Binary files a/python/.qpalma.py.swp and b/python/.qpalma.py.swp differ diff --git a/python/computeSpliceWeights.py b/python/computeSpliceWeights.py index e4c421c..51c6e9f 100644 --- a/python/computeSpliceWeights.py +++ b/python/computeSpliceWeights.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from numpy import inf from numpy.matlib import zeros def calculateWeights(plf, scores): @@ -22,42 +23,44 @@ def calculateWeights(plf, scores): y_i and y_i+1 parameters, so the fractions are saved in the weight vectors! """ - currentWeight = zeros((1,30)) + + currentWeight = zeros((30,1)) for k in range(len(scores)): value = scores[k] - Lower = sum(d.limits <= value) - Upper = Lower +1 ; # x-werte bleiben fest + + Lower = 0 + Lower = len([elem for elem in plf.limits if elem <= value]) + Upper = Lower+1 ; # x-werte bleiben fest if Lower == 0: - currentWeight[1] = currentWeight[1] + 1; - elif: - Lower == length[plf.limits] - currentWeight[30] = currentWeight[30] + 1; + currentWeight[0] = currentWeight[0] + 1; + elif Lower == len(plf.limits): + currentWeight[-1] = currentWeight[-1] + 1; else: - weightup = (value - plf.limits(Lower)) / (plf.limits(Upper) - plf.limits(Lower)) ; - weightlow = (plf.limits(Upper) - value) / (plf.limits(Upper) - plf.limits(Lower)) ; - currentWeight(Upper) = currentWeight(Upper) + weightup; - currentWeight(Lower) = currentWeight(Lower) + weightlow; + weightup = (value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower]) ; + weightlow = (plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower]) ; + currentWeight[Upper] = currentWeight[Upper] + weightup; + currentWeight[Lower] = currentWeight[Lower] + weightlow; return currentWeight def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp): - assert(isempty(d.transform)) - assert(isempty(a.transform)) - assert(isempty(h.transform)) + #assert(isempty(d.transform)) + #assert(isempty(a.transform)) + #assert(isempty(h.transform)) #################################################################################### # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen #################################################################################### - # Lege Gewichtsvektor fuer 30 supporting points an: - weightDon = zeros((1,30)) - # Picke die Positionen raus, an denen eine Donorstelle ist - DonorScores = don_supp(find(SpliceAlign==1)) ; - assert(all(DonorScores~=-Inf)) ; + #DonorScores = don_supp(find(SpliceAlign==1)) ; + #assert(all(DonorScores~=-Inf)) ; + + DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1] + assert not ( -inf in DonorScores ) weightDon = calculateWeights(d,DonorScores) @@ -65,25 +68,28 @@ def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp): # 2. Acceptor: In acc_supp stehen Werte der SVM., in SpliceAlign die Einsen #################################################################################### - AcceptorScores = acc_supp(find(SpliceAlign == 2)+1) ; - assert(all(AcceptorScores~=-Inf)) ; + #AcceptorScores = acc_supp(find(SpliceAlign == 2)+1) ; + #assert(all(AcceptorScores~=-Inf)) ; #Den Vektor Acceptorstellen durchgehen und die Gewichtsvektoren belasten: - weightAcc = calculateWeights(a,AcceptorScores) + AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if SpliceAlign[pos] == 2] + #assert not ( -inf in AcceptorScores ) + weightAcc = calculateWeights(a,AcceptorScores) #################################################################################### # 3. Intron length: SpliceAlign: Gaps zaehlen und auf Gapgewichte addieren #################################################################################### - intron_starts = find(SpliceAlign==1) ; - intron_ends = find(SpliceAlign==2) ; + #intron_starts = find(SpliceAlign==1) ; + #intron_ends = find(SpliceAlign==2) ; - %assert: introns do not overlap - assert(length(intron_starts) == length(intron_ends)) ; - assert(all(intron_starts < intron_ends)) ; - assert(all(intron_ends(1:end-1) < intron_starts(2:end))) ; + #%assert: introns do not overlap + #assert(length(intron_starts) == length(intron_ends)) ; + #assert(all(intron_starts < intron_ends)) ; + #assert(all(intron_ends(1:end-1) < intron_starts(2:end))) ; - weightIntron = calculateWeights(h,AcceptorScores) + #weightIntron = calculateWeights(h,AcceptorScores) + weightIntron = [1,2,3] return weightDon, weightAcc, weightIntron diff --git a/python/qpalma.py b/python/qpalma.py index 71ab47a..e48073e 100644 --- a/python/qpalma.py +++ b/python/qpalma.py @@ -10,6 +10,8 @@ import numpy.matlib from set_param_palma import * from computeSpliceAlign import * +from computeSpliceWeights import * + """ A training method for the QPalma project @@ -213,15 +215,32 @@ def run(): don_supp = Donors[exampleId] acc_supp = Acceptors[exampleId] - # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...) - true_SpliceAlign, true_weightMatch = computeSpliceAlign(dna, exons) + trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons) + trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp) pdb.set_trace() - true_weightDon, true_weightAcc, true_weightIntron = computeSpliceWeights(d, a, h, true_SpliceAlign, don_supp, acc_supp) - - + # + # compute wrong alignments + # + + # Compute donor, acceptor with penalty_lookup_new + # returns two double lists + [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ; + + #myalign wants the acceptor site on the g of the ag + acceptor = [acceptor(2:end) -Inf] ; + + # call myalign + [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ... + myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ... + remove_duplicate_scores, print_matrix); + + SpliceAlign = double(SpliceAlign') ; %column + weightMatch = double(weightMatch') ; + + iteration_nr += 1 break @@ -229,16 +248,7 @@ def run(): for id = 1:N %fprintf('%i\n', id) ; if (mod(id,50)==0), fprintf(1,'.'); end - - %some assertions concerning the splicesites - idx_don = find(don_supp~=-Inf) ; - assert(all(dna(idx_don)=='g')) ; - assert(all(dna(idx_don(idx_don3)-2)=='a')) ; - assert(all(dna(idx_acc(idx_acc>2)-1)=='g')) ; - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % True Alignment and Comparison with wrong ones %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -365,9 +375,7 @@ def run(): end ; end fprintf('\n') ; - - - + ################################################################################# const_added = solver.addConstraint(deltas, loss, pos, marginRescaling) objValue,w,self.slacks = solver.solve() @@ -377,14 +385,8 @@ def run(): # maxgap=max(gap) [mean(xis>=1) mean(gap>=1) mean(xis>=1e-3) mean(gap>=1e-3)] - [h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation) ; - - save(paramfilename, ... - 'param', 'h', 'd', 'a', 'mmatrix', 'A', 'b') ; - if max(gap)<=column_eps: break - assert(all(maxgap<=column_eps)) assert(length(take_idx)==N) @@ -395,7 +397,6 @@ if __name__ == '__main__': run() """ - def __init__(self): numAcceptorParams = 0 numDonorParams = 0