+ fixed a bug in the C++ interface
[qpalma.git] / QPalmaDP / fill_matrix.cpp
1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
4 //
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
7 //
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
11
12 /*
13 matchmatrix: columnwise
14 - A C G T N (dna)
15 - x x x x x x
16 A x x x x x x
17 C x x x x x x
18 G x x x x x x
19 T x x x x x x
20 N x x x x x x
21 (est)
22 */
23
24 /*
25 alignment matrix: columnwise
26 |-EST . . .
27 -+------------
28 -|00 ...
29 D|0.
30 N|. .
31 A|. .
32 .|.
33 .|
34 .|
35 */
36
37
38 /* TODO
39 - max_len = 10000 for elegans!
40 - do not use double
41
42 */
43
44
45 #include "fill_matrix.h"
46
47 using namespace std;
48
49 int check_char(char base)
50 {
51 switch(base)
52 {
53 case 'A':
54 case 'a': return 1;
55 case 'C':
56 case 'c': return 2;
57 case 'G':
58 case 'g': return 3;
59 case 'T':
60 case 't': return 4;
61 case 'N':
62 case 'n': return 5;
63 default : return 0;
64 }
65
66 }
67
68
69
70 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
71 {
72 // printf("Entering fill_matrix...\n");
73
74 /*********************************************************************************************/
75 // variables
76 /*********************************************************************************************/
77 const int mlen = 6; // length of matchmatrix
78 int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
79 double prevValue ;
80 double tempValue;
81 int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
82 //printf("max_len: %d\n", max_len) ;
83
84 int num_elem ; // counter variable
85
86 const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
87 double *scores=new double[max_num_elem] ; //are sorted later
88 struct ArrayElem *array=new ArrayElem[max_num_elem] ;
89 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
90
91 int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
92 int splice_pos ; //splice position get from donor_sites
93 int start_splice_pos ; //where to start looking in donor_sites
94
95 double max_scores[nr_paths] ;
96
97 /*********************************************************************************************/
98 // Initialisation of all alignment matrices, columnwise
99 /*********************************************************************************************/
100 for (int z=0; z<nr_paths; z++) /* matrice z */
101 {
102 max_scores[z] = log(0.0) ;
103 max_score_positions[2*z] = 0; // pos i
104 max_score_positions[2*z +1] = 0; // pos j
105
106 for (int i=0; i<est_len; i++) /* i.column */
107 {
108 for (int j=0; j<dna_len; j++) /* j.line */
109 {
110 ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
111 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
112 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
113 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
114 ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
115 }
116 }
117 }
118
119
120 /*********************************************************************************************/
121 // Element (0,0) for first/best matrice (z=0)
122 /*********************************************************************************************/
123 ((Pre_score*)matrices[0])->value = 0;
124 ((Pre_score*)matrices[0])->prev_i = 0;
125 ((Pre_score*)matrices[0])->prev_j = 0;
126 ((Pre_score*)matrices[0])->prev_matrix_no = 0;
127 ((Pre_score*)matrices[0])->issplice = 0;
128
129
130 /*********************************************************************************************/
131 // Line Zero for first/best matrice (z=0)
132 /*********************************************************************************************/
133 for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
134 {
135 ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
136 ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
137 ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
138 ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
139 ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
140 }
141
142
143 /*********************************************************************************************/
144 // Column Zero for first/best matrice (z=0)
145 /*********************************************************************************************/
146 for (int j=1; j<dna_len; j++) // 0.column, all lines
147 {
148 ((Pre_score*)matrices[0] +j)->value = 0 ;
149 ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
150 ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
151 ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
152 ((Pre_score*)matrices[0] +j)->issplice = 0;
153 }
154
155
156 /*********************************************************************************************/
157 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
158 /*********************************************************************************************/
159
160
161
162 /*********************************************************************************************/
163 // all other entries for all matrices, last column and row treated seperately
164 /*********************************************************************************************/
165
166 /* columnwise, 0.column and 0.line already filled */
167 for (int i=1; i<est_len; i++) /* i.column */
168 {
169
170 start_splice_pos = 0 ; //where to start looking in donor_sites
171 for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
172
173 for (int j=1; j<dna_len; j++) //j.line
174 {
175 dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
176 estChar = check_char(est[i-1]) ;
177
178
179 //if introns of length greater than max_len possible
180 if (j > max_len)
181 {
182 //storing highest ss_score (the position) and NOT -INF
183 int new_ss_pos = j-max_len ;
184 double donor_score = donor[new_ss_pos-1] ;
185 if (isfinite(donor_score)) //splice site!
186 {
187 if (max_ss_element[nr_paths-1] != 0)
188 {
189 for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
190 {
191 int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
192 double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
193 if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
194 for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
195 {
196 int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
197 assert(isfinite(donor[act_splice_position-1])) ;
198 double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
199 if (new_intron_score >= act_intron_score)
200 {
201 new_intron_score = act_intron_score ;
202 pos_with_minscore = pos ;
203 } //else nothing happens, new intron score still the worst one
204 }
205 if (pos_with_minscore != nr_paths)
206 {
207 max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
208 }
209 }
210 }
211 else
212 {
213 for (int pos=0; pos<nr_paths; pos++)
214 {
215 if (max_ss_element[pos]==0) //no previous splice site at this pos
216 {
217 for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
218 break ;
219 }
220 }
221 }
222 }
223 //start_pos in donor_sites so that only introns shorter then max_length are seen
224 if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
225 {
226 start_splice_pos++ ;
227 }
228 }
229
230
231 /* Computing all possible scores */
232 num_elem = 0 ; /* number of different scores/possibilities so far */
233 for (int z=0; z<nr_paths; z++) /* matrice z */
234 {
235 Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
236
237 /***************************************/
238 // 0. Zero (local alignment version)
239 /***************************************/
240 assert(num_elem<max_num_elem) ;
241 array[num_elem].prev_i = 0;
242 array[num_elem].prev_j = 0;
243 array[num_elem].prev_matrix_no = z;
244 array[num_elem].isSplice = false; /* no splice site */
245 scores[num_elem] = 0 ;
246 num_elem++ ;
247
248
249 /***************************************/
250 // 1. gap on EST (j-1,i) -> (j,i)
251 /***************************************/
252 prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
253 if (isfinite(prevValue))
254 {
255 tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */
256 if (isfinite(tempValue))
257 {
258 assert(num_elem<max_num_elem) ;
259 array[num_elem].prev_i = i;
260 array[num_elem].prev_j = j-1; /* predecessor */
261 array[num_elem].prev_matrix_no = z;
262 array[num_elem].isSplice = false; /* no splice site */
263 scores[num_elem] = -tempValue ;
264 num_elem++ ;
265 }
266 }
267
268
269 /***************************************/
270 // 2. match/mismatch (j-1,i-1) -> (j,i)
271 /***************************************/
272 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
273 if (isfinite(prevValue))
274 {
275 tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST
276 if (isfinite(tempValue))
277 {
278 assert(num_elem<max_num_elem) ;
279 array[num_elem].prev_i = i-1; /* predecessor */
280 array[num_elem].prev_j = j-1; /* predecessor */
281 array[num_elem].prev_matrix_no = z;
282 array[num_elem].isSplice = false; /* no splice site */
283 scores[num_elem] = -tempValue ;
284 num_elem++ ;
285 }
286 }
287
288
289 /***************************************/
290 // 3. gap on DNA (j,i-1) -> (j,i)
291 /***************************************/
292 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
293 if (isfinite(prevValue))
294 {
295 tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
296 if (isfinite(tempValue))
297 {
298 assert(num_elem<max_num_elem) ;
299 array[num_elem].prev_i = i-1; /* predecessor */
300 array[num_elem].prev_j = j;
301 array[num_elem].prev_matrix_no = z;
302 array[num_elem].isSplice = false; /* no splice site */
303 scores[num_elem]=-tempValue ;
304 num_elem++ ;
305 }
306
307 }
308
309
310 /***************************************/
311 // 4. all possible splice sites/introns
312 /***************************************/
313 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
314 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
315 // donor_sites: the g's of the gt/gc
316
317 if (isfinite(acceptor[j-1]))
318 {
319
320 //the nr_paths many introns of length greater than max_len
321 for (int pos=0; pos<nr_paths; pos++)
322 {
323 if (max_ss_element[z*nr_paths+pos]>0)
324 {
325 assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
326 splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
327 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
328 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
329 {
330 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
331 assert(dist>max_len) ;
332 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
333 if (isfinite(tempValue))
334 {
335 assert(num_elem<max_num_elem) ;
336 array[num_elem].prev_i = i;
337 array[num_elem].prev_j = splice_pos-1; // predecessor
338 array[num_elem].prev_matrix_no = z;
339 array[num_elem].isSplice = true; // splice site
340 scores[num_elem]=-tempValue ;
341 num_elem++ ;
342 }
343 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
344 {
345 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
346 sleep(5000) ;
347 }
348 }
349 }
350 }
351
352
353
354 //all introns of length <= max_len
355 for (int k=start_splice_pos; k < nr_donor_sites; k++)
356 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
357 {
358 splice_pos = donor_sites[k] ;
359 if (splice_pos > j-4) { break ;}
360 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
361
362 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
363 {
364 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
365 assert(dist<=max_len) ;
366 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
367 if (isfinite(tempValue))
368 {
369 assert(num_elem<max_num_elem) ;
370 array[num_elem].prev_i = i;
371 array[num_elem].prev_j = splice_pos-1; /* predecessor */
372 array[num_elem].prev_matrix_no = z;
373 array[num_elem].isSplice = true; /* splice site */
374 scores[num_elem]=-tempValue ;
375 num_elem++ ;
376 }
377 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
378 {
379 printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
380 sleep(5000) ;
381 }
382 }
383 }
384 }
385
386 } //end of z
387
388 CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
389
390 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
391 for (int z=0; z<nr_paths; z++) {
392 if (z>=num_elem) {
393 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
394 }
395 else {
396 ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
397 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
398 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
399 ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
400 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
401
402 if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
403 max_scores[z] = -scores[z] ;
404 max_score_positions[2*z] = i; // pos i
405 max_score_positions[2*z +1] = j; // pos j
406 }
407 }
408 } //end of z
409
410 } //end of j
411 } // end of i
412
413
414
415 //free memory
416 delete[] array;
417 delete[] scores;
418
419 // printf("Leaving fill_matrix...\n");
420 return;
421 }
422
423