+ added code for negative strand processing
[qpalma.git] / dyn_prog / fill_matrix.cpp
1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
4 //
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
7 //
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
11
12 /*
13 matchmatrix: columnwise
14 - A C G T N (dna)
15 - x x x x x x
16 A x x x x x x
17 C x x x x x x
18 G x x x x x x
19 T x x x x x x
20 N x x x x x x
21 (est)
22 */
23
24 /*
25 alignment matrix: columnwise
26 |-EST . . .
27 -+------------
28 -|00 ...
29 D|0.
30 N|. .
31 A|. .
32 .|.
33 .|
34 .|
35 */
36
37
38 /* TODO
39 - max_len = 10000 for elegans!
40 - do not use double
41 */
42
43 #include "fill_matrix.h"
44 #include "debug_tools.h"
45
46 #define D_USE_QUALITY_SCORES 1
47
48 int check_char(char base)
49 {
50 switch(base)
51 {
52 case 'A':
53 case 'a': return 1;
54 case 'C':
55 case 'c': return 2;
56 case 'G':
57 case 'g': return 3;
58 case 'T':
59 case 't': return 4;
60 case 'N':
61 case 'n': return 5;
62 //default : return 0;
63 default : return -1;
64 }
65
66 }
67
68 /* system of Palma. Otherwise scoring_functions should point to an array of
69 * plifs scoring the respective pairs of characters together with the EST
70 * quality score.
71 */
72
73 inline double getScore(struct penalty_struct* qualityScores, int mlen, int dnaChar, int estChar, double baseScore) {
74 double score = 0.0;
75 FA(estChar > 0 && estChar < 6);
76 FA(dnaChar > -1 && dnaChar < 6);
77
78 //printf("Starting scoring / scheme is USE_QUALITY_SCORES\n");
79 //printf("estChar,dnaChar are: %d,%d,%d",estChar,dnaChar);
80
81 int currentPos = (estChar-1)*6+dnaChar;
82 struct penalty_struct currentPen = qualityScores[currentPos];
83
84 //if( estChar == 1 and dnaChar == 1)
85 // printf("A-A : %f %f\n",currentPen.penalties[0],currentPen.penalties[1]);
86
87 INT idx = 0 ;
88 for (INT i=0; i<currentPen.len; i++)
89 if (currentPen.limits[i] <= baseScore)
90 idx++ ;
91
92 if (idx==0)
93 score=currentPen.penalties[0] ;
94 else if (idx==currentPen.len)
95 score=currentPen.penalties[currentPen.len-1] ;
96 else
97 {
98 score = (currentPen.penalties[idx]*(baseScore-currentPen.limits[idx-1]) + currentPen.penalties[idx-1]*
99 (currentPen.limits[idx]-baseScore)) / (currentPen.limits[idx]-currentPen.limits[idx-1]) ;
100 }
101
102 //score = lookup_penalty(&currentPen,idx, NULL, 0);
103 //if(baseScore <= currentPen.limits[0]) {
104 // score = currentPen.penalties[0];
105 //} else {
106 // int sp_idx;
107 // for(sp_idx=0;sp_idx<currentPen.len;sp_idx++) {
108 // if(currentPen.limits[sp_idx] <= baseScore )
109 // score = currentPen.penalties[sp_idx];
110 // //printf("Current score is %f\n",score);
111 // }
112 //}
113
114 return score;
115 }
116
117 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, double* prb, penalty_struct* functions , double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
118 {
119 //printf("Entering fill_matrix...\n");
120
121 /*********************************************************************************************/
122 // variables
123 /*********************************************************************************************/
124 const int mlen = 6; // length of matchmatrix
125 int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
126
127 double baseScore;
128 double *est_scores = prb ;
129
130 double prevValue ;
131 double tempValue;
132 int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
133 //printf("max_len: %d\n", max_len) ;
134
135 int num_elem ; // counter variable
136
137 const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
138 double *scores=new double[max_num_elem] ; //are sorted later
139 struct ArrayElem *array=new ArrayElem[max_num_elem] ;
140 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
141
142 int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
143 int splice_pos ; //splice position get from donor_sites
144 int start_splice_pos ; //where to start looking in donor_sites
145
146 double max_scores[nr_paths] ;
147
148 /*********************************************************************************************/
149 // Initialisation of all alignment matrices, columnwise
150 /*********************************************************************************************/
151 for (int z=0; z<nr_paths; z++) /* matrice z */
152 {
153 max_scores[z] = log(0.0) ;
154 max_score_positions[2*z] = 0; // pos i
155 max_score_positions[2*z +1] = 0; // pos j
156
157 for (int i=0; i<est_len; i++) /* i.column */
158 {
159 for (int j=0; j<dna_len; j++) /* j.line */
160 {
161 ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
162 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
163 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
164 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
165 ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
166 }
167 }
168 }
169
170 /*********************************************************************************************/
171 // Element (0,0) for first/best matrice (z=0)
172 /*********************************************************************************************/
173 ((Pre_score*)matrices[0])->value = 0;
174 ((Pre_score*)matrices[0])->prev_i = 0;
175 ((Pre_score*)matrices[0])->prev_j = 0;
176 ((Pre_score*)matrices[0])->prev_matrix_no = 0;
177 ((Pre_score*)matrices[0])->issplice = 0;
178
179
180 /*********************************************************************************************/
181 // Line Zero for first/best matrice (z=0)
182 /*********************************************************************************************/
183 for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
184 {
185 ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
186 ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
187 ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
188 ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
189 ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
190 }
191
192
193 /*********************************************************************************************/
194 // Column Zero for first/best matrice (z=0)
195 /*********************************************************************************************/
196 for (int j=1; j<dna_len; j++) // 0.column, all lines
197 {
198 ((Pre_score*)matrices[0] +j)->value = 0 ;
199 ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
200 ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
201 ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
202 ((Pre_score*)matrices[0] +j)->issplice = 0;
203 }
204
205
206 /*********************************************************************************************/
207 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
208 /*********************************************************************************************/
209
210
211 //printf("after initialization\n");
212
213 /*********************************************************************************************/
214 // all other entries for all matrices, last column and row treated seperately
215 /*********************************************************************************************/
216
217 /* columnwise, 0.column and 0.line already filled */
218 for (int i=1; i<est_len; i++) /* i.column */
219 {
220
221 start_splice_pos = 0 ; //where to start looking in donor_sites
222 for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
223
224 for (int j=1; j<dna_len; j++) //j.line
225 {
226 dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
227 estChar = check_char(est[i-1]) ;
228 baseScore = est_scores[i-1];
229
230 //printf("at iteration %d %d\n",i,j);
231
232 //if introns of length greater than max_len possible
233 if (j > max_len)
234 {
235 //storing highest ss_score (the position) and NOT -INF
236 int new_ss_pos = j-max_len ;
237 double donor_score = donor[new_ss_pos-1] ;
238 if (isfinite(donor_score)) //splice site!
239 {
240 if (max_ss_element[nr_paths-1] != 0)
241 {
242 for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
243 {
244 int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
245 double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
246 if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
247 for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
248 {
249 int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
250 assert(isfinite(donor[act_splice_position-1])) ;
251 double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
252 if (new_intron_score >= act_intron_score)
253 {
254 new_intron_score = act_intron_score ;
255 pos_with_minscore = pos ;
256 } //else nothing happens, new intron score still the worst one
257 }
258 if (pos_with_minscore != nr_paths)
259 {
260 max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
261 }
262 }
263 }
264 else
265 {
266 for (int pos=0; pos<nr_paths; pos++)
267 {
268 if (max_ss_element[pos]==0) //no previous splice site at this pos
269 {
270 for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
271 break ;
272 }
273 }
274 }
275 }
276 //start_pos in donor_sites so that only introns shorter then max_length are seen
277 if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
278 {
279 start_splice_pos++ ;
280 }
281 }
282
283
284 /* Computing all possible scores */
285 num_elem = 0 ; /* number of different scores/possibilities so far */
286 for (int z=0; z<nr_paths; z++) /* matrice z */
287 {
288 Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
289
290 /***************************************/
291 // 0. Zero (local alignment version)
292 /***************************************/
293 assert(num_elem<max_num_elem) ;
294 array[num_elem].prev_i = 0;
295 array[num_elem].prev_j = 0;
296 array[num_elem].prev_matrix_no = z;
297 array[num_elem].isSplice = false; /* no splice site */
298 scores[num_elem] = 0 ;
299 num_elem++ ;
300
301
302 /***************************************/
303 // 1. gap on EST (j-1,i) -> (j,i)
304 /***************************************/
305 prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
306 if (isfinite(prevValue))
307 {
308 //if( currentMode == USE_QUALITY_SCORES ) {
309 #if D_USE_QUALITY_SCORES
310 tempValue = prevValue + matchmatrix[dnaChar];
311 #else
312 //} else {
313 tempValue = prevValue + matchmatrix[mlen*dnaChar]; /* score(gap,DNA) */
314 //}
315 #endif
316
317 //printf("tempValue: %f\n",tempValue);
318
319 if (isfinite(tempValue))
320 {
321 assert(num_elem<max_num_elem) ;
322 array[num_elem].prev_i = i;
323 array[num_elem].prev_j = j-1; /* predecessor */
324 array[num_elem].prev_matrix_no = z;
325 array[num_elem].isSplice = false; /* no splice site */
326 scores[num_elem] = -tempValue ;
327 num_elem++ ;
328 }
329 }
330
331
332 /***************************************/
333 // 2. match/mismatch (j-1,i-1) -> (j,i)
334 /***************************************/
335 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
336 if (isfinite(prevValue))
337 {
338 //if( currentMode == USE_QUALITY_SCORES ) {
339 #if D_USE_QUALITY_SCORES
340 tempValue = prevValue + getScore(qualityScores,mlen,dnaChar,estChar,baseScore);
341 #else
342 //} else {
343 tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);
344 #endif
345 //}
346
347 //printf("tempValue: %f\n",tempValue);
348
349 if (isfinite(tempValue))
350 {
351 assert(num_elem<max_num_elem) ;
352 array[num_elem].prev_i = i-1; /* predecessor */
353 array[num_elem].prev_j = j-1; /* predecessor */
354 array[num_elem].prev_matrix_no = z;
355 array[num_elem].isSplice = false; /* no splice site */
356 scores[num_elem] = -tempValue ;
357 num_elem++ ;
358 }
359 }
360
361
362 /***************************************/
363 // 3. gap on DNA (j,i-1) -> (j,i)
364 /***************************************/
365 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
366 if (isfinite(prevValue))
367 {
368 //if( currentMode == USE_QUALITY_SCORES ) {
369 #if D_USE_QUALITY_SCORES
370 tempValue = prevValue + getScore(qualityScores,mlen,0,estChar,baseScore);
371 #else
372 // } else {
373 tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
374 // }
375 #endif
376
377 //printf("tempValue: %f\n",tempValue);
378
379 if (isfinite(tempValue))
380 {
381 assert(num_elem<max_num_elem) ;
382 array[num_elem].prev_i = i-1; /* predecessor */
383 array[num_elem].prev_j = j;
384 array[num_elem].prev_matrix_no = z;
385 array[num_elem].isSplice = false; /* no splice site */
386 scores[num_elem]=-tempValue ;
387 num_elem++ ;
388 }
389 }
390
391 /***************************************/
392 // 4. all possible splice sites/introns
393 /***************************************/
394 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
395 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
396 // donor_sites: the g's of the gt/gc
397
398 if (isfinite(acceptor[j-1]))
399 {
400
401 //the nr_paths many introns of length greater than max_len
402 for (int pos=0; pos<nr_paths; pos++)
403 {
404 if (max_ss_element[z*nr_paths+pos]>0)
405 {
406 assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
407 splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
408 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
409 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
410 {
411 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
412 assert(dist>max_len) ;
413 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
414
415 //printf("tempValue: %f\n",tempValue);
416
417 if (isfinite(tempValue))
418 {
419 assert(num_elem<max_num_elem) ;
420 array[num_elem].prev_i = i;
421 array[num_elem].prev_j = splice_pos-1; // predecessor
422 array[num_elem].prev_matrix_no = z;
423 array[num_elem].isSplice = true; // splice site
424 scores[num_elem]=-tempValue ;
425 num_elem++ ;
426 }
427 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
428 {
429 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
430 sleep(5000) ;
431 }
432 }
433 }
434 }
435
436 //all introns of length <= max_len
437 for (int k=start_splice_pos; k < nr_donor_sites; k++)
438 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
439 {
440 splice_pos = donor_sites[k] ;
441 if (splice_pos > j-4) { break ;}
442 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
443
444 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
445 {
446 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
447 assert(dist<=max_len) ;
448 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
449 if (isfinite(tempValue))
450 {
451 assert(num_elem<max_num_elem) ;
452 array[num_elem].prev_i = i;
453 array[num_elem].prev_j = splice_pos-1; /* predecessor */
454 array[num_elem].prev_matrix_no = z;
455 array[num_elem].isSplice = true; /* splice site */
456 scores[num_elem]=-tempValue ;
457 num_elem++ ;
458 }
459 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
460 {
461 printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
462 sleep(5000) ;
463 }
464 }
465 }
466 }
467
468 } //end of z
469
470 CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
471
472 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
473 for (int z=0; z<nr_paths; z++) {
474 if (z>=num_elem) {
475 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
476 }
477 else {
478 ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
479 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
480 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
481 ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
482 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
483
484 if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
485 max_scores[z] = -scores[z] ;
486 max_score_positions[2*z] = i; // pos i
487 max_score_positions[2*z +1] = j; // pos j
488 }
489 }
490 } //end of z
491
492 } //end of j
493 } // end of i
494
495 //free memory
496 delete[] array;
497 delete[] scores;
498
499 //printf("Leaving fill_matrix...\n");
500 return;
501 }