+ parameter transfer to python layer works
[qpalma.git] / QPalmaDP / fill_matrix.cpp
1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
4 //
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
7 //
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
11
12 /*
13 matchmatrix: columnwise
14 - A C G T N (dna)
15 - x x x x x x
16 A x x x x x x
17 C x x x x x x
18 G x x x x x x
19 T x x x x x x
20 N x x x x x x
21 (est)
22 */
23
24 /*
25 alignment matrix: columnwise
26 |-EST . . .
27 -+------------
28 -|00 ...
29 D|0.
30 N|. .
31 A|. .
32 .|.
33 .|
34 .|
35 */
36
37
38 /* TODO
39 - max_len = 10000 for elegans!
40 - do not use double
41 */
42
43 #include "fill_matrix.h"
44
45 using namespace std;
46
47 int check_char(char base)
48 {
49 switch(base)
50 {
51 case 'A':
52 case 'a': return 1;
53 case 'C':
54 case 'c': return 2;
55 case 'G':
56 case 'g': return 3;
57 case 'T':
58 case 't': return 4;
59 case 'N':
60 case 'n': return 5;
61 default : return 0;
62 }
63
64 }
65
66 /* if scoring_functions is a NULL pointer then we realize the "normal" scoring
67 * system of Palma. Otherwiese scoring_functions should point to an array of
68 * plifs scoring the respective pairs of characters together with the EST
69 * quality score.
70 */
71
72 double getScore(struct penalty_struct* qualityScores, double* matchmatrix, int mlen, int dnaChar, int estChar, double baseScore, mode scoringScheme) {
73 double score = 0.0;
74
75 if( scoringScheme == NORMAL ) {
76 if( estChar == -1 )
77 score = matchmatrix[mlen*dnaChar];
78
79 if( dnaChar == -1 )
80 score = matchmatrix[estChar];
81
82 if( estChar != -1 && dnaChar != -1 )
83 score = matchmatrix[mlen* dnaChar +estChar];
84 } else {
85 int estIdx = check_char(estChar) - 1;
86 int dnaIdx = check_char(dnaChar) - 1;
87 // assert 0 <= estIdx <= 3
88 int rowSize = 6;
89 struct penalty_struct currentPen = qualityScores[estIdx*rowSize+dnaIdx];
90 score = 0;//scoreMatch(dnaChar,estChar,baseScore);
91 }
92
93 return score;
94 }
95
96 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions, mode currentMode)
97 {
98 // printf("Entering fill_matrix...\n");
99
100 /*********************************************************************************************/
101 // variables
102 /*********************************************************************************************/
103 const int mlen = 6; // length of matchmatrix
104 int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
105
106 printf("1st plif first three elems %f %f %f\n",qualityScores[0].penalties[0],qualityScores[0].penalties[1],qualityScores[0].penalties[2]);
107 printf("2nd plif first three elems %f %f %f\n",qualityScores[1].penalties[0],qualityScores[1].penalties[1],qualityScores[1].penalties[2]);
108
109 double baseScore;
110 double *est_scores = new double[est_len];
111 for(int i=0;i<est_len;i++)
112 est_scores[i] = 0.0;
113
114
115 double prevValue ;
116 double tempValue;
117 int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
118 //printf("max_len: %d\n", max_len) ;
119
120 int num_elem ; // counter variable
121
122 const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
123 double *scores=new double[max_num_elem] ; //are sorted later
124 struct ArrayElem *array=new ArrayElem[max_num_elem] ;
125 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
126
127 int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
128 int splice_pos ; //splice position get from donor_sites
129 int start_splice_pos ; //where to start looking in donor_sites
130
131 double max_scores[nr_paths] ;
132
133 /*********************************************************************************************/
134 // The penalty functions
135 /*********************************************************************************************/
136
137 /* scoring_functions is an array of size 4x6
138 *
139 */
140
141 struct penalty_struct* scoring_functions = 0;
142
143 /*********************************************************************************************/
144 // Initialisation of all alignment matrices, columnwise
145 /*********************************************************************************************/
146 for (int z=0; z<nr_paths; z++) /* matrice z */
147 {
148 max_scores[z] = log(0.0) ;
149 max_score_positions[2*z] = 0; // pos i
150 max_score_positions[2*z +1] = 0; // pos j
151
152 for (int i=0; i<est_len; i++) /* i.column */
153 {
154 for (int j=0; j<dna_len; j++) /* j.line */
155 {
156 ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
157 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
158 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
159 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
160 ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
161 }
162 }
163 }
164
165
166 /*********************************************************************************************/
167 // Element (0,0) for first/best matrice (z=0)
168 /*********************************************************************************************/
169 ((Pre_score*)matrices[0])->value = 0;
170 ((Pre_score*)matrices[0])->prev_i = 0;
171 ((Pre_score*)matrices[0])->prev_j = 0;
172 ((Pre_score*)matrices[0])->prev_matrix_no = 0;
173 ((Pre_score*)matrices[0])->issplice = 0;
174
175
176 /*********************************************************************************************/
177 // Line Zero for first/best matrice (z=0)
178 /*********************************************************************************************/
179 for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
180 {
181 ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
182 ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
183 ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
184 ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
185 ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
186 }
187
188
189 /*********************************************************************************************/
190 // Column Zero for first/best matrice (z=0)
191 /*********************************************************************************************/
192 for (int j=1; j<dna_len; j++) // 0.column, all lines
193 {
194 ((Pre_score*)matrices[0] +j)->value = 0 ;
195 ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
196 ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
197 ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
198 ((Pre_score*)matrices[0] +j)->issplice = 0;
199 }
200
201
202 /*********************************************************************************************/
203 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
204 /*********************************************************************************************/
205
206
207
208 /*********************************************************************************************/
209 // all other entries for all matrices, last column and row treated seperately
210 /*********************************************************************************************/
211
212 /* columnwise, 0.column and 0.line already filled */
213 for (int i=1; i<est_len; i++) /* i.column */
214 {
215
216 start_splice_pos = 0 ; //where to start looking in donor_sites
217 for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
218
219 for (int j=1; j<dna_len; j++) //j.line
220 {
221 dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
222 estChar = check_char(est[i-1]) ;
223 baseScore = est_scores[i-1];
224
225
226 //if introns of length greater than max_len possible
227 if (j > max_len)
228 {
229 //storing highest ss_score (the position) and NOT -INF
230 int new_ss_pos = j-max_len ;
231 double donor_score = donor[new_ss_pos-1] ;
232 if (isfinite(donor_score)) //splice site!
233 {
234 if (max_ss_element[nr_paths-1] != 0)
235 {
236 for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
237 {
238 int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
239 double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
240 if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
241 for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
242 {
243 int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
244 assert(isfinite(donor[act_splice_position-1])) ;
245 double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
246 if (new_intron_score >= act_intron_score)
247 {
248 new_intron_score = act_intron_score ;
249 pos_with_minscore = pos ;
250 } //else nothing happens, new intron score still the worst one
251 }
252 if (pos_with_minscore != nr_paths)
253 {
254 max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
255 }
256 }
257 }
258 else
259 {
260 for (int pos=0; pos<nr_paths; pos++)
261 {
262 if (max_ss_element[pos]==0) //no previous splice site at this pos
263 {
264 for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
265 break ;
266 }
267 }
268 }
269 }
270 //start_pos in donor_sites so that only introns shorter then max_length are seen
271 if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
272 {
273 start_splice_pos++ ;
274 }
275 }
276
277
278 /* Computing all possible scores */
279 num_elem = 0 ; /* number of different scores/possibilities so far */
280 for (int z=0; z<nr_paths; z++) /* matrice z */
281 {
282 Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
283
284 /***************************************/
285 // 0. Zero (local alignment version)
286 /***************************************/
287 assert(num_elem<max_num_elem) ;
288 array[num_elem].prev_i = 0;
289 array[num_elem].prev_j = 0;
290 array[num_elem].prev_matrix_no = z;
291 array[num_elem].isSplice = false; /* no splice site */
292 scores[num_elem] = 0 ;
293 num_elem++ ;
294
295
296 /***************************************/
297 // 1. gap on EST (j-1,i) -> (j,i)
298 /***************************************/
299 prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
300 if (isfinite(prevValue))
301 {
302 // tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */
303 tempValue = prevValue + getScore(qualityScores,matchmatrix,mlen,dnaChar,-1,baseScore,currentMode);
304 if (isfinite(tempValue))
305 {
306 assert(num_elem<max_num_elem) ;
307 array[num_elem].prev_i = i;
308 array[num_elem].prev_j = j-1; /* predecessor */
309 array[num_elem].prev_matrix_no = z;
310 array[num_elem].isSplice = false; /* no splice site */
311 scores[num_elem] = -tempValue ;
312 num_elem++ ;
313 }
314 }
315
316
317 /***************************************/
318 // 2. match/mismatch (j-1,i-1) -> (j,i)
319 /***************************************/
320 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
321 if (isfinite(prevValue))
322 {
323 //tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST
324 tempValue = prevValue + getScore(qualityScores,matchmatrix,mlen,dnaChar,estChar,baseScore,currentMode);
325 if (isfinite(tempValue))
326 {
327 assert(num_elem<max_num_elem) ;
328 array[num_elem].prev_i = i-1; /* predecessor */
329 array[num_elem].prev_j = j-1; /* predecessor */
330 array[num_elem].prev_matrix_no = z;
331 array[num_elem].isSplice = false; /* no splice site */
332 scores[num_elem] = -tempValue ;
333 num_elem++ ;
334 }
335 }
336
337
338 /***************************************/
339 // 3. gap on DNA (j,i-1) -> (j,i)
340 /***************************************/
341 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
342 if (isfinite(prevValue))
343 {
344 // tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
345 tempValue = prevValue + getScore(qualityScores,matchmatrix,mlen,-1,estChar,baseScore,currentMode);
346 if (isfinite(tempValue))
347 {
348 assert(num_elem<max_num_elem) ;
349 array[num_elem].prev_i = i-1; /* predecessor */
350 array[num_elem].prev_j = j;
351 array[num_elem].prev_matrix_no = z;
352 array[num_elem].isSplice = false; /* no splice site */
353 scores[num_elem]=-tempValue ;
354 num_elem++ ;
355 }
356
357 }
358
359
360 /***************************************/
361 // 4. all possible splice sites/introns
362 /***************************************/
363 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
364 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
365 // donor_sites: the g's of the gt/gc
366
367 if (isfinite(acceptor[j-1]))
368 {
369
370 //the nr_paths many introns of length greater than max_len
371 for (int pos=0; pos<nr_paths; pos++)
372 {
373 if (max_ss_element[z*nr_paths+pos]>0)
374 {
375 assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
376 splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
377 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
378 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
379 {
380 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
381 assert(dist>max_len) ;
382 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
383 if (isfinite(tempValue))
384 {
385 assert(num_elem<max_num_elem) ;
386 array[num_elem].prev_i = i;
387 array[num_elem].prev_j = splice_pos-1; // predecessor
388 array[num_elem].prev_matrix_no = z;
389 array[num_elem].isSplice = true; // splice site
390 scores[num_elem]=-tempValue ;
391 num_elem++ ;
392 }
393 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
394 {
395 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
396 sleep(5000) ;
397 }
398 }
399 }
400 }
401
402
403
404 //all introns of length <= max_len
405 for (int k=start_splice_pos; k < nr_donor_sites; k++)
406 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
407 {
408 splice_pos = donor_sites[k] ;
409 if (splice_pos > j-4) { break ;}
410 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
411
412 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
413 {
414 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
415 assert(dist<=max_len) ;
416 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
417 if (isfinite(tempValue))
418 {
419 assert(num_elem<max_num_elem) ;
420 array[num_elem].prev_i = i;
421 array[num_elem].prev_j = splice_pos-1; /* predecessor */
422 array[num_elem].prev_matrix_no = z;
423 array[num_elem].isSplice = true; /* splice site */
424 scores[num_elem]=-tempValue ;
425 num_elem++ ;
426 }
427 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
428 {
429 printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
430 sleep(5000) ;
431 }
432 }
433 }
434 }
435
436 } //end of z
437
438 CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
439
440 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
441 for (int z=0; z<nr_paths; z++) {
442 if (z>=num_elem) {
443 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
444 }
445 else {
446 ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
447 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
448 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
449 ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
450 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
451
452 if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
453 max_scores[z] = -scores[z] ;
454 max_score_positions[2*z] = i; // pos i
455 max_score_positions[2*z +1] = j; // pos j
456 }
457 }
458 } //end of z
459
460 } //end of j
461 } // end of i
462
463
464
465 //free memory
466 delete[] array;
467 delete[] scores;
468
469 // printf("Leaving fill_matrix...\n");
470 return;
471 }
472
473