+ added python parsers for gff and solexa / read files
[qpalma.git] / QPalmaDP / fill_matrix.cpp
1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
4 //
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
7 //
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
11
12 /*
13 matchmatrix: columnwise
14 - A C G T N (dna)
15 - x x x x x x
16 A x x x x x x
17 C x x x x x x
18 G x x x x x x
19 T x x x x x x
20 N x x x x x x
21 (est)
22 */
23
24 /*
25 alignment matrix: columnwise
26 |-EST . . .
27 -+------------
28 -|00 ...
29 D|0.
30 N|. .
31 A|. .
32 .|.
33 .|
34 .|
35 */
36
37
38 /* TODO
39 - max_len = 10000 for elegans!
40 - do not use double
41 */
42
43 #include "fill_matrix.h"
44
45 using namespace std;
46
47 int check_char(char base)
48 {
49 switch(base)
50 {
51 case 'A':
52 case 'a': return 1;
53 case 'C':
54 case 'c': return 2;
55 case 'G':
56 case 'g': return 3;
57 case 'T':
58 case 't': return 4;
59 case 'N':
60 case 'n': return 5;
61 default : return 0;
62 }
63
64 }
65
66 /* if scoring_functions is a NULL pointer then we realize the "normal" scoring
67 * system of Palma. Otherwiese scoring_functions should point to an array of
68 * plifs scoring the respective pairs of characters together with the EST
69 * quality score.
70 */
71
72 double scoreMatch(int dnaChar, int estChar, double baseScore){
73 for(int idx=0;idx<24;idx++)
74 struct penalty_struct currentPen = ::qualityScores[idx];
75
76 return 1.0;
77 }
78
79 double getScore(double *matchmatrix,int mlen, int dnaChar, int estChar, double baseScore, mode scoringScheme) {
80 double score = 0.0;
81
82 if( scoringScheme == NORMAL ) {
83 if( estChar == -1 )
84 score = matchmatrix[mlen*dnaChar];
85
86 if( dnaChar == -1 )
87 score = matchmatrix[estChar];
88
89 if( estChar != -1 && dnaChar != -1 )
90 score = matchmatrix[mlen* dnaChar +estChar];
91 } else {
92 score = scoreMatch(dnaChar,estChar,baseScore);
93 }
94
95 return score;
96 }
97
98
99 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
100 {
101 // printf("Entering fill_matrix...\n");
102
103 /*********************************************************************************************/
104 // variables
105 /*********************************************************************************************/
106 const int mlen = 6; // length of matchmatrix
107 int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
108
109
110 double baseScore;
111 double *est_scores = new double[est_len];
112 for(int i=0;i<est_len;i++)
113 est_scores[i] = 0.0;
114
115 mode currentMode = NORMAL;
116
117
118 double prevValue ;
119 double tempValue;
120 int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
121 //printf("max_len: %d\n", max_len) ;
122
123 int num_elem ; // counter variable
124
125 const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
126 double *scores=new double[max_num_elem] ; //are sorted later
127 struct ArrayElem *array=new ArrayElem[max_num_elem] ;
128 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
129
130 int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
131 int splice_pos ; //splice position get from donor_sites
132 int start_splice_pos ; //where to start looking in donor_sites
133
134 double max_scores[nr_paths] ;
135
136 /*********************************************************************************************/
137 // The penalty functions
138 /*********************************************************************************************/
139
140 /* scoring_functions is an array of size 4x6
141 *
142 */
143
144 struct penalty_struct* scoring_functions = 0;
145
146 /*********************************************************************************************/
147 // Initialisation of all alignment matrices, columnwise
148 /*********************************************************************************************/
149 for (int z=0; z<nr_paths; z++) /* matrice z */
150 {
151 max_scores[z] = log(0.0) ;
152 max_score_positions[2*z] = 0; // pos i
153 max_score_positions[2*z +1] = 0; // pos j
154
155 for (int i=0; i<est_len; i++) /* i.column */
156 {
157 for (int j=0; j<dna_len; j++) /* j.line */
158 {
159 ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
160 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
161 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
162 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
163 ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
164 }
165 }
166 }
167
168
169 /*********************************************************************************************/
170 // Element (0,0) for first/best matrice (z=0)
171 /*********************************************************************************************/
172 ((Pre_score*)matrices[0])->value = 0;
173 ((Pre_score*)matrices[0])->prev_i = 0;
174 ((Pre_score*)matrices[0])->prev_j = 0;
175 ((Pre_score*)matrices[0])->prev_matrix_no = 0;
176 ((Pre_score*)matrices[0])->issplice = 0;
177
178
179 /*********************************************************************************************/
180 // Line Zero for first/best matrice (z=0)
181 /*********************************************************************************************/
182 for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
183 {
184 ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
185 ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
186 ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
187 ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
188 ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
189 }
190
191
192 /*********************************************************************************************/
193 // Column Zero for first/best matrice (z=0)
194 /*********************************************************************************************/
195 for (int j=1; j<dna_len; j++) // 0.column, all lines
196 {
197 ((Pre_score*)matrices[0] +j)->value = 0 ;
198 ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
199 ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
200 ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
201 ((Pre_score*)matrices[0] +j)->issplice = 0;
202 }
203
204
205 /*********************************************************************************************/
206 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
207 /*********************************************************************************************/
208
209
210
211 /*********************************************************************************************/
212 // all other entries for all matrices, last column and row treated seperately
213 /*********************************************************************************************/
214
215 /* columnwise, 0.column and 0.line already filled */
216 for (int i=1; i<est_len; i++) /* i.column */
217 {
218
219 start_splice_pos = 0 ; //where to start looking in donor_sites
220 for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
221
222 for (int j=1; j<dna_len; j++) //j.line
223 {
224 dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
225 estChar = check_char(est[i-1]) ;
226 baseScore = est_scores[i-1];
227
228
229 //if introns of length greater than max_len possible
230 if (j > max_len)
231 {
232 //storing highest ss_score (the position) and NOT -INF
233 int new_ss_pos = j-max_len ;
234 double donor_score = donor[new_ss_pos-1] ;
235 if (isfinite(donor_score)) //splice site!
236 {
237 if (max_ss_element[nr_paths-1] != 0)
238 {
239 for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
240 {
241 int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
242 double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
243 if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
244 for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
245 {
246 int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
247 assert(isfinite(donor[act_splice_position-1])) ;
248 double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
249 if (new_intron_score >= act_intron_score)
250 {
251 new_intron_score = act_intron_score ;
252 pos_with_minscore = pos ;
253 } //else nothing happens, new intron score still the worst one
254 }
255 if (pos_with_minscore != nr_paths)
256 {
257 max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
258 }
259 }
260 }
261 else
262 {
263 for (int pos=0; pos<nr_paths; pos++)
264 {
265 if (max_ss_element[pos]==0) //no previous splice site at this pos
266 {
267 for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
268 break ;
269 }
270 }
271 }
272 }
273 //start_pos in donor_sites so that only introns shorter then max_length are seen
274 if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
275 {
276 start_splice_pos++ ;
277 }
278 }
279
280
281 /* Computing all possible scores */
282 num_elem = 0 ; /* number of different scores/possibilities so far */
283 for (int z=0; z<nr_paths; z++) /* matrice z */
284 {
285 Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
286
287 /***************************************/
288 // 0. Zero (local alignment version)
289 /***************************************/
290 assert(num_elem<max_num_elem) ;
291 array[num_elem].prev_i = 0;
292 array[num_elem].prev_j = 0;
293 array[num_elem].prev_matrix_no = z;
294 array[num_elem].isSplice = false; /* no splice site */
295 scores[num_elem] = 0 ;
296 num_elem++ ;
297
298
299 /***************************************/
300 // 1. gap on EST (j-1,i) -> (j,i)
301 /***************************************/
302 prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
303 if (isfinite(prevValue))
304 {
305 // tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */
306 tempValue = prevValue + getScore(matchmatrix,mlen,dnaChar,-1,baseScore,currentMode);
307 if (isfinite(tempValue))
308 {
309 assert(num_elem<max_num_elem) ;
310 array[num_elem].prev_i = i;
311 array[num_elem].prev_j = j-1; /* predecessor */
312 array[num_elem].prev_matrix_no = z;
313 array[num_elem].isSplice = false; /* no splice site */
314 scores[num_elem] = -tempValue ;
315 num_elem++ ;
316 }
317 }
318
319
320 /***************************************/
321 // 2. match/mismatch (j-1,i-1) -> (j,i)
322 /***************************************/
323 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
324 if (isfinite(prevValue))
325 {
326 //tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST
327 tempValue = prevValue + getScore(matchmatrix,mlen,dnaChar,estChar,baseScore,currentMode);
328 if (isfinite(tempValue))
329 {
330 assert(num_elem<max_num_elem) ;
331 array[num_elem].prev_i = i-1; /* predecessor */
332 array[num_elem].prev_j = j-1; /* predecessor */
333 array[num_elem].prev_matrix_no = z;
334 array[num_elem].isSplice = false; /* no splice site */
335 scores[num_elem] = -tempValue ;
336 num_elem++ ;
337 }
338 }
339
340
341 /***************************************/
342 // 3. gap on DNA (j,i-1) -> (j,i)
343 /***************************************/
344 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
345 if (isfinite(prevValue))
346 {
347 // tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
348 tempValue = prevValue + getScore(matchmatrix,mlen,-1,estChar,baseScore,currentMode);
349 if (isfinite(tempValue))
350 {
351 assert(num_elem<max_num_elem) ;
352 array[num_elem].prev_i = i-1; /* predecessor */
353 array[num_elem].prev_j = j;
354 array[num_elem].prev_matrix_no = z;
355 array[num_elem].isSplice = false; /* no splice site */
356 scores[num_elem]=-tempValue ;
357 num_elem++ ;
358 }
359
360 }
361
362
363 /***************************************/
364 // 4. all possible splice sites/introns
365 /***************************************/
366 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
367 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
368 // donor_sites: the g's of the gt/gc
369
370 if (isfinite(acceptor[j-1]))
371 {
372
373 //the nr_paths many introns of length greater than max_len
374 for (int pos=0; pos<nr_paths; pos++)
375 {
376 if (max_ss_element[z*nr_paths+pos]>0)
377 {
378 assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
379 splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
380 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
381 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
382 {
383 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
384 assert(dist>max_len) ;
385 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
386 if (isfinite(tempValue))
387 {
388 assert(num_elem<max_num_elem) ;
389 array[num_elem].prev_i = i;
390 array[num_elem].prev_j = splice_pos-1; // predecessor
391 array[num_elem].prev_matrix_no = z;
392 array[num_elem].isSplice = true; // splice site
393 scores[num_elem]=-tempValue ;
394 num_elem++ ;
395 }
396 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
397 {
398 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
399 sleep(5000) ;
400 }
401 }
402 }
403 }
404
405
406
407 //all introns of length <= max_len
408 for (int k=start_splice_pos; k < nr_donor_sites; k++)
409 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
410 {
411 splice_pos = donor_sites[k] ;
412 if (splice_pos > j-4) { break ;}
413 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
414
415 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
416 {
417 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
418 assert(dist<=max_len) ;
419 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
420 if (isfinite(tempValue))
421 {
422 assert(num_elem<max_num_elem) ;
423 array[num_elem].prev_i = i;
424 array[num_elem].prev_j = splice_pos-1; /* predecessor */
425 array[num_elem].prev_matrix_no = z;
426 array[num_elem].isSplice = true; /* splice site */
427 scores[num_elem]=-tempValue ;
428 num_elem++ ;
429 }
430 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
431 {
432 printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
433 sleep(5000) ;
434 }
435 }
436 }
437 }
438
439 } //end of z
440
441 CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
442
443 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
444 for (int z=0; z<nr_paths; z++) {
445 if (z>=num_elem) {
446 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
447 }
448 else {
449 ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
450 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
451 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
452 ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
453 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
454
455 if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
456 max_scores[z] = -scores[z] ;
457 max_score_positions[2*z] = i; // pos i
458 max_score_positions[2*z +1] = j; // pos j
459 }
460 }
461 } //end of z
462
463 } //end of j
464 } // end of i
465
466
467
468 //free memory
469 delete[] array;
470 delete[] scores;
471
472 // printf("Leaving fill_matrix...\n");
473 return;
474 }
475
476