1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
13 matchmatrix: columnwise
25 alignment matrix: columnwise
39 - max_len = 10000 for elegans!
45 #include "fill_matrix.h"
49 int check_char(char base
)
70 void fill_matrix(int nr_paths
, Pre_score
* matrices
[], int est_len
/*counter var: i*/, int dna_len
/*counter var: j*/, char* est
, char* dna
, penalty_struct
* functions
, double* matchmatrix
, double* donor
, double* acceptor
, bool remove_duplicate_scores
, int nr_donor_sites
, int* donor_sites
, int* max_score_positions
)
73 /*********************************************************************************************/
75 /*********************************************************************************************/
76 const int mlen
= 6; // length of matchmatrix
77 int estChar
, dnaChar
; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
80 int max_len
= (int)functions
->limits
[functions
->len
-1] ; //last (len) supporting point of h
81 //printf("max_len: %d\n", max_len) ;
83 int num_elem
; // counter variable
85 const int max_num_elem
= (4+nr_paths
+max_len
) * nr_paths
; // upper bound on num_elem
86 double *scores
=new double[max_num_elem
] ; //are sorted later
87 struct ArrayElem
*array
=new ArrayElem
[max_num_elem
] ;
88 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
90 int max_ss_element
[nr_paths
*nr_paths
]; //position on dna with max ss score for introns of length greater than max_len
91 int splice_pos
; //splice position get from donor_sites
92 int start_splice_pos
; //where to start looking in donor_sites
94 double max_scores
[nr_paths
] ;
96 /*********************************************************************************************/
97 // Initialisation of all alignment matrices, columnwise
98 /*********************************************************************************************/
99 for (int z
=0; z
<nr_paths
; z
++) /* matrice z */
101 max_scores
[z
] = log(0.0) ;
102 max_score_positions
[2*z
] = 0; // pos i
103 max_score_positions
[2*z
+1] = 0; // pos j
105 for (int i
=0; i
<est_len
; i
++) /* i.column */
107 for (int j
=0; j
<dna_len
; j
++) /* j.line */
109 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->value
= log(0.0); // -inf
110 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_i
= 0;
111 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_j
= 0;
112 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_matrix_no
= 0;
113 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->issplice
= 0;
119 /*********************************************************************************************/
120 // Element (0,0) for first/best matrice (z=0)
121 /*********************************************************************************************/
122 ((Pre_score
*)matrices
[0])->value
= 0;
123 ((Pre_score
*)matrices
[0])->prev_i
= 0;
124 ((Pre_score
*)matrices
[0])->prev_j
= 0;
125 ((Pre_score
*)matrices
[0])->prev_matrix_no
= 0;
126 ((Pre_score
*)matrices
[0])->issplice
= 0;
129 /*********************************************************************************************/
130 // Line Zero for first/best matrice (z=0)
131 /*********************************************************************************************/
132 for (int i
=1; i
<est_len
; i
++) /* 0.line, all columns (DNA='-') */
134 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->value
= 0 ;
135 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_i
= i
-1;
136 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_j
= 0;
137 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_matrix_no
= 0;
138 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->issplice
= 0;
142 /*********************************************************************************************/
143 // Column Zero for first/best matrice (z=0)
144 /*********************************************************************************************/
145 for (int j
=1; j
<dna_len
; j
++) // 0.column, all lines
147 ((Pre_score
*)matrices
[0] +j
)->value
= 0 ;
148 ((Pre_score
*)matrices
[0] +j
)->prev_i
= 0 ;
149 ((Pre_score
*)matrices
[0] +j
)->prev_j
= j
-1 ;
150 ((Pre_score
*)matrices
[0] +j
)->prev_matrix_no
= 0 ;
151 ((Pre_score
*)matrices
[0] +j
)->issplice
= 0;
155 /*********************************************************************************************/
156 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
157 /*********************************************************************************************/
161 /*********************************************************************************************/
162 // all other entries for all matrices, last column and row treated seperately
163 /*********************************************************************************************/
165 /* columnwise, 0.column and 0.line already filled */
166 for (int i
=1; i
<est_len
; i
++) /* i.column */
169 start_splice_pos
= 0 ; //where to start looking in donor_sites
170 for (int zz
=0; zz
<nr_paths
*nr_paths
; zz
++) {max_ss_element
[zz
]=0 ;} // initializing
172 for (int j
=1; j
<dna_len
; j
++) //j.line
174 dnaChar
= check_char(dna
[j
-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
175 estChar
= check_char(est
[i
-1]) ;
178 //if introns of length greater than max_len possible
181 //storing highest ss_score (the position) and NOT -INF
182 int new_ss_pos
= j
-max_len
;
183 double donor_score
= donor
[new_ss_pos
-1] ;
184 if (isfinite(donor_score
)) //splice site!
186 if (max_ss_element
[nr_paths
-1] != 0)
188 for (int zz
=0; zz
<nr_paths
; zz
++) //for each alignment matrix
190 int pos_with_minscore
= nr_paths
; //assuming, the new score is the worst score
191 double new_intron_score
= donor_score
+((Pre_score
*)matrices
[zz
]+(i
*dna_len
)+new_ss_pos
-1)->value
; //could be -INF
192 if (!(isfinite(new_intron_score
))) {printf("new_intron_score is -INF\n") ;}
193 for (int pos
=0; pos
<nr_paths
; pos
++) //find worst score of all nr_paths+1 scores
195 int act_splice_position
= max_ss_element
[zz
*nr_paths
+pos
] ;
196 assert(isfinite(donor
[act_splice_position
-1])) ;
197 double act_intron_score
= donor
[act_splice_position
-1]+((Pre_score
*)matrices
[zz
]+(i
*dna_len
)+act_splice_position
-1)->value
;
198 if (new_intron_score
>= act_intron_score
)
200 new_intron_score
= act_intron_score
;
201 pos_with_minscore
= pos
;
202 } //else nothing happens, new intron score still the worst one
204 if (pos_with_minscore
!= nr_paths
)
206 max_ss_element
[zz
*nr_paths
+pos_with_minscore
] = new_ss_pos
; //replacing
212 for (int pos
=0; pos
<nr_paths
; pos
++)
214 if (max_ss_element
[pos
]==0) //no previous splice site at this pos
216 for (int zz
=0; zz
<nr_paths
; zz
++) {max_ss_element
[zz
*nr_paths
+pos
]=j
-max_len
;}
222 //start_pos in donor_sites so that only introns shorter then max_length are seen
223 if ((start_splice_pos
< nr_donor_sites
) && donor_sites
[start_splice_pos
] == j
-max_len
)
230 /* Computing all possible scores */
231 num_elem
= 0 ; /* number of different scores/possibilities so far */
232 for (int z
=0; z
<nr_paths
; z
++) /* matrice z */
234 Pre_score
* actMatrix
= (Pre_score
*)matrices
[z
]; /* hm, why? */
236 /***************************************/
237 // 0. Zero (local alignment version)
238 /***************************************/
239 assert(num_elem
<max_num_elem
) ;
240 array
[num_elem
].prev_i
= 0;
241 array
[num_elem
].prev_j
= 0;
242 array
[num_elem
].prev_matrix_no
= z
;
243 array
[num_elem
].isSplice
= false; /* no splice site */
244 scores
[num_elem
] = 0 ;
248 /***************************************/
249 // 1. gap on EST (j-1,i) -> (j,i)
250 /***************************************/
251 prevValue
= ((Pre_score
*)actMatrix
+(i
*dna_len
)+j
-1)->value
; /* (j-1,i) -> (j,i) */
252 if (isfinite(prevValue
))
254 tempValue
= prevValue
+(matchmatrix
[mlen
* dnaChar
]); /* score(gap,DNA) */
255 if (isfinite(tempValue
))
257 assert(num_elem
<max_num_elem
) ;
258 array
[num_elem
].prev_i
= i
;
259 array
[num_elem
].prev_j
= j
-1; /* predecessor */
260 array
[num_elem
].prev_matrix_no
= z
;
261 array
[num_elem
].isSplice
= false; /* no splice site */
262 scores
[num_elem
] = -tempValue
;
268 /***************************************/
269 // 2. match/mismatch (j-1,i-1) -> (j,i)
270 /***************************************/
271 prevValue
= ((Pre_score
*)actMatrix
+(i
-1)*dna_len
+ j
-1)->value
; /* (j-1,i-1) -> (j,i) */
272 if (isfinite(prevValue
))
274 tempValue
= prevValue
+(matchmatrix
[mlen
* dnaChar
+estChar
]);//DNA mit EST
275 if (isfinite(tempValue
))
277 assert(num_elem
<max_num_elem
) ;
278 array
[num_elem
].prev_i
= i
-1; /* predecessor */
279 array
[num_elem
].prev_j
= j
-1; /* predecessor */
280 array
[num_elem
].prev_matrix_no
= z
;
281 array
[num_elem
].isSplice
= false; /* no splice site */
282 scores
[num_elem
] = -tempValue
;
288 /***************************************/
289 // 3. gap on DNA (j,i-1) -> (j,i)
290 /***************************************/
291 prevValue
= ((Pre_score
*)actMatrix
+(i
-1)*dna_len
+ j
)->value
;
292 if (isfinite(prevValue
))
294 tempValue
= prevValue
+(matchmatrix
[estChar
]); /* score(EST,gap) */
295 if (isfinite(tempValue
))
297 assert(num_elem
<max_num_elem
) ;
298 array
[num_elem
].prev_i
= i
-1; /* predecessor */
299 array
[num_elem
].prev_j
= j
;
300 array
[num_elem
].prev_matrix_no
= z
;
301 array
[num_elem
].isSplice
= false; /* no splice site */
302 scores
[num_elem
]=-tempValue
;
309 /***************************************/
310 // 4. all possible splice sites/introns
311 /***************************************/
312 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
313 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
314 // donor_sites: the g's of the gt/gc
316 if (isfinite(acceptor
[j
-1]))
319 //the nr_paths many introns of length greater than max_len
320 for (int pos
=0; pos
<nr_paths
; pos
++)
322 if (max_ss_element
[z
*nr_paths
+pos
]>0)
324 assert((j
-max_ss_element
[z
*nr_paths
+pos
]+1) > max_len
) ;
325 splice_pos
= max_ss_element
[z
*nr_paths
+pos
] ; //the g of the gt
326 prevValue
= ((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->value
; //one before the _gt
327 if (isfinite(prevValue
) && (((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->issplice
!=1))
329 int dist
= j
-(splice_pos
-1); //_gt...ag: dist from g to g
330 assert(dist
>max_len
) ;
331 tempValue
= lookup_penalty(functions
,dist
, NULL
, 0) +donor
[splice_pos
-1] +acceptor
[j
-1] +prevValue
;
332 if (isfinite(tempValue
))
334 assert(num_elem
<max_num_elem
) ;
335 array
[num_elem
].prev_i
= i
;
336 array
[num_elem
].prev_j
= splice_pos
-1; // predecessor
337 array
[num_elem
].prev_matrix_no
= z
;
338 array
[num_elem
].isSplice
= true; // splice site
339 scores
[num_elem
]=-tempValue
;
342 else if (isfinite(lookup_penalty(functions
,dist
, NULL
, 0)))
344 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist
) ;
353 //all introns of length <= max_len
354 for (int k
=start_splice_pos
; k
< nr_donor_sites
; k
++)
355 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
357 splice_pos
= donor_sites
[k
] ;
358 if (splice_pos
> j
-4) { break ;}
359 prevValue
= ((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->value
; //one before the _gt
361 if (isfinite(prevValue
) && (((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->issplice
!=1))
363 int dist
= j
-(splice_pos
-1); //_gt...ag: dist from g to g
364 assert(dist
<=max_len
) ;
365 tempValue
= lookup_penalty(functions
,dist
, NULL
, 0) +donor
[splice_pos
-1] +acceptor
[j
-1] +prevValue
;
366 if (isfinite(tempValue
))
368 assert(num_elem
<max_num_elem
) ;
369 array
[num_elem
].prev_i
= i
;
370 array
[num_elem
].prev_j
= splice_pos
-1; /* predecessor */
371 array
[num_elem
].prev_matrix_no
= z
;
372 array
[num_elem
].isSplice
= true; /* splice site */
373 scores
[num_elem
]=-tempValue
;
376 else if (isfinite(lookup_penalty(functions
,dist
, NULL
, 0)))
378 printf("Something wrong with donor_sites (intron_length: %d)\n", dist
) ;
387 CMath::nmin
<struct ArrayElem
>(scores
, array
, num_elem
, nr_paths
);
389 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
390 for (int z
=0; z
<nr_paths
; z
++) {
392 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
395 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->value
= -scores
[z
] ;
396 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_i
= array
[z
].prev_i
;
397 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_j
= array
[z
].prev_j
;
398 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->issplice
= array
[z
].isSplice
;
399 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_matrix_no
= array
[z
].prev_matrix_no
;
401 if (-scores
[z
]>=max_scores
[z
]) { //storing the postion of the highest alignment score yet
402 max_scores
[z
] = -scores
[z
] ;
403 max_score_positions
[2*z
] = i
; // pos i
404 max_score_positions
[2*z
+1] = j
; // pos j