1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
13 matchmatrix: columnwise
25 alignment matrix: columnwise
39 - max_len = 10000 for elegans!
45 #include "fill_matrix.h"
49 int check_char(char base
)
70 void fill_matrix(int nr_paths
, Pre_score
* matrices
[], int est_len
/*counter var: i*/, int dna_len
/*counter var: j*/, char* est
, char* dna
, penalty_struct
* functions
, double* matchmatrix
, double* donor
, double* acceptor
, bool remove_duplicate_scores
, int nr_donor_sites
, int* donor_sites
, int* max_score_positions
)
72 // printf("Entering fill_matrix...\n");
74 /*********************************************************************************************/
76 /*********************************************************************************************/
77 const int mlen
= 6; // length of matchmatrix
78 int estChar
, dnaChar
; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
81 int max_len
= (int)functions
->limits
[functions
->len
-1] ; //last (len) supporting point of h
82 //printf("max_len: %d\n", max_len) ;
84 int num_elem
; // counter variable
86 const int max_num_elem
= (4+nr_paths
+max_len
) * nr_paths
; // upper bound on num_elem
87 double *scores
=new double[max_num_elem
] ; //are sorted later
88 struct ArrayElem
*array
=new ArrayElem
[max_num_elem
] ;
89 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
91 int max_ss_element
[nr_paths
*nr_paths
]; //position on dna with max ss score for introns of length greater than max_len
92 int splice_pos
; //splice position get from donor_sites
93 int start_splice_pos
; //where to start looking in donor_sites
95 double max_scores
[nr_paths
] ;
97 /*********************************************************************************************/
98 // Initialisation of all alignment matrices, columnwise
99 /*********************************************************************************************/
100 for (int z
=0; z
<nr_paths
; z
++) /* matrice z */
102 max_scores
[z
] = log(0.0) ;
103 max_score_positions
[2*z
] = 0; // pos i
104 max_score_positions
[2*z
+1] = 0; // pos j
106 for (int i
=0; i
<est_len
; i
++) /* i.column */
108 for (int j
=0; j
<dna_len
; j
++) /* j.line */
110 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->value
= log(0.0); // -inf
111 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_i
= 0;
112 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_j
= 0;
113 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->prev_matrix_no
= 0;
114 ((Pre_score
*)matrices
[z
] +(i
*dna_len
) +j
)->issplice
= 0;
120 /*********************************************************************************************/
121 // Element (0,0) for first/best matrice (z=0)
122 /*********************************************************************************************/
123 ((Pre_score
*)matrices
[0])->value
= 0;
124 ((Pre_score
*)matrices
[0])->prev_i
= 0;
125 ((Pre_score
*)matrices
[0])->prev_j
= 0;
126 ((Pre_score
*)matrices
[0])->prev_matrix_no
= 0;
127 ((Pre_score
*)matrices
[0])->issplice
= 0;
130 /*********************************************************************************************/
131 // Line Zero for first/best matrice (z=0)
132 /*********************************************************************************************/
133 for (int i
=1; i
<est_len
; i
++) /* 0.line, all columns (DNA='-') */
135 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->value
= 0 ;
136 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_i
= i
-1;
137 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_j
= 0;
138 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->prev_matrix_no
= 0;
139 ((Pre_score
*)matrices
[0] +(i
*dna_len
))->issplice
= 0;
143 /*********************************************************************************************/
144 // Column Zero for first/best matrice (z=0)
145 /*********************************************************************************************/
146 for (int j
=1; j
<dna_len
; j
++) // 0.column, all lines
148 ((Pre_score
*)matrices
[0] +j
)->value
= 0 ;
149 ((Pre_score
*)matrices
[0] +j
)->prev_i
= 0 ;
150 ((Pre_score
*)matrices
[0] +j
)->prev_j
= j
-1 ;
151 ((Pre_score
*)matrices
[0] +j
)->prev_matrix_no
= 0 ;
152 ((Pre_score
*)matrices
[0] +j
)->issplice
= 0;
156 /*********************************************************************************************/
157 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
158 /*********************************************************************************************/
162 /*********************************************************************************************/
163 // all other entries for all matrices, last column and row treated seperately
164 /*********************************************************************************************/
166 /* columnwise, 0.column and 0.line already filled */
167 for (int i
=1; i
<est_len
; i
++) /* i.column */
170 start_splice_pos
= 0 ; //where to start looking in donor_sites
171 for (int zz
=0; zz
<nr_paths
*nr_paths
; zz
++) {max_ss_element
[zz
]=0 ;} // initializing
173 for (int j
=1; j
<dna_len
; j
++) //j.line
175 dnaChar
= check_char(dna
[j
-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
176 estChar
= check_char(est
[i
-1]) ;
179 //if introns of length greater than max_len possible
182 //storing highest ss_score (the position) and NOT -INF
183 int new_ss_pos
= j
-max_len
;
184 double donor_score
= donor
[new_ss_pos
-1] ;
185 if (isfinite(donor_score
)) //splice site!
187 if (max_ss_element
[nr_paths
-1] != 0)
189 for (int zz
=0; zz
<nr_paths
; zz
++) //for each alignment matrix
191 int pos_with_minscore
= nr_paths
; //assuming, the new score is the worst score
192 double new_intron_score
= donor_score
+((Pre_score
*)matrices
[zz
]+(i
*dna_len
)+new_ss_pos
-1)->value
; //could be -INF
193 if (!(isfinite(new_intron_score
))) {printf("new_intron_score is -INF\n") ;}
194 for (int pos
=0; pos
<nr_paths
; pos
++) //find worst score of all nr_paths+1 scores
196 int act_splice_position
= max_ss_element
[zz
*nr_paths
+pos
] ;
197 assert(isfinite(donor
[act_splice_position
-1])) ;
198 double act_intron_score
= donor
[act_splice_position
-1]+((Pre_score
*)matrices
[zz
]+(i
*dna_len
)+act_splice_position
-1)->value
;
199 if (new_intron_score
>= act_intron_score
)
201 new_intron_score
= act_intron_score
;
202 pos_with_minscore
= pos
;
203 } //else nothing happens, new intron score still the worst one
205 if (pos_with_minscore
!= nr_paths
)
207 max_ss_element
[zz
*nr_paths
+pos_with_minscore
] = new_ss_pos
; //replacing
213 for (int pos
=0; pos
<nr_paths
; pos
++)
215 if (max_ss_element
[pos
]==0) //no previous splice site at this pos
217 for (int zz
=0; zz
<nr_paths
; zz
++) {max_ss_element
[zz
*nr_paths
+pos
]=j
-max_len
;}
223 //start_pos in donor_sites so that only introns shorter then max_length are seen
224 if ((start_splice_pos
< nr_donor_sites
) && donor_sites
[start_splice_pos
] == j
-max_len
)
231 /* Computing all possible scores */
232 num_elem
= 0 ; /* number of different scores/possibilities so far */
233 for (int z
=0; z
<nr_paths
; z
++) /* matrice z */
235 Pre_score
* actMatrix
= (Pre_score
*)matrices
[z
]; /* hm, why? */
237 /***************************************/
238 // 0. Zero (local alignment version)
239 /***************************************/
240 assert(num_elem
<max_num_elem
) ;
241 array
[num_elem
].prev_i
= 0;
242 array
[num_elem
].prev_j
= 0;
243 array
[num_elem
].prev_matrix_no
= z
;
244 array
[num_elem
].isSplice
= false; /* no splice site */
245 scores
[num_elem
] = 0 ;
249 /***************************************/
250 // 1. gap on EST (j-1,i) -> (j,i)
251 /***************************************/
252 prevValue
= ((Pre_score
*)actMatrix
+(i
*dna_len
)+j
-1)->value
; /* (j-1,i) -> (j,i) */
253 if (isfinite(prevValue
))
255 tempValue
= prevValue
+(matchmatrix
[mlen
* dnaChar
]); /* score(gap,DNA) */
256 if (isfinite(tempValue
))
258 assert(num_elem
<max_num_elem
) ;
259 array
[num_elem
].prev_i
= i
;
260 array
[num_elem
].prev_j
= j
-1; /* predecessor */
261 array
[num_elem
].prev_matrix_no
= z
;
262 array
[num_elem
].isSplice
= false; /* no splice site */
263 scores
[num_elem
] = -tempValue
;
269 /***************************************/
270 // 2. match/mismatch (j-1,i-1) -> (j,i)
271 /***************************************/
272 prevValue
= ((Pre_score
*)actMatrix
+(i
-1)*dna_len
+ j
-1)->value
; /* (j-1,i-1) -> (j,i) */
273 if (isfinite(prevValue
))
275 tempValue
= prevValue
+(matchmatrix
[mlen
* dnaChar
+estChar
]);//DNA mit EST
276 if (isfinite(tempValue
))
278 assert(num_elem
<max_num_elem
) ;
279 array
[num_elem
].prev_i
= i
-1; /* predecessor */
280 array
[num_elem
].prev_j
= j
-1; /* predecessor */
281 array
[num_elem
].prev_matrix_no
= z
;
282 array
[num_elem
].isSplice
= false; /* no splice site */
283 scores
[num_elem
] = -tempValue
;
289 /***************************************/
290 // 3. gap on DNA (j,i-1) -> (j,i)
291 /***************************************/
292 prevValue
= ((Pre_score
*)actMatrix
+(i
-1)*dna_len
+ j
)->value
;
293 if (isfinite(prevValue
))
295 tempValue
= prevValue
+(matchmatrix
[estChar
]); /* score(EST,gap) */
296 if (isfinite(tempValue
))
298 assert(num_elem
<max_num_elem
) ;
299 array
[num_elem
].prev_i
= i
-1; /* predecessor */
300 array
[num_elem
].prev_j
= j
;
301 array
[num_elem
].prev_matrix_no
= z
;
302 array
[num_elem
].isSplice
= false; /* no splice site */
303 scores
[num_elem
]=-tempValue
;
310 /***************************************/
311 // 4. all possible splice sites/introns
312 /***************************************/
313 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
314 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
315 // donor_sites: the g's of the gt/gc
317 if (isfinite(acceptor
[j
-1]))
320 //the nr_paths many introns of length greater than max_len
321 for (int pos
=0; pos
<nr_paths
; pos
++)
323 if (max_ss_element
[z
*nr_paths
+pos
]>0)
325 assert((j
-max_ss_element
[z
*nr_paths
+pos
]+1) > max_len
) ;
326 splice_pos
= max_ss_element
[z
*nr_paths
+pos
] ; //the g of the gt
327 prevValue
= ((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->value
; //one before the _gt
328 if (isfinite(prevValue
) && (((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->issplice
!=1))
330 int dist
= j
-(splice_pos
-1); //_gt...ag: dist from g to g
331 assert(dist
>max_len
) ;
332 tempValue
= lookup_penalty(functions
,dist
, NULL
, 0) +donor
[splice_pos
-1] +acceptor
[j
-1] +prevValue
;
333 if (isfinite(tempValue
))
335 assert(num_elem
<max_num_elem
) ;
336 array
[num_elem
].prev_i
= i
;
337 array
[num_elem
].prev_j
= splice_pos
-1; // predecessor
338 array
[num_elem
].prev_matrix_no
= z
;
339 array
[num_elem
].isSplice
= true; // splice site
340 scores
[num_elem
]=-tempValue
;
343 else if (isfinite(lookup_penalty(functions
,dist
, NULL
, 0)))
345 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist
) ;
354 //all introns of length <= max_len
355 for (int k
=start_splice_pos
; k
< nr_donor_sites
; k
++)
356 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
358 splice_pos
= donor_sites
[k
] ;
359 if (splice_pos
> j
-4) { break ;}
360 prevValue
= ((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->value
; //one before the _gt
362 if (isfinite(prevValue
) && (((Pre_score
*)actMatrix
+ (i
*dna_len
) +splice_pos
-1)->issplice
!=1))
364 int dist
= j
-(splice_pos
-1); //_gt...ag: dist from g to g
365 assert(dist
<=max_len
) ;
366 tempValue
= lookup_penalty(functions
,dist
, NULL
, 0) +donor
[splice_pos
-1] +acceptor
[j
-1] +prevValue
;
367 if (isfinite(tempValue
))
369 assert(num_elem
<max_num_elem
) ;
370 array
[num_elem
].prev_i
= i
;
371 array
[num_elem
].prev_j
= splice_pos
-1; /* predecessor */
372 array
[num_elem
].prev_matrix_no
= z
;
373 array
[num_elem
].isSplice
= true; /* splice site */
374 scores
[num_elem
]=-tempValue
;
377 else if (isfinite(lookup_penalty(functions
,dist
, NULL
, 0)))
379 printf("Something wrong with donor_sites (intron_length: %d)\n", dist
) ;
388 CMath::nmin
<struct ArrayElem
>(scores
, array
, num_elem
, nr_paths
);
390 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
391 for (int z
=0; z
<nr_paths
; z
++) {
393 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
396 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->value
= -scores
[z
] ;
397 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_i
= array
[z
].prev_i
;
398 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_j
= array
[z
].prev_j
;
399 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->issplice
= array
[z
].isSplice
;
400 ((Pre_score
*)matrices
[z
] + (i
*dna_len
) +j
)->prev_matrix_no
= array
[z
].prev_matrix_no
;
402 if (-scores
[z
]>=max_scores
[z
]) { //storing the postion of the highest alignment score yet
403 max_scores
[z
] = -scores
[z
] ;
404 max_score_positions
[2*z
] = i
; // pos i
405 max_score_positions
[2*z
+1] = j
; // pos j
419 // printf("Leaving fill_matrix...\n");