+ rewrote C interface for SWIG/Python
[qpalma.git] / QPalmaDP / fill_matrix.cpp
1 /*********************************************************************************************/
2 // Version of fill_matrix with fills several alignment matrices (best, second_best ...)
3 // nr_paths det. number of these matrices
4 //
5 // est_len: Number of columns in alignment matrix: length(EST) +1 (for the gap) (variable: i)
6 // dna_len: Number of lines in alignment matrix: length(DNA) +1 (for the gap) (variable: j)
7 //
8 // !!! IMPORTANT !!! The pictures and equations in the paper/poster etc. describe the algorithm
9 // for an alignment matrix where DNA and EST changed places !!! IMPORTANT !!!
10 /*********************************************************************************************/
11
12 /*
13 matchmatrix: columnwise
14 - A C G T N (dna)
15 - x x x x x x
16 A x x x x x x
17 C x x x x x x
18 G x x x x x x
19 T x x x x x x
20 N x x x x x x
21 (est)
22 */
23
24 /*
25 alignment matrix: columnwise
26 |-EST . . .
27 -+------------
28 -|00 ...
29 D|0.
30 N|. .
31 A|. .
32 .|.
33 .|
34 .|
35 */
36
37
38 /* TODO
39 - max_len = 10000 for elegans!
40 - do not use double
41
42 */
43
44
45 #include "fill_matrix.h"
46
47 using namespace std;
48
49 int check_char(char base)
50 {
51 switch(base)
52 {
53 case 'A':
54 case 'a': return 1;
55 case 'C':
56 case 'c': return 2;
57 case 'G':
58 case 'g': return 3;
59 case 'T':
60 case 't': return 4;
61 case 'N':
62 case 'n': return 5;
63 default : return 0;
64 }
65
66 }
67
68
69
70 void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
71 {
72
73 /*********************************************************************************************/
74 // variables
75 /*********************************************************************************************/
76 const int mlen = 6; // length of matchmatrix
77 int estChar, dnaChar; // 0:'-', 1:'a', 2:'c', 3:'g', 4:'t', 5:'n'
78 double prevValue ;
79 double tempValue;
80 int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
81 //printf("max_len: %d\n", max_len) ;
82
83 int num_elem ; // counter variable
84
85 const int max_num_elem = (4+nr_paths+max_len) * nr_paths ; // upper bound on num_elem
86 double *scores=new double[max_num_elem] ; //are sorted later
87 struct ArrayElem *array=new ArrayElem[max_num_elem] ;
88 //printf("%d\n", max_num_elem*sizeof(ArrayElem)) ;
89
90 int max_ss_element[nr_paths*nr_paths]; //position on dna with max ss score for introns of length greater than max_len
91 int splice_pos ; //splice position get from donor_sites
92 int start_splice_pos ; //where to start looking in donor_sites
93
94 double max_scores[nr_paths] ;
95
96 /*********************************************************************************************/
97 // Initialisation of all alignment matrices, columnwise
98 /*********************************************************************************************/
99 for (int z=0; z<nr_paths; z++) /* matrice z */
100 {
101 max_scores[z] = log(0.0) ;
102 max_score_positions[2*z] = 0; // pos i
103 max_score_positions[2*z +1] = 0; // pos j
104
105 for (int i=0; i<est_len; i++) /* i.column */
106 {
107 for (int j=0; j<dna_len; j++) /* j.line */
108 {
109 ((Pre_score*)matrices[z] +(i*dna_len) +j)->value = log(0.0); // -inf
110 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_i = 0;
111 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_j = 0;
112 ((Pre_score*)matrices[z] +(i*dna_len) +j)->prev_matrix_no = 0;
113 ((Pre_score*)matrices[z] +(i*dna_len) +j)->issplice = 0;
114 }
115 }
116 }
117
118
119 /*********************************************************************************************/
120 // Element (0,0) for first/best matrice (z=0)
121 /*********************************************************************************************/
122 ((Pre_score*)matrices[0])->value = 0;
123 ((Pre_score*)matrices[0])->prev_i = 0;
124 ((Pre_score*)matrices[0])->prev_j = 0;
125 ((Pre_score*)matrices[0])->prev_matrix_no = 0;
126 ((Pre_score*)matrices[0])->issplice = 0;
127
128
129 /*********************************************************************************************/
130 // Line Zero for first/best matrice (z=0)
131 /*********************************************************************************************/
132 for (int i=1; i<est_len; i++) /* 0.line, all columns (DNA='-') */
133 {
134 ((Pre_score*)matrices[0] +(i*dna_len))->value = 0 ;
135 ((Pre_score*)matrices[0] +(i*dna_len))->prev_i = i-1;
136 ((Pre_score*)matrices[0] +(i*dna_len))->prev_j = 0;
137 ((Pre_score*)matrices[0] +(i*dna_len))->prev_matrix_no = 0;
138 ((Pre_score*)matrices[0] +(i*dna_len))->issplice = 0;
139 }
140
141
142 /*********************************************************************************************/
143 // Column Zero for first/best matrice (z=0)
144 /*********************************************************************************************/
145 for (int j=1; j<dna_len; j++) // 0.column, all lines
146 {
147 ((Pre_score*)matrices[0] +j)->value = 0 ;
148 ((Pre_score*)matrices[0] +j)->prev_i = 0 ;
149 ((Pre_score*)matrices[0] +j)->prev_j = j-1 ;
150 ((Pre_score*)matrices[0] +j)->prev_matrix_no = 0 ;
151 ((Pre_score*)matrices[0] +j)->issplice = 0;
152 }
153
154
155 /*********************************************************************************************/
156 // !!!! Column Zero and line Zero for all other matrices except the best one (z>0): -INF !!!!
157 /*********************************************************************************************/
158
159
160
161 /*********************************************************************************************/
162 // all other entries for all matrices, last column and row treated seperately
163 /*********************************************************************************************/
164
165 /* columnwise, 0.column and 0.line already filled */
166 for (int i=1; i<est_len; i++) /* i.column */
167 {
168
169 start_splice_pos = 0 ; //where to start looking in donor_sites
170 for (int zz=0; zz<nr_paths*nr_paths; zz++) {max_ss_element[zz]=0 ;} // initializing
171
172 for (int j=1; j<dna_len; j++) //j.line
173 {
174 dnaChar = check_char(dna[j-1]) ; // -1 because of of the gaps in the 0.line/column of the al-matrix
175 estChar = check_char(est[i-1]) ;
176
177
178 //if introns of length greater than max_len possible
179 if (j > max_len)
180 {
181 //storing highest ss_score (the position) and NOT -INF
182 int new_ss_pos = j-max_len ;
183 double donor_score = donor[new_ss_pos-1] ;
184 if (isfinite(donor_score)) //splice site!
185 {
186 if (max_ss_element[nr_paths-1] != 0)
187 {
188 for (int zz=0; zz<nr_paths; zz++) //for each alignment matrix
189 {
190 int pos_with_minscore = nr_paths ; //assuming, the new score is the worst score
191 double new_intron_score = donor_score+((Pre_score*)matrices[zz]+(i*dna_len)+new_ss_pos-1)->value ; //could be -INF
192 if (!(isfinite(new_intron_score))) {printf("new_intron_score is -INF\n") ;}
193 for (int pos=0; pos<nr_paths; pos++) //find worst score of all nr_paths+1 scores
194 {
195 int act_splice_position = max_ss_element[zz*nr_paths+pos] ;
196 assert(isfinite(donor[act_splice_position-1])) ;
197 double act_intron_score = donor[act_splice_position-1]+((Pre_score*)matrices[zz]+(i*dna_len)+act_splice_position-1)->value ;
198 if (new_intron_score >= act_intron_score)
199 {
200 new_intron_score = act_intron_score ;
201 pos_with_minscore = pos ;
202 } //else nothing happens, new intron score still the worst one
203 }
204 if (pos_with_minscore != nr_paths)
205 {
206 max_ss_element[zz*nr_paths+pos_with_minscore] = new_ss_pos ; //replacing
207 }
208 }
209 }
210 else
211 {
212 for (int pos=0; pos<nr_paths; pos++)
213 {
214 if (max_ss_element[pos]==0) //no previous splice site at this pos
215 {
216 for (int zz=0; zz<nr_paths; zz++) {max_ss_element[zz*nr_paths+pos]=j-max_len ;}
217 break ;
218 }
219 }
220 }
221 }
222 //start_pos in donor_sites so that only introns shorter then max_length are seen
223 if ((start_splice_pos < nr_donor_sites) && donor_sites[start_splice_pos] == j-max_len)
224 {
225 start_splice_pos++ ;
226 }
227 }
228
229
230 /* Computing all possible scores */
231 num_elem = 0 ; /* number of different scores/possibilities so far */
232 for (int z=0; z<nr_paths; z++) /* matrice z */
233 {
234 Pre_score* actMatrix = (Pre_score*)matrices[z]; /* hm, why? */
235
236 /***************************************/
237 // 0. Zero (local alignment version)
238 /***************************************/
239 assert(num_elem<max_num_elem) ;
240 array[num_elem].prev_i = 0;
241 array[num_elem].prev_j = 0;
242 array[num_elem].prev_matrix_no = z;
243 array[num_elem].isSplice = false; /* no splice site */
244 scores[num_elem] = 0 ;
245 num_elem++ ;
246
247
248 /***************************************/
249 // 1. gap on EST (j-1,i) -> (j,i)
250 /***************************************/
251 prevValue = ((Pre_score*)actMatrix +(i*dna_len)+j-1)->value ; /* (j-1,i) -> (j,i) */
252 if (isfinite(prevValue))
253 {
254 tempValue = prevValue +(matchmatrix[mlen* dnaChar]); /* score(gap,DNA) */
255 if (isfinite(tempValue))
256 {
257 assert(num_elem<max_num_elem) ;
258 array[num_elem].prev_i = i;
259 array[num_elem].prev_j = j-1; /* predecessor */
260 array[num_elem].prev_matrix_no = z;
261 array[num_elem].isSplice = false; /* no splice site */
262 scores[num_elem] = -tempValue ;
263 num_elem++ ;
264 }
265 }
266
267
268 /***************************************/
269 // 2. match/mismatch (j-1,i-1) -> (j,i)
270 /***************************************/
271 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j-1)->value ; /* (j-1,i-1) -> (j,i) */
272 if (isfinite(prevValue))
273 {
274 tempValue = prevValue +(matchmatrix[mlen* dnaChar +estChar]);//DNA mit EST
275 if (isfinite(tempValue))
276 {
277 assert(num_elem<max_num_elem) ;
278 array[num_elem].prev_i = i-1; /* predecessor */
279 array[num_elem].prev_j = j-1; /* predecessor */
280 array[num_elem].prev_matrix_no = z;
281 array[num_elem].isSplice = false; /* no splice site */
282 scores[num_elem] = -tempValue ;
283 num_elem++ ;
284 }
285 }
286
287
288 /***************************************/
289 // 3. gap on DNA (j,i-1) -> (j,i)
290 /***************************************/
291 prevValue = ((Pre_score*)actMatrix +(i-1)*dna_len + j)->value ;
292 if (isfinite(prevValue))
293 {
294 tempValue = prevValue +(matchmatrix[estChar]); /* score(EST,gap) */
295 if (isfinite(tempValue))
296 {
297 assert(num_elem<max_num_elem) ;
298 array[num_elem].prev_i = i-1; /* predecessor */
299 array[num_elem].prev_j = j;
300 array[num_elem].prev_matrix_no = z;
301 array[num_elem].isSplice = false; /* no splice site */
302 scores[num_elem]=-tempValue ;
303 num_elem++ ;
304 }
305
306 }
307
308
309 /***************************************/
310 // 4. all possible splice sites/introns
311 /***************************************/
312 // acceptor[j-1]: g of ag (-1 because cpp starts counting at 0)
313 // donor[splice_pos-1]: g of gt/gc (-1 because cpp starts counting at 0)
314 // donor_sites: the g's of the gt/gc
315
316 if (isfinite(acceptor[j-1]))
317 {
318
319 //the nr_paths many introns of length greater than max_len
320 for (int pos=0; pos<nr_paths; pos++)
321 {
322 if (max_ss_element[z*nr_paths+pos]>0)
323 {
324 assert((j-max_ss_element[z*nr_paths+pos]+1) > max_len) ;
325 splice_pos = max_ss_element[z*nr_paths+pos] ; //the g of the gt
326 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
327 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
328 {
329 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
330 assert(dist>max_len) ;
331 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
332 if (isfinite(tempValue))
333 {
334 assert(num_elem<max_num_elem) ;
335 array[num_elem].prev_i = i;
336 array[num_elem].prev_j = splice_pos-1; // predecessor
337 array[num_elem].prev_matrix_no = z;
338 array[num_elem].isSplice = true; // splice site
339 scores[num_elem]=-tempValue ;
340 num_elem++ ;
341 }
342 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
343 {
344 printf("Something wrong with donor_sites(1000er) (intron_length: %d)\n", dist) ;
345 sleep(5000) ;
346 }
347 }
348 }
349 }
350
351
352
353 //all introns of length <= max_len
354 for (int k=start_splice_pos; k < nr_donor_sites; k++)
355 //for (int k=0; k < nr_donor_sites; k++) //also intron of length greater than 1000
356 {
357 splice_pos = donor_sites[k] ;
358 if (splice_pos > j-4) { break ;}
359 prevValue = ((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->value ; //one before the _gt
360
361 if (isfinite(prevValue) && (((Pre_score*)actMatrix + (i*dna_len) +splice_pos-1)->issplice!=1))
362 {
363 int dist = j -(splice_pos-1); //_gt...ag: dist from g to g
364 assert(dist<=max_len) ;
365 tempValue = lookup_penalty(functions,dist, NULL, 0) +donor[splice_pos-1] +acceptor[j-1] +prevValue;
366 if (isfinite(tempValue))
367 {
368 assert(num_elem<max_num_elem) ;
369 array[num_elem].prev_i = i;
370 array[num_elem].prev_j = splice_pos-1; /* predecessor */
371 array[num_elem].prev_matrix_no = z;
372 array[num_elem].isSplice = true; /* splice site */
373 scores[num_elem]=-tempValue ;
374 num_elem++ ;
375 }
376 else if (isfinite(lookup_penalty(functions,dist, NULL, 0)))
377 {
378 printf("Something wrong with donor_sites (intron_length: %d)\n", dist) ;
379 sleep(5000) ;
380 }
381 }
382 }
383 }
384
385 } //end of z
386
387 CMath::nmin<struct ArrayElem>(scores, array, num_elem, nr_paths);
388
389 /* the nr_paths many highest scores are written in the nr_paths many matrices*/
390 for (int z=0; z<nr_paths; z++) {
391 if (z>=num_elem) {
392 //nothing happens, entry in matrices[z] at pos. (i,j) still -inf
393 }
394 else {
395 ((Pre_score*)matrices[z] + (i*dna_len) +j)->value = -scores[z] ;
396 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_i = array[z].prev_i ;
397 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_j = array[z].prev_j ;
398 ((Pre_score*)matrices[z] + (i*dna_len) +j)->issplice = array[z].isSplice ;
399 ((Pre_score*)matrices[z] + (i*dna_len) +j)->prev_matrix_no = array[z].prev_matrix_no;
400
401 if (-scores[z]>=max_scores[z]) { //storing the postion of the highest alignment score yet
402 max_scores[z] = -scores[z] ;
403 max_score_positions[2*z] = i; // pos i
404 max_score_positions[2*z +1] = j; // pos j
405 }
406 }
407 } //end of z
408
409 } //end of j
410 } // end of i
411
412
413
414 //free memory
415 delete[] array;
416 delete[] scores;
417
418 return;
419 }
420
421