+ minor changes in the paths
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compare_predictions / compare.c
1 // use macro for GNU getline
2 #define _GNU_SOURCE
3 #include <stdio.h>
4
5 #include <stdlib.h>
6 #include <assert.h>
7 #include <string.h>
8 #include <unistd.h>
9 #include <math.h>
10
11 /*
12 * This program compares two lists of intron positions stemming for example from
13 * a gff file in order to evaluate predictions made by QPalma.
14 *
15 * It is assumed that the first file contains the ground truth and the second
16 * file the predicted positions.
17 *
18 */
19
20 int* gt_intron_starts;
21 int* gt_intron_stops;
22 int gt_size;
23
24 int* pred_intron_starts;
25 int* pred_intron_stops;
26 int pred_size;
27
28 int matching_values;
29
30 static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
31
32 static int chromo_sizes[5] = {30432563, 19705359, 23470805, 18585042, 26992728};
33
34 short strand_info;
35
36
37 void check_seeds() {
38
39 matching_values = 0;
40 int pred_idx, gt_idx, chromo, tmp;
41 int pos,gt_start,gt_stop,pred_start,pred_stop;
42
43 printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
44
45 for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
46
47 pred_start = pos-1500;
48 pred_stop = pos+1500;
49 pos = pred_intron_stops[pred_idx];
50 chromo = pred_intron_starts[pred_idx];
51
52 for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
53
54 gt_start = gt_intron_starts[gt_idx];
55 gt_stop = gt_intron_stops[gt_idx];
56
57 if(strand_info == 0) {
58 tmp = gt_start;
59 gt_start = chromo_sizes[chromo] - gt_stop;
60 gt_stop = chromo_sizes[chromo] - tmp;
61 }
62
63 //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
64 if ( pred_start <= gt_start && gt_stop <= pred_stop ) {
65 matching_values++;
66 break;
67 }
68
69 //if (gt_start == pred_start && gt_stop == pred_stop)
70 // matching_values++;
71 }}
72 }
73
74
75 void compare_introns() {
76
77 matching_values = 0;
78 int pred_idx, gt_idx;
79 int gt_start,gt_stop,pred_start,pred_stop;
80
81 printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
82
83 for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
84
85 pred_start = pred_intron_starts[pred_idx];
86 pred_stop = pred_intron_stops[pred_idx];
87
88 for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
89
90 gt_start = gt_intron_starts[gt_idx];
91 gt_stop = gt_intron_stops[gt_idx];
92
93 //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
94 if ( fabs(gt_start - pred_start) <= 2 && fabs(gt_stop - pred_stop) <= 2) {
95 matching_values++;
96 break;
97 }
98 //if (gt_start == pred_start && gt_stop == pred_stop)
99 // matching_values++;
100 }}
101 }
102
103
104 /*
105 * This function compares the intron boundaries of the two files.
106 *
107 */
108
109 void _compare_introns() {
110
111 matching_values = 0;
112
113 int gt_idx = 0;
114 int pred_idx = 0;
115 int gt_start,gt_stop,pred_start,pred_stop;
116
117 printf("sizes are %d/%d\n",gt_size,pred_size);
118
119 while(1) {
120 if (gt_idx == gt_size || pred_idx == pred_size)
121 break;
122
123 gt_start = gt_intron_starts[gt_idx];
124 gt_stop = gt_intron_stops[gt_idx];
125
126 pred_start = pred_intron_starts[pred_idx];
127 pred_stop = pred_intron_stops[pred_idx];
128
129 //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
130 if (gt_start == pred_start && gt_stop == pred_stop) {
131 matching_values++;
132
133 // take doubles into account
134 #if 1
135 int idx = pred_idx+1;
136 for(;idx<pred_size;idx++) {
137 if (pred_intron_starts[idx] == pred_start && pred_intron_stops[idx] == pred_stop)
138 matching_values++;
139 else
140 break;
141 }
142 #endif
143 gt_idx++;
144 pred_idx++;
145 continue;
146 }
147
148 // check for all possible overlappings / positions
149 if ( gt_stop <= pred_start ) {
150 gt_idx++;
151 continue;
152 }
153
154 if ( pred_stop <= gt_start ) {
155 pred_idx++;
156 continue;
157 }
158
159 if (gt_stop <= pred_stop && gt_stop >= pred_start) {
160 gt_idx++;
161 continue;
162 }
163
164 if (pred_stop <= gt_stop && pred_stop >= gt_start) {
165 pred_idx++;
166 continue;
167 }
168 }
169 }
170
171
172 /*
173 *
174 *
175 *
176 */
177
178 void load_introns(char* fn, int** starts, int** stops, int* size) {
179
180 FILE *fs = fopen(fn,"r");
181 if(fs == NULL) {
182 printf("Error: Could not open file: %s",fn);
183 exit(EXIT_FAILURE);
184 }
185
186 size_t line_size = 256;
187 char* current_line = malloc(sizeof(char)*line_size);
188 size_t status;
189
190 // we count the number of lines the file has
191 *size = 0;
192 while( getline(&current_line,&line_size,fs) != -1 ) (*size)++;
193
194 // now we allocate memory to store all positions
195 //*id = malloc(sizeof(char)*(*size));
196 *starts = malloc(sizeof(int)*(*size));
197 *stops = malloc(sizeof(int)*(*size));
198
199 if ( *starts == NULL || *stops == NULL )
200 perror("Could not allocate memory for position arrays");
201
202 fs = freopen(fn,"r",fs);
203 if(fs == NULL) {
204 printf("Error: Could not open file: %s",fn);
205 exit(EXIT_FAILURE);
206 }
207
208 //char* id_string = malloc(sizeof(char)*line_size);
209 char* first_num = malloc(sizeof(char)*line_size);
210 char* second_num = malloc(sizeof(char)*line_size);
211
212 int idx;
213 int intron_start;
214 int intron_stop;
215 int ctr = 0;
216
217 while(1) {
218 status = getline(&current_line,&line_size,fs);
219 if ( status == -1 )
220 break;
221
222 //printf("%s",current_line);
223
224 for(idx=0;idx<line_size;idx++) {
225 if (current_line[idx] == ' ')
226 break;
227 }
228 strncpy(first_num,current_line,idx);
229 strncpy(second_num,&current_line[idx],line_size-idx);
230 intron_start = atoi(first_num);
231 intron_stop = atoi(second_num);
232 //printf("start/stop: %d/%d\n",intron_start,intron_stop);
233
234 (*starts)[ctr] = intron_start;
235 (*stops)[ctr] = intron_stop;
236 ctr++;
237 }
238
239 //printf("ctr is %d\n",ctr);
240
241 if (fclose(fs) != 0)
242 perror("Closing of filestream failed!");
243 }
244
245
246 int main(int argc, char* argv[]) {
247
248 if(argc != 6) {
249 printf("%s\n",info);
250 exit(EXIT_FAILURE);
251 }
252
253 int filenameSize = 256;
254 char* gt_fn = malloc(sizeof(char)*filenameSize);
255 char* pred_fn = malloc(sizeof(char)*filenameSize);
256 char* result_fn= malloc(sizeof(char)*filenameSize);
257
258 if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
259 perror("Could not allocate memory for filenames");
260
261 short intron_mode = -1;
262 if ( strcmp(argv[1],"-intron") == 0 )
263 intron_mode = 1;
264
265 if ( strcmp(argv[1],"-seed") == 0 )
266 intron_mode = 0;
267
268 if (intron_mode == -1)
269 perror("You have to choose between -intron or -seed mode!");
270
271 strand_info = -1;
272 if ( strcmp(argv[2],"-strand=+") == 0 )
273 strand_info = 1;
274
275 if ( strcmp(argv[2],"-strand=-") == 0 )
276 strand_info = 0;
277
278 if (strand_info == -1)
279 perror("You have to choose between -strand=+ or -strand=- !");
280
281 strncpy(gt_fn,argv[3],strlen(argv[3]));
282 strncpy(pred_fn,argv[4],strlen(argv[4]));
283 strncpy(result_fn,argv[5],strlen(argv[5]));
284
285 FILE *result_fs = fopen(result_fn,"w+");
286 if(result_fs == NULL) {
287 printf("Error: Could not open file: %s",result_fn);
288 exit(EXIT_FAILURE);
289 }
290
291 load_introns(gt_fn,&gt_intron_starts,&gt_intron_stops,&gt_size);
292 load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
293
294 if (intron_mode == 1)
295 compare_introns();
296 else
297 check_seeds();
298
299 int f_status = fclose(result_fs);
300 if(f_status != 0)
301 printf("closing of result filestream failed!\n");
302
303 free(gt_fn);
304 free(pred_fn);
305 free(result_fn);
306
307 printf("Found %d matching intron(s)/seed(s).\n",matching_values);
308 exit(EXIT_SUCCESS);
309 }