+ parameter transfer to python layer works
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int compare_gene_struct(struct gene* a, struct gene* b) {
21 return a->stop - b->start;
22 }
23
24 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
25
26 void sort_genes(struct gene*** allGenes, int numGenes);
27
28 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
29
30 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
31
32 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
33
34 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
35
36 static char *info = "Usage is:\n./filterReads gff reads output";
37
38 const int read_size = 36;
39
40 int main(int argc, char* argv[]) {
41
42 if(argc != 4) {
43 printf("%s\n",info);
44 exit(EXIT_FAILURE);
45 }
46
47 int status;
48 int filenameSize = 256;
49 char* gff_filename = malloc(sizeof(char)*filenameSize);
50 char* reads_filename = malloc(sizeof(char)*filenameSize);
51 char* output_filename = malloc(sizeof(char)*filenameSize);
52
53 strncpy(gff_filename,argv[1],filenameSize);
54 strncpy(reads_filename,argv[2],filenameSize);
55 strncpy(output_filename,argv[3],filenameSize);
56
57 FILE *gff_fs = fopen(gff_filename,"r");
58 FILE *reads_fs = fopen(reads_filename,"r");
59 FILE *out_fs = fopen(output_filename,"w");
60
61 if(gff_fs == NULL) {
62 printf("Error: Could not open file: %s",gff_filename);
63 exit(EXIT_FAILURE);
64 }
65
66 if(reads_fs == NULL) {
67 printf("Error: Could not open file: %s",reads_filename);
68 exit(EXIT_FAILURE);
69 }
70
71 if(out_fs == NULL) {
72 printf("Error: Could not open file: %s",output_filename);
73 exit(EXIT_FAILURE);
74 }
75
76 struct gene** allGenes;
77 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
78 status = fclose(gff_fs);
79 free(gff_filename);
80 if(status != 0)
81 printf("closing of gff filestream failed!\n");
82
83 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
84
85 // some entries in the gff files are not in a sorted order
86 //printf("Sorting genes...\n");
87 //sort_genes(&allGenes,numGenes);
88 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
89 ///printf("Genes were sorted!\n");
90
91 int gidx, eidx;
92 int exon_cov = 0;
93 for(gidx=0;gidx<numGenes;gidx++) {
94 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
95 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
96 }}
97 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
98
99 process_reads(reads_fs,&allGenes,numGenes,out_fs);
100
101 status = fclose(reads_fs);
102 status = fclose(out_fs);
103 if(status != 0)
104 perror("fclose");
105
106 free(reads_filename);
107 free(output_filename);
108 return 0;
109 }
110
111 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
112 int status;
113
114 int buffer_size= 64;
115 int chr = 0;
116 int pos = 0;
117 char* seq = malloc(sizeof(char)*buffer_size);
118 unsigned long id = 0;
119 char strand = ' ';
120 int mismatch = 0;
121 int occurrence = 0;
122 int size = 0;
123 int cut = 0;
124 char* prb = malloc(sizeof(char)*buffer_size);
125 char* cal_prb = malloc(sizeof(char)*buffer_size);
126 char* chastity = malloc(sizeof(char)*buffer_size);
127
128 int reads_fid = fileno(reads_fs);
129 struct stat reads_stat;
130 if ( fstat(reads_fid,&reads_stat) == -1) {
131 perror("fstat");
132 exit(EXIT_FAILURE);
133 }
134 off_t reads_filesize = reads_stat.st_size;
135 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
136 int numReads = reads_filesize / 178.0;
137
138 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
139 if (reads_area == MAP_FAILED) {
140 perror("mmap");
141 exit(EXIT_FAILURE);
142 }
143 close(reads_fid);
144 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
145
146 void* linePtr = reads_area;
147 char* current_line = malloc(sizeof(char)*256);
148
149 int SIZE = 500;
150 // initialize boundary arrays
151 void** upstream_end = malloc(sizeof(void*)*SIZE);
152 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
153 void** downstream_start = malloc(sizeof(void*)*SIZE);
154 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
155 int ue,uo,ds,dov;
156 ue = uo = ds = dov = 0;
157
158 int skippedLinesCounter = 0;
159
160 int prev_exon_start = -1;
161 int prev_exon_stop = -1;
162 int cur_exon_start = -1;
163
164 current_line = strncpy(current_line,linePtr,256);
165 int gene_idx = 0;
166 int exon_idx = 1;
167 struct gene* currentGene = (*allGenes)[gene_idx];
168
169 char* disamb_seq = malloc(sizeof(char)*read_size);
170
171 int skippedReadCtr = 0;
172 int uselessReadCtr = 0;
173 int exonicReadCtr = 0;
174 int endPrevCtr = 0;
175 int prevOverlapCtr = 0;
176 int currentStartCtr = 0;
177 int currentOverlapCtr = 0;
178 int multioccurReadCtr = 0;
179
180 int readCtr = 0;
181 // start of the parsing loop
182 while(1) {
183 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
184 break;
185
186 if (readCtr != 0 && readCtr % 1000000 == 0)
187 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
188
189 //if (gene_idx >= 1833)
190 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
191 //if (readCtr == 2000000)
192 // exit(EXIT_SUCCESS);
193
194 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
195 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
196 if (status < 12) {
197 skippedLinesCounter++;
198 goto next_read;
199 }
200 //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
201 // if the read is occurring several times elsewhere then get rid of it
202 if(!(occurrence >= 1 && occurrence <= 25)) {
203 //if ( occurrence != 1 ) {
204 multioccurReadCtr++;
205 goto next_read;
206 //while (*(char*)linePtr != '\n') linePtr++;
207 //linePtr++;
208 //readCtr += 1;
209 //current_line = strncpy(current_line,linePtr,256);
210 //continue;
211 }
212
213 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
214
215 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
216 gene_idx++;
217 exon_idx = 1;
218 currentGene = (*allGenes)[gene_idx];
219 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
220 ue = uo = ds = dov = 0;
221 continue;
222 }
223
224 if ( pos < currentGene->start ) { // go to next read
225 skippedReadCtr++;
226
227 next_read:
228
229 while (*(char*)linePtr != '\n') linePtr++;
230 linePtr++;
231 readCtr += 1;
232 current_line = strncpy(current_line,linePtr,256);
233 continue;
234 }
235
236 } else { // read IS within gene borders
237
238 exon_label:
239
240 if (exon_idx == currentGene->num_exons) {
241 gene_idx++;
242 exon_idx = 1;
243 currentGene = (*allGenes)[gene_idx];
244 continue;
245 }
246
247 prev_exon_start = currentGene->exon_starts[exon_idx-1];
248 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
249 cur_exon_start = currentGene->exon_starts[exon_idx];
250
251 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
252
253 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
254 exon_idx++;
255
256 if (ue != 0 && dov != 0)
257 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
258
259 if (uo != 0 && ds != 0)
260 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
261
262 ue = uo = ds = dov = 0;
263 goto exon_label;
264 }
265
266 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
267 exonicReadCtr++;
268 remove_ambiguities(seq,strlen(seq),disamb_seq);
269 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
270 goto next_read;
271 }
272
273 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
274 goto next_read;
275
276 // if this position is reached the read is somehow overlapping or
277 // exactly on the exon boundary. now determine where exactly:
278 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
279 endPrevCtr++;
280 upstream_end[ue] = linePtr;
281 ue++;
282 goto next_read;
283 }
284
285 if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
286 prevOverlapCtr++;
287 upstream_overlap[uo] = linePtr;
288 uo++;
289 goto next_read;
290 }
291
292 if ( pos == cur_exon_start ) { // read starts at current exon start
293 currentStartCtr++;
294 downstream_start[ds] = linePtr;
295 ds++;
296 goto next_read;
297 }
298
299 if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
300 currentOverlapCtr++;
301 downstream_overlap[dov] = linePtr;
302 dov++;
303 goto next_read;
304 }
305
306 uselessReadCtr++;
307 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
308 }
309 }
310
311 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
312 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
313
314 free(upstream_end);
315 free(upstream_overlap);
316 free(downstream_start);
317 free(downstream_overlap);
318
319 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
320 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
321 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
322 printf("%d reads were totally exonic\n",exonicReadCtr);
323 printf("%d reads end at prev. exon. %d reads overlap with prev. exon.\n", endPrevCtr, prevOverlapCtr);
324 printf("%d reads start at current exon. %d reads overlap with current exon.\n",currentStartCtr,currentOverlapCtr);
325 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
326
327 status = munmap(reads_area,reads_filesize);
328 if(status != 0)
329 perror("munmap");
330
331 //free(current_line);
332 free(seq);
333 free(prb);
334 free(cal_prb);
335 free(chastity);
336 }
337
338 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
339 //printf("up/down size is %d/%d\n",up_size,down_size);
340
341 int up_idx, down_idx, status;
342 char* upstream_line = malloc(sizeof(char)*256);
343 char* downstream_line = malloc(sizeof(char)*256);
344
345 int buffer_size= 64;
346
347 int up_chr = 0;
348 int up_pos = 0;
349 char* up_seq = malloc(sizeof(char)*buffer_size);
350 int up_id = 0;
351 char up_strand = ' ';
352 int up_mismatch = 0;
353 int up_occurrence = 0;
354 int up_sz = 0;
355 int up_cut = 0;
356 char* up_prb = malloc(sizeof(char)*buffer_size);
357 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
358 char* up_chastity = malloc(sizeof(char)*buffer_size);
359
360 int down_chr = 0;
361 int down_pos = 0;
362 char* down_seq = malloc(sizeof(char)*buffer_size);
363 int down_id = 0;
364 char down_strand = ' ';
365 int down_mismatch = 0;
366 int down_occurrence = 0;
367 int down_sz = 0;
368 int down_cut = 0;
369 char* down_prb = malloc(sizeof(char)*buffer_size);
370 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
371 char* down_chastity = malloc(sizeof(char)*buffer_size);
372
373 int new_chr = 0;
374 char* new_seq = malloc(sizeof(char)*buffer_size);
375 char new_strand = ' ';
376 char* new_prb = malloc(sizeof(char)*buffer_size);
377 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
378 char* new_chastity = malloc(sizeof(char)*buffer_size);
379 char* new_up_seq = malloc(sizeof(char)*read_size);
380 char* new_down_seq = malloc(sizeof(char)*read_size);
381
382 up_idx=0;
383 down_idx=0;
384 while(1) {
385 if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
386 break;
387
388 strncpy(upstream_line,upstream[up_idx],256);
389 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
390 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
391 &up_cut,up_prb,up_cal_prb,up_chastity);
392
393 strncpy(downstream_line,downstream[down_idx],256);
394 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
395 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
396 &down_cut,down_prb,down_cal_prb,down_chastity);
397
398 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
399 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
400
401 new_seq[0] = '\0';
402 new_prb[0] = '\0';
403 new_cal_prb[0] = '\0';
404 new_chastity[0] = '\0';
405
406 int fit;
407 int w_size = 6;
408 int overlap = 0;
409 if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
410 overlap = exon_start - down_pos;
411
412 //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
413 //if (fit != 1)
414 // goto end;
415
416 new_chr = up_chr;
417 new_strand = up_strand;
418
419 strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
420 strncat(new_prb,up_prb+(read_size-overlap),overlap);
421 strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
422 strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
423
424 strncat(new_seq,new_down_seq+overlap,read_size-overlap);
425 strncat(new_prb,down_prb+overlap,read_size-overlap);
426 strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
427 strncat(new_chastity,down_chastity+overlap,read_size-overlap);
428
429 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
430
431 } // merge with read which is upstream overlapping
432
433 if (down_pos == exon_start) {
434 overlap = up_pos+read_size - exon_stop;
435 //printf("overlap is %d\n",overlap);
436 //printf("pos are: %d %d\n",up_pos,down_pos);
437
438 //fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
439 //if (fit != 1)
440 // goto end;
441
442 new_chr = up_chr;
443 new_strand = up_strand;
444
445 strncat(new_seq,new_up_seq,(read_size-overlap));
446 strncat(new_prb,up_prb,(read_size-overlap));
447 strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
448 strncat(new_chastity,up_chastity,(read_size-overlap));
449
450 strncat(new_seq,new_down_seq,overlap);
451 strncat(new_prb,down_prb,overlap);
452 strncat(new_cal_prb,down_cal_prb,overlap);
453 strncat(new_chastity,down_chastity,overlap);
454
455 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
456 }
457
458 if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
459 printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
460
461 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
462 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
463
464 end:
465
466 up_idx++;
467 down_idx++;
468 }
469
470 free(upstream_line);
471 free(downstream_line);
472
473 free(new_up_seq);
474 free(new_down_seq);
475
476 free(up_prb);
477 free(up_cal_prb);
478 free(up_chastity);
479
480 free(down_prb);
481 free(down_cal_prb);
482 free(down_chastity);
483
484 free(new_prb);
485 free(new_cal_prb);
486 free(new_chastity);
487
488 }
489
490 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
491 double epsilon_mean = 15.0;
492 double epsilon_var = 10.0;
493 double mean_up = 0;
494 double variance_up = 0;
495 double mean_down = 0;
496 double variance_down = 0;
497
498 char *up_ptr = up_prb;
499 char *down_ptr = down_prb;
500
501 int w_size = 0;
502 while(up_ptr != up_prb_end) {
503 mean_up += (*up_ptr)-50;
504 mean_down += (*down_ptr)-50;
505 w_size++;
506 up_ptr++;
507 down_ptr++;
508 }
509 mean_up /= w_size;
510 mean_down /= w_size;
511
512
513 up_ptr = up_prb;
514 down_ptr = down_prb;
515 w_size = 0;
516 while(up_ptr != up_prb_end) {
517 variance_up += pow((*up_prb)-50 - mean_up,2);
518 variance_down += pow((*down_prb)-50 - mean_down,2);
519 w_size++;
520 up_ptr++;
521 down_ptr++;
522 }
523 variance_up /= (w_size-1);
524 variance_down /= (w_size-1);
525
526 //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
527
528 if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
529 return 1;
530
531 return 0;
532 }
533
534 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
535 //printf("old seq: %s\n",old_seq);
536 //printf("new seq: %s\n",new_seq);
537
538 int idx=0;
539 int new_idx = 0;
540 while(idx<old_seq_size) {
541 if (old_seq[idx] == '[') {
542 new_seq[new_idx] = old_seq[++idx];
543 new_idx++;
544 idx += 3;
545 continue;
546 }
547
548 new_seq[new_idx] = old_seq[idx++];
549 new_idx++;
550 }
551 //printf("old seq: %s\n",old_seq);
552 //printf("new seq: %s\n",new_seq);
553 }
554
555 /*
556 * TODO:
557 * - Check strand -> done simple (only if equal)
558 * - check for [AC] and similar entries -> done simple (see function
559 * remove_ambiguities (exchanges [XY] by the first entry)
560 */