+ changed reads combination in order to allow for overlaps at both exon borders
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int compare_gene_struct(struct gene* a, struct gene* b) {
21 return a->stop - b->start;
22 }
23
24 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
25
26 void sort_genes(struct gene*** allGenes, int numGenes);
27
28 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
29
30 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
31
32 int fitting(char* up_prb, char* down_prb);
33
34 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
35
36 static char *info = "Usage is:\n./filterReads gff reads output";
37
38 const int read_size = 36;
39
40 int combined_reads = 0;
41
42 int main(int argc, char* argv[]) {
43 combined_reads = 0;
44
45 if(argc != 4) {
46 printf("%s\n",info);
47 exit(EXIT_FAILURE);
48 }
49
50 int status;
51 int filenameSize = 256;
52 char* gff_filename = malloc(sizeof(char)*filenameSize);
53 char* reads_filename = malloc(sizeof(char)*filenameSize);
54 char* output_filename = malloc(sizeof(char)*filenameSize);
55
56 strncpy(gff_filename,argv[1],filenameSize);
57 strncpy(reads_filename,argv[2],filenameSize);
58 strncpy(output_filename,argv[3],filenameSize);
59
60 FILE *gff_fs = fopen(gff_filename,"r");
61 FILE *reads_fs = fopen(reads_filename,"r");
62 FILE *out_fs = fopen(output_filename,"w");
63
64 if(gff_fs == NULL) {
65 printf("Error: Could not open file: %s",gff_filename);
66 exit(EXIT_FAILURE);
67 }
68
69 if(reads_fs == NULL) {
70 printf("Error: Could not open file: %s",reads_filename);
71 exit(EXIT_FAILURE);
72 }
73
74 if(out_fs == NULL) {
75 printf("Error: Could not open file: %s",output_filename);
76 exit(EXIT_FAILURE);
77 }
78
79 struct gene** allGenes;
80 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
81 status = fclose(gff_fs);
82 free(gff_filename);
83 if(status != 0)
84 printf("closing of gff filestream failed!\n");
85
86 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
87
88 // some entries in the gff files are not in a sorted order
89 //printf("Sorting genes...\n");
90 //sort_genes(&allGenes,numGenes);
91 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
92 ///printf("Genes were sorted!\n");
93
94 int gidx, eidx;
95 int exon_cov = 0;
96 for(gidx=0;gidx<numGenes;gidx++) {
97 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
98 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
99 }}
100 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
101
102 process_reads(reads_fs,&allGenes,numGenes,out_fs);
103
104 status = fclose(reads_fs);
105 status = fclose(out_fs);
106 if(status != 0)
107 perror("fclose");
108
109 free(reads_filename);
110 free(output_filename);
111 return 0;
112 }
113
114 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
115 int status;
116
117 int buffer_size= 64;
118 int chr = 0;
119 int pos = 0;
120 char* seq = malloc(sizeof(char)*buffer_size);
121 unsigned long id = 0;
122 char strand = ' ';
123 int mismatch = 0;
124 int occurrence = 0;
125 int size = 0;
126 int cut = 0;
127 char* prb = malloc(sizeof(char)*buffer_size);
128 char* cal_prb = malloc(sizeof(char)*buffer_size);
129 char* chastity = malloc(sizeof(char)*buffer_size);
130
131 int reads_fid = fileno(reads_fs);
132 struct stat reads_stat;
133 if ( fstat(reads_fid,&reads_stat) == -1) {
134 perror("fstat");
135 exit(EXIT_FAILURE);
136 }
137 off_t reads_filesize = reads_stat.st_size;
138 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
139 int numReads = reads_filesize / 178.0;
140
141 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
142 if (reads_area == MAP_FAILED) {
143 perror("mmap");
144 exit(EXIT_FAILURE);
145 }
146 close(reads_fid);
147 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
148
149 void* linePtr = reads_area;
150 char* current_line = malloc(sizeof(char)*256);
151
152 int SIZE = 500;
153 // initialize boundary arrays
154 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
155 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
156 int uo,dov;
157 uo = dov = 0;
158
159 int skippedLinesCounter = 0;
160
161 int prev_exon_start = -1;
162 int prev_exon_stop = -1;
163 int cur_exon_start = -1;
164
165 current_line = strncpy(current_line,linePtr,256);
166 int gene_idx = 0;
167 int exon_idx = 1;
168 struct gene* currentGene = (*allGenes)[gene_idx];
169
170 char* disamb_seq = malloc(sizeof(char)*read_size);
171
172 int skippedReadCtr = 0;
173 int uselessReadCtr = 0;
174 int exonicReadCtr = 0;
175 int endPrevCtr = 0;
176 int prevOverlapCtr = 0;
177 int currentStartCtr = 0;
178 int currentOverlapCtr = 0;
179 int multioccurReadCtr = 0;
180
181 int readCtr = 0;
182 // start of the parsing loop
183 while(1) {
184 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
185 break;
186
187 if (readCtr != 0 && readCtr % 1000000 == 0)
188 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
189
190 //if (gene_idx >= 1833)
191 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
192
193 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
194 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
195 if (status < 12) {
196 skippedLinesCounter++;
197 goto next_read;
198 }
199
200 // if the read is occurring several times elsewhere then get rid of it
201 //if(!(occurrence >= 1 && occurrence <= 25)) {
202 if ( occurrence != 1 ) {
203 multioccurReadCtr++;
204 goto next_read;
205 }
206
207 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
208
209 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
210 gene_idx++;
211 exon_idx = 1;
212 currentGene = (*allGenes)[gene_idx];
213 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
214 uo = dov = 0;
215 continue;
216 }
217
218 if ( pos < currentGene->start ) { // go to next read
219 skippedReadCtr++;
220
221 next_read:
222
223 while (*(char*)linePtr != '\n') linePtr++;
224 linePtr++;
225 readCtr += 1;
226 current_line = strncpy(current_line,linePtr,256);
227 continue;
228 }
229
230 } else { // read IS within gene borders
231
232 exon_label:
233
234 if (exon_idx == currentGene->num_exons) {
235 gene_idx++;
236 exon_idx = 1;
237 currentGene = (*allGenes)[gene_idx];
238 continue;
239 }
240
241 prev_exon_start = currentGene->exon_starts[exon_idx-1];
242 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
243 cur_exon_start = currentGene->exon_starts[exon_idx];
244
245 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
246
247 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
248 exon_idx++;
249
250 if (uo != 0 && dov != 0)
251 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
252
253 uo = dov = 0;
254 goto exon_label;
255 }
256
257 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
258 exonicReadCtr++;
259 remove_ambiguities(seq,strlen(seq),disamb_seq);
260 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
261 goto next_read;
262 }
263
264 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
265 goto next_read;
266
267 // if this position is reached the read is somehow overlapping at
268 // the exon boundary. now determine where exactly:
269
270 if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
271 prevOverlapCtr++;
272 upstream_overlap[uo] = linePtr;
273 uo++;
274 goto next_read;
275 }
276
277 if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
278 currentOverlapCtr++;
279 downstream_overlap[dov] = linePtr;
280 dov++;
281 goto next_read;
282 }
283
284 uselessReadCtr++;
285 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
286 }
287 }
288
289 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
290
291 free(upstream_overlap);
292 free(downstream_overlap);
293
294 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
295 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
296 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
297 printf("%d reads were totally exonic\n",exonicReadCtr);
298 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
299 printf("%d reads where newly combined from two original reads\n",combined_reads);
300 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
301
302 status = munmap(reads_area,reads_filesize);
303 if(status != 0)
304 perror("munmap");
305
306 //free(current_line);
307 free(seq);
308 free(prb);
309 free(cal_prb);
310 free(chastity);
311 }
312
313 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
314 //printf("up/down size is %d/%d\n",up_size,down_size);
315
316 int up_idx, down_idx, status;
317
318 char* upstream_line = malloc(sizeof(char)*256);
319 char* downstream_line = malloc(sizeof(char)*256);
320
321 int buffer_size= 64;
322
323 int up_chr = 0;
324 int up_pos = 0;
325 char* up_seq = malloc(sizeof(char)*buffer_size);
326 int up_id = 0;
327 char up_strand = ' ';
328 int up_mismatch = 0;
329 int up_occurrence = 0;
330 int up_sz = 0;
331 int up_cut = 0;
332 char* up_prb = malloc(sizeof(char)*buffer_size);
333 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
334 char* up_chastity = malloc(sizeof(char)*buffer_size);
335
336 int down_chr = 0;
337 int down_pos = 0;
338 char* down_seq = malloc(sizeof(char)*buffer_size);
339 int down_id = 0;
340 char down_strand = ' ';
341 int down_mismatch = 0;
342 int down_occurrence = 0;
343 int down_sz = 0;
344 int down_cut = 0;
345 char* down_prb = malloc(sizeof(char)*buffer_size);
346 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
347 char* down_chastity = malloc(sizeof(char)*buffer_size);
348
349 int new_chr = 0;
350 char* new_seq = malloc(sizeof(char)*buffer_size);
351 char new_strand = ' ';
352 char* new_prb = malloc(sizeof(char)*buffer_size);
353 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
354 char* new_chastity = malloc(sizeof(char)*buffer_size);
355
356 char* new_up_seq = malloc(sizeof(char)*read_size);
357 char* new_down_seq = malloc(sizeof(char)*read_size);
358
359 int overlap;
360 int fit;
361
362 char* used_flag = calloc(down_size,sizeof(char));
363
364 for(up_idx=0;up_idx<up_size;up_idx++) {
365 strncpy(upstream_line,upstream[up_idx],256);
366 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
367 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
368 &up_cut,up_prb,up_cal_prb,up_chastity);
369
370 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
371
372 overlap = exon_stop - up_pos;
373
374 for(down_idx=0;down_idx<down_size;down_idx++) {
375 if( used_flag[down_idx] == 1)
376 continue;
377
378 strncpy(downstream_line,downstream[down_idx],256);
379 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
380 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
381 &down_cut,down_prb,down_cal_prb,down_chastity);
382
383 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
384
385 new_seq[0] = '\0';
386 new_prb[0] = '\0';
387 new_cal_prb[0] = '\0';
388 new_chastity[0] = '\0';
389
390 fit = fitting(up_prb+(36-overlap),down_prb);
391 if (fit == -1)
392 continue;
393
394 if (! (fit < overlap ))
395 continue;
396
397 new_chr = up_chr;
398 new_strand = up_strand;
399
400 strncat(new_seq,new_up_seq,overlap);
401 strncat(new_prb,up_prb,overlap);
402 strncat(new_cal_prb,up_cal_prb,overlap);
403 strncat(new_chastity,up_chastity,overlap);
404
405 strncat(new_seq,new_down_seq+fit,read_size-overlap);
406 strncat(new_prb,down_prb+fit,read_size-overlap);
407 strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
408 strncat(new_chastity,down_chastity+fit,read_size-overlap);
409
410 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
411 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
412
413 combined_reads++;
414 used_flag[down_idx] = 1;
415 }
416 }
417
418 free(upstream_line);
419 free(downstream_line);
420
421 free(new_up_seq);
422 free(new_down_seq);
423
424 free(up_seq);
425 free(up_prb);
426 free(up_cal_prb);
427 free(up_chastity);
428
429 free(down_seq);
430 free(down_prb);
431 free(down_cal_prb);
432 free(down_chastity);
433
434 free(new_seq);
435 free(new_prb);
436 free(new_cal_prb);
437 free(new_chastity);
438 }
439
440 int fitting(char* up_prb, char* down_prb) {
441 double epsilon_mean = 30.0;
442 double epsilon_var = 30.0;
443 int w_size = 6;
444
445 double current_mean_up = 0;
446 double current_variance_up = 0;
447 double current_mean_down = 0;
448 double current_variance_down = 0;
449
450 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
451 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
452 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
453 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
454
455 int iidx = -1;
456 int uidx;
457 int didx;
458
459 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
460 didx = uidx+w_size;
461 current_mean_up += up_prb[uidx]-50;
462 current_mean_down += up_prb[didx]-50;
463
464 }
465 current_mean_up /= w_size;
466 current_mean_down /= w_size;
467
468 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
469 didx = uidx+w_size;
470 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
471 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
472
473 }
474 current_variance_up /= (w_size-1);
475 current_variance_down /= (w_size-1);
476
477 for(iidx=w_size;iidx<30;iidx++) {
478 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
479 didx = uidx+w_size;
480 mean_up[iidx-w_size] += down_prb[uidx]-50;
481 mean_down[iidx-w_size] += down_prb[didx]-50;
482
483 }
484 mean_up[iidx-w_size] /= w_size;
485 mean_down[iidx-w_size] /= w_size;
486
487 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
488 didx = uidx+w_size;
489 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
490 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
491 }
492 variance_up[iidx-w_size] /= (w_size-1);
493 variance_down[iidx-w_size] /= (w_size-1);
494
495 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
496 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
497 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
498 // return iidx;
499 }
500
501 int bidx;
502 int bestIdx = -1;
503 double min = 1000.0;
504 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
505 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
506 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
507 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
508 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
509 bestIdx = bidx;
510 }
511 }
512 }
513
514 free(mean_up);
515 free(variance_up);
516 free(mean_down);
517 free(variance_down);
518
519 return bestIdx;
520 }
521
522 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
523 //printf("old seq: %s\n",old_seq);
524 //printf("new seq: %s\n",new_seq);
525
526 int idx=0;
527 int new_idx = 0;
528
529 while(idx<old_seq_size) {
530 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
531 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
532 idx++;
533 continue;
534 }
535
536 if (old_seq[idx] == '[') {
537 idx += 2;
538 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
539 continue;
540 }
541
542 new_seq[new_idx] = old_seq[idx];
543 idx++;
544 new_idx++;
545 }
546
547 if (new_idx != 36) {
548 printf("Error: Sequence is not of length 36!\n");
549 printf("old seq: %s\n",old_seq);
550 printf("new seq: %s\n",new_seq);
551 exit(EXIT_FAILURE);
552 }
553 }
554
555 /*
556 * TODO:
557 * - Check strand -> done simple (only if equal)
558 * - check for [AC] and similar entries -> done simple (see function
559 * - remove_ambiguities (exchanges [XY] by the second entry)
560 */