+ restored functionality
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26
27 void sort_genes(struct gene*** allGenes, int numGenes);
28
29 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
30
31 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
32
33 //int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
34 int fitting(char* up_prb, char* down_prb);
35
36 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
37
38 static char *info = "Usage is:\n./filterReads gff reads output";
39
40 /*
41 * Some constants specifying the exact behavior of the filter
42 *
43 */
44
45 const int read_size = 36;
46
47 const int min_overlap = 6;
48 const int max_overlap = 30;
49
50
51 int combined_reads = 0;
52 int main(int argc, char* argv[]) {
53
54 if(argc != 4) {
55 printf("%s\n",info);
56 exit(EXIT_FAILURE);
57 }
58
59 int status;
60 int filenameSize = 256;
61 char* gff_filename = malloc(sizeof(char)*filenameSize);
62 char* reads_filename = malloc(sizeof(char)*filenameSize);
63 char* output_filename = malloc(sizeof(char)*filenameSize);
64
65 strncpy(gff_filename,argv[1],filenameSize);
66 strncpy(reads_filename,argv[2],filenameSize);
67 strncpy(output_filename,argv[3],filenameSize);
68
69 FILE *gff_fs = fopen(gff_filename,"r");
70 FILE *reads_fs = fopen(reads_filename,"r");
71 FILE *out_fs = fopen(output_filename,"w");
72
73 if(gff_fs == NULL) {
74 printf("Error: Could not open file: %s",gff_filename);
75 exit(EXIT_FAILURE);
76 }
77
78 if(reads_fs == NULL) {
79 printf("Error: Could not open file: %s",reads_filename);
80 exit(EXIT_FAILURE);
81 }
82
83 if(out_fs == NULL) {
84 printf("Error: Could not open file: %s",output_filename);
85 exit(EXIT_FAILURE);
86 }
87
88 struct gene** allGenes;
89 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
90 status = fclose(gff_fs);
91 free(gff_filename);
92 if(status != 0)
93 printf("closing of gff filestream failed!\n");
94
95 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
96
97 // some entries in the gff files are not in a sorted order
98 //printf("Sorting genes...\n");
99 //sort_genes(&allGenes,numGenes);
100 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
101 ///printf("Genes were sorted!\n");
102
103 int gidx, eidx;
104 int exon_cov = 0;
105 for(gidx=0;gidx<numGenes;gidx++) {
106 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
107 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
108 }}
109 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
110
111 process_reads(reads_fs,&allGenes,numGenes,out_fs);
112
113 status = fclose(reads_fs);
114 status = fclose(out_fs);
115 if(status != 0)
116 perror("fclose");
117
118 free(reads_filename);
119 free(output_filename);
120 return 0;
121 }
122
123 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
124 int status;
125
126 int buffer_size= 64;
127 int chr = 0;
128 int pos = 0;
129 char* seq = malloc(sizeof(char)*buffer_size);
130 unsigned long id = 0;
131 char strand = ' ';
132 int mismatch = 0;
133 int occurrence = 0;
134 int size = 0;
135 int cut = 0;
136 char* prb = malloc(sizeof(char)*buffer_size);
137 char* cal_prb = malloc(sizeof(char)*buffer_size);
138 char* chastity = malloc(sizeof(char)*buffer_size);
139
140 int reads_fid = fileno(reads_fs);
141 struct stat reads_stat;
142 if ( fstat(reads_fid,&reads_stat) == -1) {
143 perror("fstat");
144 exit(EXIT_FAILURE);
145 }
146 off_t reads_filesize = reads_stat.st_size;
147 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
148 int numReads = reads_filesize / 178.0;
149
150 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
151 if (reads_area == MAP_FAILED) {
152 perror("mmap");
153 exit(EXIT_FAILURE);
154 }
155 close(reads_fid);
156 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
157
158 void* linePtr = reads_area;
159 char* current_line = malloc(sizeof(char)*256);
160
161 int SIZE = 500;
162 // initialize boundary arrays
163 void** upstream_end = malloc(sizeof(void*)*SIZE);
164 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
165 void** downstream_start = malloc(sizeof(void*)*SIZE);
166 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
167 int ue,uo,ds,dov;
168 ue = uo = ds = dov = 0;
169
170 int skippedLinesCounter = 0;
171
172 int prev_exon_start = -1;
173 int prev_exon_stop = -1;
174 int cur_exon_start = -1;
175
176 current_line = strncpy(current_line,linePtr,256);
177 int gene_idx = 0;
178 int exon_idx = 1;
179 struct gene* currentGene = (*allGenes)[gene_idx];
180 char* gene_id = currentGene->id;
181
182 //char* disamb_seq = malloc(sizeof(char)*read_size);
183
184 int skippedReadCtr = 0;
185 int uselessReadCtr = 0;
186 int exonicReadCtr = 0;
187 int endPrevCtr = 0;
188 int prevOverlapCtr = 0;
189 int currentStartCtr = 0;
190 int currentOverlapCtr = 0;
191 int multioccurReadCtr = 0;
192
193 int readCtr = 0;
194 // start of the parsing loop
195 while(1) {
196 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
197 break;
198
199 gene_id = currentGene->id;
200
201 if (readCtr != 0 && readCtr % 1000000 == 0)
202 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
203
204 //if (gene_idx >= 1833)
205 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
206
207 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
208 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
209 if (status < 12) {
210 skippedLinesCounter++;
211 goto next_read;
212 }
213
214 // if the read is occurring several times elsewhere then get rid of it
215 //if(!(occurrence >= 1 && occurrence <= 25)) {
216 if ( occurrence != 1 ) {
217 multioccurReadCtr++;
218 goto next_read;
219 }
220
221 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
222
223 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
224 gene_idx++;
225 exon_idx = 1;
226 currentGene = (*allGenes)[gene_idx];
227 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
228 ue = uo = ds = dov = 0;
229 continue;
230 }
231
232 if ( pos < currentGene->start ) { // go to next read
233 skippedReadCtr++;
234
235 next_read:
236
237 while (*(char*)linePtr != '\n') linePtr++;
238 linePtr++;
239 readCtr += 1;
240 current_line = strncpy(current_line,linePtr,256);
241 continue;
242 }
243
244 } else { // read IS within gene borders
245
246 exon_label:
247
248 if (exon_idx == currentGene->num_exons) {
249 gene_idx++;
250 exon_idx = 1;
251 currentGene = (*allGenes)[gene_idx];
252 continue;
253 }
254
255 prev_exon_start = currentGene->exon_starts[exon_idx-1];
256 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
257 cur_exon_start = currentGene->exon_starts[exon_idx];
258
259 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
260
261 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
262 exon_idx++;
263
264 if (ue != 0 && dov != 0)
265 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx-1);
266
267 if (uo != 0 && ds != 0)
268 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx-1);
269
270 ue = uo = ds = dov = 0;
271 goto exon_label;
272 }
273
274 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
275 exonicReadCtr++;
276
277 // Removed exonic reads
278
279 //remove_ambiguities(seq,strlen(seq),disamb_seq);
280 //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
281
282 goto next_read;
283 }
284
285 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
286 goto next_read;
287
288 // if this position is reached the read is somehow overlapping or
289 // exactly on the exon boundary. now determine where exactly:
290 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
291 endPrevCtr++;
292 upstream_end[ue] = linePtr;
293 ue++;
294 goto next_read;
295 }
296
297 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap) { // read overlaps with previous exon boundary
298 prevOverlapCtr++;
299 upstream_overlap[uo] = linePtr;
300 uo++;
301 goto next_read;
302 }
303
304 if ( pos == cur_exon_start ) { // read starts at current exon start
305 currentStartCtr++;
306 downstream_start[ds] = linePtr;
307 ds++;
308 goto next_read;
309 }
310
311 if ( (cur_exon_start - pos) >= min_overlap && (cur_exon_start - pos) <= max_overlap) { // read overlaps with current exon boundary
312 currentOverlapCtr++;
313 downstream_overlap[dov] = linePtr;
314 dov++;
315 goto next_read;
316 }
317
318 uselessReadCtr++;
319 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
320 }
321 }
322
323 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
324 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
325
326 free(upstream_end);
327 free(upstream_overlap);
328 free(downstream_start);
329 free(downstream_overlap);
330
331 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
332 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
333 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
334 printf("%d reads were totally exonic\n",exonicReadCtr);
335 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
336 printf("%d reads where newly combined from two original reads\n",combined_reads);
337 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
338
339 status = munmap(reads_area,reads_filesize);
340 if(status != 0)
341 perror("munmap");
342
343 //free(current_line);
344 free(seq);
345 free(prb);
346 free(cal_prb);
347 free(chastity);
348 }
349
350 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
351 //printf("up/down size is %d/%d\n",up_size,down_size);
352
353 int up_idx, down_idx, status;
354 char* upstream_line = malloc(sizeof(char)*256);
355 char* downstream_line = malloc(sizeof(char)*256);
356
357 int buffer_size= 64;
358
359 int up_chr = 0;
360 int up_pos = 0;
361 char* up_seq = malloc(sizeof(char)*buffer_size);
362 int up_id = 0;
363 char up_strand = ' ';
364 int up_mismatch = 0;
365 int up_occurrence = 0;
366 int up_sz = 0;
367 int up_cut = 0;
368 char* up_prb = malloc(sizeof(char)*buffer_size);
369 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
370 char* up_chastity = malloc(sizeof(char)*buffer_size);
371
372 int down_chr = 0;
373 int down_pos = 0;
374 char* down_seq = malloc(sizeof(char)*buffer_size);
375 int down_id = 0;
376 char down_strand = ' ';
377 int down_mismatch = 0;
378 int down_occurrence = 0;
379 int down_sz = 0;
380 int down_cut = 0;
381 char* down_prb = malloc(sizeof(char)*buffer_size);
382 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
383 char* down_chastity = malloc(sizeof(char)*buffer_size);
384
385 int new_chr = 0;
386 char* new_seq = malloc(sizeof(char)*buffer_size);
387 char new_strand = ' ';
388 char* new_prb = malloc(sizeof(char)*buffer_size);
389 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
390 char* new_chastity = malloc(sizeof(char)*buffer_size);
391 char* new_up_seq = malloc(sizeof(char)*read_size);
392 char* new_down_seq = malloc(sizeof(char)*read_size);
393
394 int overlap;
395 int fit;
396
397 char* used_flag = calloc(down_size,sizeof(char));
398
399 for(up_idx=0;up_idx<up_size;up_idx++) {
400 strncpy(upstream_line,upstream[up_idx],256);
401 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
402 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
403 &up_cut,up_prb,up_cal_prb,up_chastity);
404
405 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
406
407 overlap = exon_stop - up_pos;
408
409 for(down_idx=0;down_idx<down_size;down_idx++) {
410 if( used_flag[down_idx] == 1)
411 continue;
412
413 strncpy(downstream_line,downstream[down_idx],256);
414 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
415 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
416 &down_cut,down_prb,down_cal_prb,down_chastity);
417
418 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
419
420 new_seq[0] = '\0';
421 new_prb[0] = '\0';
422 new_cal_prb[0] = '\0';
423 new_chastity[0] = '\0';
424 int splitpos = 0;
425
426 fit = fitting(up_prb+(36-overlap),down_prb);
427 if (fit == -1)
428 continue;
429
430 if (! (fit < overlap ))
431 continue;
432
433 new_chr = up_chr;
434 new_strand = up_strand;
435
436 strncat(new_seq,new_up_seq,overlap);
437 strncat(new_prb,up_prb,overlap);
438 strncat(new_cal_prb,up_cal_prb,overlap);
439 strncat(new_chastity,up_chastity,overlap);
440
441 strncat(new_seq,new_down_seq+fit,read_size-overlap);
442 strncat(new_prb,down_prb+fit,read_size-overlap);
443 strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
444 strncat(new_chastity,down_chastity+fit,read_size-overlap);
445
446 fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\n",
447 new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,exon_idx);
448
449 combined_reads++;
450 used_flag[down_idx] = 1;
451 }
452 }
453
454 free(upstream_line);
455 free(downstream_line);
456
457 free(new_up_seq);
458 free(new_down_seq);
459
460 free(up_seq);
461 free(up_prb);
462 free(up_cal_prb);
463 free(up_chastity);
464
465 free(down_seq);
466 free(down_prb);
467 free(down_cal_prb);
468 free(down_chastity);
469
470 free(new_seq);
471 free(new_prb);
472 free(new_cal_prb);
473 free(new_chastity);
474 }
475
476 int fitting(char* up_prb, char* down_prb) {
477 double epsilon_mean = 30.0;
478 double epsilon_var = 30.0;
479 int w_size = 6;
480
481 double current_mean_up = 0;
482 double current_variance_up = 0;
483 double current_mean_down = 0;
484 double current_variance_down = 0;
485
486 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
487 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
488 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
489 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
490
491 int iidx = -1;
492 int uidx;
493 int didx;
494
495 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
496 didx = uidx+w_size;
497 current_mean_up += up_prb[uidx]-50;
498 current_mean_down += up_prb[didx]-50;
499
500 }
501 current_mean_up /= w_size;
502 current_mean_down /= w_size;
503
504 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
505 didx = uidx+w_size;
506 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
507 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
508
509 }
510 current_variance_up /= (w_size-1);
511 current_variance_down /= (w_size-1);
512
513 for(iidx=w_size;iidx<30;iidx++) {
514 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
515 didx = uidx+w_size;
516 mean_up[iidx-w_size] += down_prb[uidx]-50;
517 mean_down[iidx-w_size] += down_prb[didx]-50;
518
519 }
520 mean_up[iidx-w_size] /= w_size;
521 mean_down[iidx-w_size] /= w_size;
522
523 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
524 didx = uidx+w_size;
525 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
526 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
527 }
528 variance_up[iidx-w_size] /= (w_size-1);
529 variance_down[iidx-w_size] /= (w_size-1);
530
531 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
532 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
533 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
534 // return iidx;
535 }
536
537 int bidx;
538 int bestIdx = -1;
539 double min = 1000.0;
540 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
541 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
542 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
543 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
544 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
545 bestIdx = bidx;
546 }
547 }
548 }
549
550 free(mean_up);
551 free(variance_up);
552 free(mean_down);
553 free(variance_down);
554
555 return bestIdx;
556 }
557
558 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
559 int idx=0;
560 int new_idx = 0;
561
562 while(idx<old_seq_size) {
563 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
564 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
565 idx++;
566 continue;
567 }
568
569 if (old_seq[idx] == '[') {
570 idx += 2;
571 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
572 continue;
573 }
574
575 new_seq[new_idx] = old_seq[idx];
576 idx++;
577 new_idx++;
578 }
579
580 if (new_idx != 36) {
581 printf("Error: Sequence is not of length 36!\n");
582 printf("old seq: %s\n",old_seq);
583 printf("new seq: %s\n",new_seq);
584 exit(EXIT_FAILURE);
585 }
586 }
587
588 /*
589 * TODO:
590 * - Check strand -> done simple (only if equal)
591 * - check for [AC] and similar entries -> done simple (see function
592 * - remove_ambiguities (exchanges [XY] by the second entry)
593 */