+ minor changes to filterReads
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26 void sort_genes(struct gene*** allGenes, int numGenes);
27 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
28 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
29 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
30 int fitting(char* up_prb, char* down_prb, int u_off, int d_off);
31 void remove_ambiguities(char * old_seq, char* new_seq);
32
33 static char *info = "Usage is:\n./filterReads gff reads output";
34
35 /*
36 * Some constants specifying the exact behavior of the filter
37 *
38 */
39
40 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
41
42 const int read_size = 36;
43
44 const int min_overlap = 6;
45 const int max_overlap = 30;
46
47 int read_nr = 1;
48
49 int combined_reads = 0;
50
51 int main(int argc, char* argv[]) {
52
53 if(argc != 5) {
54 printf("%s\n",info);
55 exit(EXIT_FAILURE);
56 }
57
58 int status;
59 int filenameSize = 256;
60 char* gff_filename = malloc(sizeof(char)*filenameSize);
61 char* reads_filename = malloc(sizeof(char)*filenameSize);
62 char* output_filename = malloc(sizeof(char)*filenameSize);
63
64
65 strncpy(gff_filename,argv[1],filenameSize);
66 strncpy(reads_filename,argv[2],filenameSize);
67 strncpy(output_filename,argv[3],filenameSize);
68
69 FILE *gff_fs = fopen(gff_filename,"r");
70 FILE *reads_fs = fopen(reads_filename,"r");
71 FILE *out_fs = fopen(output_filename,"w");
72
73 read_nr = atoi(argv[4]);
74 read_nr++;
75
76 if(gff_fs == NULL) {
77 printf("Error: Could not open file: %s",gff_filename);
78 exit(EXIT_FAILURE);
79 }
80
81 if(reads_fs == NULL) {
82 printf("Error: Could not open file: %s",reads_filename);
83 exit(EXIT_FAILURE);
84 }
85
86 if(out_fs == NULL) {
87 printf("Error: Could not open file: %s",output_filename);
88 exit(EXIT_FAILURE);
89 }
90
91 struct gene** allGenes;
92 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
93 status = fclose(gff_fs);
94 free(gff_filename);
95 if(status != 0)
96 printf("closing of gff filestream failed!\n");
97
98 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
99
100 // check if allGenes is sorted. if not throw away those genes that do not
101 // occur in the sorted order
102 int g_idx;
103 struct gene* currentGene = 0;
104 int nulled_genes=0;
105 int old_gene_stop = -1;
106 for(g_idx=0;g_idx<numGenes;g_idx++) {
107 currentGene = allGenes[g_idx];
108
109 if (! (currentGene->start < currentGene->stop))
110 printf("Invalid positions for gene %s!\n",currentGene->id);
111
112 if (! (old_gene_stop < currentGene->start ) ) {
113 old_gene_stop = currentGene->stop;
114 allGenes[g_idx] = 0;
115 nulled_genes++;
116 continue;
117 }
118 old_gene_stop = currentGene->stop;
119 }
120
121 printf("Found %d unordered genes.\n",nulled_genes);
122 int gidx, eidx;
123 int exon_cov = 0;
124 for(gidx=0;gidx<numGenes;gidx++) {
125 if (allGenes[gidx] == 0)
126 continue;
127
128 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
129 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
130 }}
131 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
132
133 process_reads(reads_fs,&allGenes,numGenes,out_fs);
134
135 for(g_idx=0;g_idx<numGenes;g_idx++) {
136 if(allGenes[g_idx] != 0)
137 free_gene(allGenes[g_idx]);
138 free(allGenes[g_idx]);
139
140 }
141 free(allGenes);
142
143 status = fclose(reads_fs);
144 status = fclose(out_fs);
145 if(status != 0)
146 perror("fclose");
147
148 free(reads_filename);
149 free(output_filename);
150 return 0;
151 }
152
153 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
154 int status;
155
156 int buffer_size= 64;
157 int chr = 0;
158 int pos = 0;
159 char* seq = malloc(sizeof(char)*buffer_size);
160 unsigned long id = 0;
161 char strand = ' ';
162 int mismatch = 0;
163 int occurrence = 0;
164 int size = 0;
165 int cut = 0;
166 char* prb = malloc(sizeof(char)*buffer_size);
167 char* cal_prb = malloc(sizeof(char)*buffer_size);
168 char* chastity = malloc(sizeof(char)*buffer_size);
169
170 int reads_fid = fileno(reads_fs);
171 struct stat reads_stat;
172 if ( fstat(reads_fid,&reads_stat) == -1) {
173 perror("fstat");
174 exit(EXIT_FAILURE);
175 }
176 off_t reads_filesize = reads_stat.st_size;
177 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
178 int numReads = reads_filesize / 178.0;
179
180 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
181 if (reads_area == MAP_FAILED) {
182 perror("mmap");
183 exit(EXIT_FAILURE);
184 }
185 close(reads_fid);
186 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
187
188 char* lineBeginPtr = (char*) reads_area;
189 char* lineEndPtr = (char*) reads_area;
190 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
191
192 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
193 lineEndPtr++;
194
195 char* current_line = malloc(sizeof(char)*512);
196 memset(current_line,0,512);
197
198 int SIZE = 500;
199 // initialize boundary arrays
200 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
201 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
202 int uov,dov;
203 uov = dov = 0;
204
205 int skippedLinesCounter = 0;
206
207 int prev_exon_start = -1;
208 int prev_exon_stop = -1;
209 int cur_exon_start = -1;
210
211 unsigned long line_size = lineEndPtr - lineBeginPtr;
212 //printf("distance is %lu\n",line_size);
213 strncpy(current_line,lineBeginPtr,line_size);
214 current_line[line_size] = '\0';
215 //printf("%s test",current_line);
216
217 int gene_idx = 0;
218 int exon_idx = 1;
219 struct gene* currentGene = (*allGenes)[gene_idx];
220 char* gene_id = currentGene->id;
221
222 int skippedReadCtr = 0;
223 int uselessReadCtr = 0;
224 int exonicReadCtr = 0;
225 int endPrevCtr = 0;
226 int prevOverlapCtr = 0;
227 int currentStartCtr = 0;
228 int currentOverlapCtr = 0;
229 int multioccurReadCtr = 0;
230
231 Read* currentRead;
232 int up_idx, down_idx;
233
234 int readCtr = 0;
235 // start of the parsing loop
236 while(1) {
237 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
238 break;
239
240 gene_id = currentGene->id;
241
242 if (readCtr != 0 && readCtr % 1000000 == 0)
243 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
244
245 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
246 if (status < 12) {
247 skippedLinesCounter++;
248 goto next_read;
249 }
250
251 // if the read is occurring several times elsewhere then get rid of it
252 if ( occurrence != 1 ) {
253 multioccurReadCtr++;
254 goto next_read;
255 }
256
257 FA(strlen(seq) >= read_size);
258
259 FA(currentGene != 0);
260
261 if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is not within gene borders
262
263 exon_label:
264
265 if (exon_idx == currentGene->num_exons) {
266 gene_idx++;
267 exon_idx = 1;
268 currentGene = (*allGenes)[gene_idx];
269 while( currentGene == 0 && gene_idx < numGenes) {
270 currentGene = (*allGenes)[gene_idx];
271 gene_idx++;
272 }
273 continue;
274 }
275
276 prev_exon_start = currentGene->exon_starts[exon_idx-1];
277 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
278 cur_exon_start = currentGene->exon_starts[exon_idx];
279
280 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
281
282 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
283
284 if (uov != 0 && dov != 0)
285 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
286
287 for(up_idx=0;up_idx<uov;up_idx++) {
288 free_read(upstream_overlap[up_idx]);
289 free(upstream_overlap[up_idx]);
290 }
291
292 for(down_idx=0;down_idx<dov;down_idx++) {
293 free_read(downstream_overlap[down_idx]);
294 free(downstream_overlap[down_idx]);
295 }
296
297 uov = dov = 0;
298
299 exon_idx++;
300 goto exon_label;
301 }
302
303 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
304 exonicReadCtr++;
305 goto next_read;
306 }
307
308 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
309 goto next_read;
310
311 // if this position is reached the read is somehow overlapping or
312 // exactly on the exon boundary.
313 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
314 //printf("%s\n",current_line);
315 prevOverlapCtr++;
316 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
317 upstream_overlap[uov] = currentRead;
318 uov++;
319 goto next_read;
320 }
321
322 int end_pos = pos+read_size-1;
323 if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
324 //printf("%s\n",current_line);
325 currentOverlapCtr++;
326 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
327 downstream_overlap[dov] = currentRead;
328 dov++;
329 goto next_read;
330 }
331
332 uselessReadCtr++;
333 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
334
335 } else { // read is not within gene borders
336
337 if (uov != 0 && dov != 0)
338 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
339
340 for(up_idx=0;up_idx<uov;up_idx++) {
341 free_read(upstream_overlap[up_idx]);
342 free(upstream_overlap[up_idx]);
343 }
344
345 for(down_idx=0;down_idx<dov;down_idx++) {
346 free_read(downstream_overlap[down_idx]);
347 free(downstream_overlap[down_idx]);
348 }
349
350 uov = dov = 0;
351
352 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
353 gene_idx++;
354 exon_idx = 1;
355 currentGene = (*allGenes)[gene_idx];
356 while( currentGene == 0 && gene_idx < numGenes) {
357 currentGene = (*allGenes)[gene_idx];
358 gene_idx++;
359 }
360 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
361 continue;
362 }
363
364 if ( pos < currentGene->start ) { // go to next read
365 skippedReadCtr++;
366
367 next_read:
368
369 lineBeginPtr = lineEndPtr;
370 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
371 lineEndPtr++;
372 readCtr += 1;
373 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
374 current_line[lineEndPtr-lineBeginPtr] = '\0';
375 continue;
376 }
377 }
378 }
379
380 if (uov != 0 && dov != 0)
381 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
382
383 for(up_idx=0;up_idx<uov;up_idx++) {
384 free_read(upstream_overlap[up_idx]);
385 free(upstream_overlap[up_idx]);
386 }
387
388 for(down_idx=0;down_idx<dov;down_idx++) {
389 free_read(downstream_overlap[down_idx]);
390 free(downstream_overlap[down_idx]);
391 }
392
393 uov = dov = 0;
394
395 free(upstream_overlap);
396 free(downstream_overlap);
397
398 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
399 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
400 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
401 printf("%d reads were totally exonic\n",exonicReadCtr);
402 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
403 printf("%d reads where newly combined from two original reads\n",combined_reads);
404 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
405
406 status = munmap(reads_area,reads_filesize);
407 if(status != 0)
408 perror("munmap");
409
410 free(current_line);
411 free(seq);
412 free(prb);
413 free(cal_prb);
414 free(chastity);
415 }
416
417 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
418
419 //printf("up/down size is %d/%d\n",up_size,down_size);
420
421 int up_idx, down_idx, success;
422
423 char* up_used_flag = calloc(up_size,sizeof(char));
424 char* down_used_flag = calloc(down_size,sizeof(char));
425
426 Read* currentUpRead;
427 Read* currentDownRead;
428
429 for(up_idx=0;up_idx<up_size;up_idx++) {
430 if( up_used_flag[up_idx] == 1)
431 continue;
432
433 currentUpRead = upstream[up_idx];
434
435 for(down_idx=0;down_idx<down_size;down_idx++) {
436
437 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
438 continue;
439
440 currentDownRead = downstream[down_idx];
441
442 if(currentUpRead->strand != currentDownRead->strand)
443 continue;
444
445 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
446
447 if(success == 1) {
448 up_used_flag[up_idx] = 1;
449 down_used_flag[down_idx] = 1;
450 }
451
452 }
453 }
454
455 free(up_used_flag);
456 free(down_used_flag);
457 }
458
459 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
460 int buffer_size = read_size+1;
461
462 char* new_seq = malloc(sizeof(char)*buffer_size);
463 char* new_prb = malloc(sizeof(char)*buffer_size);
464 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
465 char* new_chastity = malloc(sizeof(char)*buffer_size);
466
467 char* new_up_seq = malloc(sizeof(char)*(read_size+1));
468 char* new_down_seq = malloc(sizeof(char)*(read_size+1));
469 memset(new_up_seq,0,sizeof(char)*read_size);
470 memset(new_down_seq,0,sizeof(char)*read_size);
471
472 // range of possible sequence length on exon side
473 int up_range = exon_stop - up_read->pos+1;
474 int down_range = down_read->pos+(read_size-1) - exon_start+1;
475
476 int u_off, d_off, u_size, d_size;
477 u_size = -1;
478 d_size = -1;
479
480 //printf("possible range %d %d\n",up_range,down_range);
481
482 if(up_range+down_range < read_size)
483 return 0;
484
485 int half_read = read_size / 2;
486
487 // both overlap more than a half of the new read
488 if(up_range >= half_read && down_range >= half_read) {
489
490 u_size = half_read;
491 d_size = half_read;
492
493 } else {
494
495 if(up_range < down_range) {
496 u_size = up_range;
497 d_size = read_size - u_size;
498 }
499
500 if(up_range > down_range) {
501 d_size = down_range;
502 u_size = read_size - d_size;
503 }
504 }
505
506 int p_start = exon_stop - u_size + 1;
507 int p_stop = exon_start + d_size - 1;
508
509 u_off = p_start - up_read->pos;
510 d_off = exon_start - down_read->pos;
511
512 FA(u_size != -1);
513 FA(d_size != -1);
514 FA(u_size + d_size == read_size);
515
516 //printf("%lu %lu\n",up_read->id,down_read->id);
517
518 remove_ambiguities(up_read->seq,new_up_seq);
519 remove_ambiguities(down_read->seq,new_down_seq);
520
521 strncpy(new_seq,new_up_seq+u_off,u_size);
522 strncpy(new_seq+u_size,new_down_seq+d_off,d_size);
523 new_seq[read_size] = '\0';
524
525 strncpy(new_prb, up_read->prb+u_off, u_size);
526 strncpy(new_prb+u_size, down_read->prb+d_off, d_size);
527 new_prb[read_size] = '\0';
528
529 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
530 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size);
531 new_cal_prb[read_size] = '\0';
532
533 strncpy(new_chastity, up_read->chastity+u_off, u_size);
534 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size);
535 new_chastity[read_size] = '\0';
536
537 int status = fitting(up_read->prb,down_read->prb,u_off,d_off);
538
539 if(status != 1)
540 return status;
541
542 //if( read_nr == 13 || read_nr == 14 ) {
543 // printf("read nr %d",read_nr);
544 // printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
545 // printf("u/d range: %d %d\n",up_range,down_range);
546 // printf("u/d size: %d %d\n",u_size,d_size);
547 // printf("u/d off: %d %d\n",u_off,d_off);
548 // printf("******************\n");
549
550 // printf("%s\n",up_read->seq);
551 // printf("%s\n",down_read->seq);
552
553 // printf("******************\n");
554
555 // printf("%s\n",new_up_seq);
556 // printf("%s\n",new_down_seq);
557
558 // printf("******************\n");
559 // printf("%s\n",new_seq);
560 // printf("%s\n",new_prb);
561 // printf("%s\n",new_cal_prb);
562 // printf("%s\n",new_chastity);
563 //}
564
565 //// The four last integers specify the positions of
566 //// p_start : the position in the dna where the first read starts
567 //// exons_stop : the position in the dna where the first exons ends
568 //// exons_start : the position in the dna where the second exons starts
569 //// p_stop : the position in the dna where the (truncated) second read ends
570
571 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
572 read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
573
574 read_nr++;
575 combined_reads++;
576
577 free(new_up_seq);
578 free(new_down_seq);
579
580 free(new_seq);
581 free(new_prb);
582 free(new_cal_prb);
583 free(new_chastity);
584
585 return 1;
586 }
587
588 int fitting(char* up_prb, char* down_prb, int u_off, int d_off) {
589 double current_mean_up = 0;
590 double current_mean_down = 0;
591
592 int w_size = 3;
593
594 int idx;
595
596 for(idx=0;idx<w_size;idx++) {
597 current_mean_up += up_prb[u_off+idx]-50;
598 current_mean_up += up_prb[u_off-idx]-50;
599 current_mean_up -= up_prb[idx]-50;
600
601 current_mean_down += up_prb[d_off+idx]-50;
602 current_mean_down += up_prb[d_off-idx]-50;
603 current_mean_down -= up_prb[idx]-50;
604 }
605
606 current_mean_up /= (2*w_size+1);
607 current_mean_down /= (2*w_size+1);
608
609 double ratio;
610 if( current_mean_up > current_mean_down)
611 ratio = current_mean_down / current_mean_up;
612 else
613 ratio = current_mean_up / current_mean_down;
614
615 if (ratio < 0.5)
616 return 0;
617
618 return 1;
619 }
620
621 void remove_ambiguities(char * old_seq, char* new_seq) {
622 int idx=0;
623 int new_idx = 0;
624 int old_seq_size = strlen(old_seq);
625
626 while(idx<old_seq_size) {
627 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
628 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
629 idx++;
630 continue;
631 }
632
633 if (old_seq[idx] == '[') {
634 idx += 2;
635 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
636 continue;
637 }
638
639 new_seq[new_idx] = old_seq[idx];
640 idx++;
641 new_idx++;
642 //printf("%s\t%s\n",old_seq,new_seq);
643 //printf("%d\t%d\n",idx,new_idx);
644 }
645
646 if (new_idx != read_size) {
647 printf("Error: Sequence is not of length 36!\n");
648 printf("old seq: %s\n",old_seq);
649 printf("new seq: %s\n",new_seq);
650 exit(EXIT_FAILURE);
651 }
652
653 new_seq[read_size] = '\0';
654 }
655
656 void print_read(Read* cRead) {
657
658 printf(line_format,
659 cRead->chr, cRead->pos, cRead->seq, cRead->id,
660 cRead->strand, cRead->mismatch, cRead->occurrence,
661 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
662 cRead->chastity);
663 }
664
665 /*
666 * TODO:
667 * - Check strand -> done simple (only if equal)
668 * - check for [AC] and similar entries -> done simple (see function
669 * - remove_ambiguities (exchanges [XY] by the second entry)
670 */