From 811eb349c8f8b7348a9ae0e909b1092316ce07c7 Mon Sep 17 00:00:00 2001 From: fabio Date: Wed, 19 Dec 2007 11:43:54 +0000 Subject: [PATCH] + added processing function for reads TODO - figure out how to use sscanf together with mmap git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7170 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- tools/data_tools/Makefile | 6 +- tools/data_tools/datastructures.c | 15 ++- tools/data_tools/filterReads.c | 188 +++++++++++++++++++++++++++--- tools/data_tools/parser.c | 36 +++--- 4 files changed, 203 insertions(+), 42 deletions(-) diff --git a/tools/data_tools/Makefile b/tools/data_tools/Makefile index 71b500f..618d9a2 100644 --- a/tools/data_tools/Makefile +++ b/tools/data_tools/Makefile @@ -2,8 +2,10 @@ SRCS= parser.c\ datastructures.c - OBJS = $(SRCS:%.cpp=%.o) all: $(OBJS) - gcc -Wall -o filterReads $(OBJS) filterReads.c + gcc -O3 -Wall -o filterReads $(OBJS) filterReads.c + +clean: + rm -rf *.o diff --git a/tools/data_tools/datastructures.c b/tools/data_tools/datastructures.c index 2f464c4..dd9e48e 100644 --- a/tools/data_tools/datastructures.c +++ b/tools/data_tools/datastructures.c @@ -9,8 +9,6 @@ struct gene* gene_alloc(void) { newGene->exon_stops = malloc(2*sizeof(int)); newGene->num_exons = 0; newGene->max_exons = 2; - newGene->start = -1; - return newGene; } @@ -20,14 +18,15 @@ void free_gene(struct gene* oldGene) { } void add_exon(struct gene* currentGene,int start, int stop) { + int idx = currentGene->num_exons; - if (currentGene->num_exons < currentGene->num_exons) { - int idx = currentGene->num_exons; - currentGene->exon_starts[idx] = start; - currentGene->exon_stops[idx] = stop; - currentGene->num_exons++; - } else { + if ( idx >= currentGene->max_exons) { currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons); currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons); + currentGene->max_exons *= 2; } + + currentGene->exon_starts[idx] = start; + currentGene->exon_stops[idx] = stop; + currentGene->num_exons++; } diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c index 4e7ec9f..6044142 100644 --- a/tools/data_tools/filterReads.c +++ b/tools/data_tools/filterReads.c @@ -4,7 +4,6 @@ // // // -// //////////////////////////////////////////////////////////// #include @@ -14,7 +13,9 @@ #include "datastructures.h" -void parse_gff(char* filename,FILE* fid,struct gene* allGenes); +int parse_gff(char* filename,FILE* fid,struct gene*** allGenes); + +void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes); static char *info = "Usage is:\n./filterReads gff reads"; @@ -27,6 +28,7 @@ int main(int argc, char* argv[]) { //static size_t page_size; // page_size = (size_t) sysconf (_SC_PAGESIZE); + int status; int filenameSize = 256; char *gff_filename = malloc(sizeof(char)*filenameSize); char *reads_filename = malloc(sizeof(char)*filenameSize); @@ -47,34 +49,188 @@ int main(int argc, char* argv[]) { exit(EXIT_FAILURE); } - struct gene **allGenes; - parse_gff(gff_filename,gff_fs,allGenes); + struct gene** allGenes; + int numGenes = parse_gff(gff_filename,gff_fs,&allGenes); + status = fclose(gff_fs); + if(status != 0) + printf("closing of gff filestream failed!\n"); + process_reads(reads_fs,&allGenes,numGenes); + + status += fclose(reads_fs); + if(status != 0) + printf("closing of filestreams failed!\n"); + + //free(allGenes); + free(gff_filename); + free(reads_filename); + return 0; +} + + +void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) { + int status; + + int buffer_size= 64; + int chr = 0; + int pos = 0; + char* seq = malloc(sizeof(char)*buffer_size); + int id = 0; + char strand = ' '; + int mismatch = 0; + int repetition = 0; + int size = 0; + int cut = 0; + char* prb = malloc(sizeof(char)*buffer_size); + char* cal_prb = malloc(sizeof(char)*buffer_size); + char* chastity = malloc(sizeof(char)*buffer_size); int reads_fid = fileno(reads_fs); - size_t mmapAreaSize = 16; + size_t mmapAreaSize = 20000; + void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0); if((long int) reads_area == -1) printf("mmapping failed!\n"); + void* linePtr = reads_area; + char *test_string = malloc(sizeof(char)*10); strncpy(test_string,(char*)reads_area,10); - //printf("Reads area at %s\n",test_string); - int status = munmap(reads_area,mmapAreaSize); + status = munmap(reads_area,mmapAreaSize); if(status != 0) printf("munmap failed!\n"); - status = fclose(gff_fs); - status += fclose(reads_fs); - if(status != 0) - printf("closing of filestreams failed!\n"); + void** upstream_end; + void** upstream_overlap; + + void** downstream_start; + void** downstream_overlap; + + int ue,uo,ds,dov; - printf("sizeof(struct gene) %d\n",sizeof(struct gene)); - printf("sizeof(struct read) %d\n",sizeof(struct read)); + int SIZE = 50; + + int skippedLinesCounter = 0; + + printf("Entering loop...\n"); + + int gene_idx; + for (gene_idx = 0; gene_idx < numGenes;) { + struct gene* currentGene = (*allGenes)[gene_idx]; + + int exon_idx; + for(exon_idx=1;exon_idxnum_exons-1;exon_idx++) { + int prev_exon_stop = currentGene->exon_stops[exon_idx-1]; + int cur_exon_start = currentGene->exon_starts[exon_idx]; + + if (cur_exon_start - prev_exon_stop < 6) + continue; + + // initialize boundary arrays + upstream_end = malloc(sizeof(void*)*SIZE); + upstream_overlap = malloc(sizeof(void*)*SIZE); + downstream_start = malloc(sizeof(void*)*SIZE); + downstream_overlap= malloc(sizeof(void*)*SIZE); + ue = 0; + uo = 0; + ds = 0; + dov = 0; + + printf("Fetching reads line...\n"); + + label: + + status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); + if (status < 12) { + skippedLinesCounter++; + continue; + } + + printf("read line!"); + + if (currentGene->start <= pos && pos+35 <= currentGene->stop) { + printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos); + + // upstream ending + if (pos+35 == prev_exon_stop) { + upstream_end[ue] = linePtr; + ue++; + } + + // upstream overlapping + if (prev_exon_stop - pos <= 30) { + upstream_overlap[uo] = linePtr; + uo++; + } + + // downstream starting + if (pos == cur_exon_start) { + downstream_start[ds] = linePtr; + ds++; + } + + // downstream overlapping + if (cur_exon_start - pos >= 6) { + downstream_overlap[dov] = linePtr; + dov++; + } + + while (*(char*)linePtr != '\n') { + printf("linePtr is %d",linePtr); + linePtr++; + } + + linePtr++; + goto label; + + free(upstream_end); + free(upstream_overlap); + free(downstream_start); + free(downstream_overlap); + } + + while (*(char*)linePtr != '\n') { + printf("linePtr is %d",linePtr); + linePtr++; + } + + linePtr++; + goto label; + + } + + gene_idx++; + } + printf("skipped %d lines.\n",skippedLinesCounter); - free(gff_filename); - free(reads_filename); - return 0; + free(seq); + free(prb); + free(cal_prb); + free(chastity); } + +// int skippedLinesCounter = 0; +// while(1) { +// status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); +// if(status == EOF) +// break; +// +// if (status < 12) { +// skippedLinesCounter++; +// continue; +// } +// +// label: +// +// if (pos < currentGene->start) +// continue; +// +// if (currentGene->stop < pos) { +// gene_idx++; +// currentGene = (*allGenes)[gene_idx]; +// goto label; +// } +// diff --git a/tools/data_tools/parser.c b/tools/data_tools/parser.c index e686cf3..2783157 100644 --- a/tools/data_tools/parser.c +++ b/tools/data_tools/parser.c @@ -4,7 +4,7 @@ #include "datastructures.h" -void parse_gff(char *filename, FILE* fid,struct gene** allGenes) { +int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) { int buffer_size = 256; char* chr = malloc(sizeof(char)*buffer_size); @@ -32,13 +32,13 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) { } freopen(filename,"r",fid); - printf("Found %d genes!\n",numGenes); + //printf("Found %d genes!\n",numGenes); - allGenes = malloc(sizeof(struct gene)*numGenes); - struct gene *currentGene = gene_alloc(); + int idx = 0; + (*allGenes) = malloc(sizeof(struct gene*)*numGenes); + (*allGenes)[idx] = NULL; int skippedLinesCounter = 0; - int idx = 0; while(1) { status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc); if(status == EOF) @@ -50,35 +50,36 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) { } if (strcmp(id,"gene")==0) { - if ( currentGene->start != -1) - allGenes[idx] = currentGene; + if ( (*allGenes)[idx] !=NULL ) idx++; - currentGene = gene_alloc(); - currentGene->start = start; - currentGene->stop = stop; - currentGene->strand = (*strand); + (*allGenes)[idx] = gene_alloc(); + (*allGenes)[idx]->start = start; + (*allGenes)[idx]->stop = stop; + (*allGenes)[idx]->strand = (*strand); //printf("gene start/stop: %d/%d\n",start,stop); continue; } if (strcmp(id,"exon")==0) { - add_exon(currentGene,start,stop); + add_exon((*allGenes)[idx],start,stop); //printf("exon start/stop: %d/%d\n",start,stop); continue; } if (strcmp(id,"pseudogene")==0) { - if ( currentGene->start != -1) - allGenes[idx] = currentGene; + if ( (*allGenes)[idx] !=NULL ) idx++; } } - if ( currentGene->start != -1) - allGenes[idx] = currentGene; + if ( (*allGenes)[idx] !=NULL ) idx++; + //printf("allGenes[0] is %d\n",(*allGenes)[0]); + //printf("allGenes[1] is %d\n",(*allGenes)[1]); + //printf("Skipped %d lines.\n",skippedLinesCounter); + free(chr); free(blah); free(id); @@ -86,4 +87,7 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) { free(xy); free(strand); free(xz); + + return numGenes; } + -- 2.20.1