From 149f62bd2d75d57fca07b60b1436f05fdd54fd50 Mon Sep 17 00:00:00 2001 From: fabio Date: Thu, 20 Dec 2007 08:18:56 +0000 Subject: [PATCH] + compacted parser code + found conceptual error git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7191 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- tools/data_tools/filterReads.c | 179 ++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 71 deletions(-) diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c index 9398454..9a738e5 100644 --- a/tools/data_tools/filterReads.c +++ b/tools/data_tools/filterReads.c @@ -7,9 +7,11 @@ //////////////////////////////////////////////////////////// #include +#include #include #include #include +#include #include "datastructures.h" @@ -28,8 +30,6 @@ int main(int argc, char* argv[]) { exit(EXIT_FAILURE); } - //static size_t page_size; - // page_size = (size_t) sysconf (_SC_PAGESIZE); int status; int filenameSize = 256; char *gff_filename = malloc(sizeof(char)*filenameSize); @@ -88,7 +88,18 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) { char* chastity = malloc(sizeof(char)*buffer_size); int reads_fid = fileno(reads_fs); - size_t mmapAreaSize = 20000; + struct stat* reads_stat = malloc(sizeof(struct stat)); + status = fstat(reads_fid,reads_stat); + if(status != 0) + printf("could not stat reads file"); + + off_t reads_filesize = reads_stat->st_size; + free(reads_stat); + + const size_t page_size = (size_t) sysconf(_SC_PAGESIZE); + printf("page_size is %d\n",page_size); + printf("reads file is of size %d bytes\n",reads_filesize); + size_t mmapAreaSize = (reads_filesize / page_size)+1 ; void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0); if((long int) reads_area == -1) @@ -111,91 +122,106 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) { int prev_exon_stop = -1; int cur_exon_start = -1; - int gene_idx; - for (gene_idx = 0; gene_idx < numGenes;) { - struct gene* currentGene = (*allGenes)[gene_idx]; + current_line = strncpy(current_line,linePtr,256); + int gene_idx = 0; + int exon_idx = 1; + struct gene* currentGene = (*allGenes)[gene_idx]; + while(1) { + if (gene_idx == numGenes || strcmp(current_line,"") == 0) + break; - int exon_idx; - for(exon_idx=1;exon_idxnum_exons;exon_idx++) { + label: + status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); + if (status < 12) { + skippedLinesCounter++; + } - combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov); - combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds); + printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos); - prev_exon_stop = currentGene->exon_stops[exon_idx-1]; - cur_exon_start = currentGene->exon_starts[exon_idx]; + if (!(currentGene->start <= pos && pos+35 <= currentGene->stop)) { - if (cur_exon_start - prev_exon_stop < 6) + // go to next gene + if ( currentGene->stop < pos+35 ) { + gene_idx++; + exon_idx = 1; + currentGene = (*allGenes)[gene_idx]; continue; + } - ue = uo = ds = dov = 0; - - label: + // go to next line + if ( pos < currentGene->start ) { + while (*(char*)linePtr != '\n') linePtr++; + linePtr++; - current_line = strncpy(current_line,linePtr,256); - if (strcmp(current_line,"") == 0) - goto end; - - status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", - &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); - if (status < 12) { - //printf("skipped line is %s\n",current_line); - skippedLinesCounter++; + current_line = strncpy(current_line,linePtr,256); + continue; } - //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", - //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity); - //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop); - - if (currentGene->start <= pos && pos+35 <= currentGene->stop) { - //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr); - // upstream ending - if (pos+35 == prev_exon_stop) { - upstream_end[ue] = linePtr; - ue++; - } else { - - // upstream overlapping - if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { - upstream_overlap[uo] = linePtr; - uo++; - } - } + } else { - // downstream starting - if (pos == cur_exon_start) { - downstream_start[ds] = linePtr; - ds++; - } else { - - // downstream overlapping - if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { - downstream_overlap[dov] = linePtr; - dov++; - } - } + exon_label: + prev_exon_stop = currentGene->exon_stops[exon_idx-1]; + cur_exon_start = currentGene->exon_starts[exon_idx]; + printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos); + printf("exon_idx %d\n",exon_idx); + + if (exon_idx == currentGene->num_exons) + break; + + if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start+36 < pos ) { + exon_idx++; + combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov); + combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds); + ue = uo = ds = dov = 0; + goto exon_label; + } + if ( pos < prev_exon_stop - 36 ) { while (*(char*)linePtr != '\n') linePtr++; linePtr++; - goto label; + current_line = strncpy(current_line,linePtr,256); + continue; + } + + // upstream ending + if (pos+35 == prev_exon_stop) { + upstream_end[ue] = linePtr; + ue++; } else { - if (currentGene->stop < pos) { - //printf("read is too far downstream\n"); - break; + // upstream overlapping + if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { + upstream_overlap[uo] = linePtr; + uo++; } + } - //printf("read is too far upstream\n"); - while (*(char*)linePtr != '\n') linePtr++; - linePtr++; - goto label; + // downstream starting + if (pos == cur_exon_start) { + downstream_start[ds] = linePtr; + ds++; + } else { + + // downstream overlapping + if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { + downstream_overlap[dov] = linePtr; + dov++; + } } + + while (*(char*)linePtr != '\n') linePtr++; + linePtr++; + current_line = strncpy(current_line,linePtr,256); + continue; } gene_idx++; } - end: + combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov); + combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds); free(upstream_end); free(upstream_overlap); @@ -215,7 +241,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) { } void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) { - //printf("up_/down_size %d %d\n",up_size,down_size); + printf("up_/down_size %d %d\n",up_size,down_size); if (up_size == 0 || down_size == 0 || exon_stop == -1) return; @@ -296,15 +322,26 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v strncat(new_prb,down_prb+overlap,36-overlap); strncat(new_cal_prb,down_cal_prb+overlap,36-overlap); strncat(new_chastity,down_chastity+overlap,36-overlap); - - //printf("up_prb: %s\n",up_prb); - //printf("down_prb: %s\n",down_prb); - //printf("new_prb: %s\n",new_prb); - // is upstream overlapping - } else { + } + + if (down_pos == exon_start) { + int overlap = up_pos+36 - exon_stop; + //printf("overlap is %d\n",overlap); + //printf("pos are: %d %d\n",up_pos,down_pos); + strncat(new_prb,up_prb,(36-overlap)); + strncat(new_cal_prb,up_cal_prb,(36-overlap)); + strncat(new_chastity,up_chastity,(36-overlap)); + + strncat(new_prb,down_prb,overlap); + strncat(new_cal_prb,down_cal_prb,overlap); + strncat(new_chastity,down_chastity,overlap); } + + printf("up_prb: %s\n",up_prb); + printf("down_prb: %s\n",down_prb); + printf("new_prb: %s\n",new_prb); up_idx++; down_idx++; -- 2.20.1