From 6cb24b89418de6d61fbe8eee5146317e179db5d0 Mon Sep 17 00:00:00 2001 From: fabio Date: Wed, 19 Dec 2007 16:26:28 +0000 Subject: [PATCH] + parser works now with mmap and sscanf TODO + perform some logical checks -> orientation... git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7178 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- tools/data_tools/filterReads.c | 251 ++++++++++++++++++++++----------- 1 file changed, 167 insertions(+), 84 deletions(-) diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c index 6044142..9398454 100644 --- a/tools/data_tools/filterReads.c +++ b/tools/data_tools/filterReads.c @@ -17,6 +17,8 @@ int parse_gff(char* filename,FILE* fid,struct gene*** allGenes); void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes); +void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size); + static char *info = "Usage is:\n./filterReads gff reads"; int main(int argc, char* argv[]) { @@ -93,144 +95,225 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) { printf("mmapping failed!\n"); void* linePtr = reads_area; - - char *test_string = malloc(sizeof(char)*10); - strncpy(test_string,(char*)reads_area,10); - - status = munmap(reads_area,mmapAreaSize); - if(status != 0) - printf("munmap failed!\n"); - - void** upstream_end; - void** upstream_overlap; - - void** downstream_start; - void** downstream_overlap; - + char* current_line = malloc(sizeof(char)*256); + + int SIZE = 500; + // initialize boundary arrays + void** upstream_end = malloc(sizeof(void*)*SIZE); + void** upstream_overlap = malloc(sizeof(void*)*SIZE); + void** downstream_start = malloc(sizeof(void*)*SIZE); + void** downstream_overlap= malloc(sizeof(void*)*SIZE); int ue,uo,ds,dov; - - int SIZE = 50; + ue = uo = ds = dov = 0; int skippedLinesCounter = 0; - printf("Entering loop...\n"); + int prev_exon_stop = -1; + int cur_exon_start = -1; int gene_idx; for (gene_idx = 0; gene_idx < numGenes;) { struct gene* currentGene = (*allGenes)[gene_idx]; int exon_idx; - for(exon_idx=1;exon_idxnum_exons-1;exon_idx++) { - int prev_exon_stop = currentGene->exon_stops[exon_idx-1]; - int cur_exon_start = currentGene->exon_starts[exon_idx]; + for(exon_idx=1;exon_idxnum_exons;exon_idx++) { + + combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov); + combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds); + + prev_exon_stop = currentGene->exon_stops[exon_idx-1]; + cur_exon_start = currentGene->exon_starts[exon_idx]; if (cur_exon_start - prev_exon_stop < 6) continue; - // initialize boundary arrays - upstream_end = malloc(sizeof(void*)*SIZE); - upstream_overlap = malloc(sizeof(void*)*SIZE); - downstream_start = malloc(sizeof(void*)*SIZE); - downstream_overlap= malloc(sizeof(void*)*SIZE); - ue = 0; - uo = 0; - ds = 0; - dov = 0; - - printf("Fetching reads line...\n"); + ue = uo = ds = dov = 0; label: - status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + current_line = strncpy(current_line,linePtr,256); + if (strcmp(current_line,"") == 0) + goto end; + + status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); if (status < 12) { + //printf("skipped line is %s\n",current_line); skippedLinesCounter++; - continue; } - printf("read line!"); + //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity); + //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop); if (currentGene->start <= pos && pos+35 <= currentGene->stop) { - printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos); - + //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr); // upstream ending if (pos+35 == prev_exon_stop) { upstream_end[ue] = linePtr; ue++; - } + } else { - // upstream overlapping - if (prev_exon_stop - pos <= 30) { - upstream_overlap[uo] = linePtr; - uo++; + // upstream overlapping + if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { + upstream_overlap[uo] = linePtr; + uo++; + } } // downstream starting if (pos == cur_exon_start) { downstream_start[ds] = linePtr; ds++; - } + } else { - // downstream overlapping - if (cur_exon_start - pos >= 6) { - downstream_overlap[dov] = linePtr; - dov++; - } - - while (*(char*)linePtr != '\n') { - printf("linePtr is %d",linePtr); - linePtr++; + // downstream overlapping + if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { + downstream_overlap[dov] = linePtr; + dov++; + } } + while (*(char*)linePtr != '\n') linePtr++; linePtr++; goto label; - free(upstream_end); - free(upstream_overlap); - free(downstream_start); - free(downstream_overlap); - } + } else { - while (*(char*)linePtr != '\n') { - printf("linePtr is %d",linePtr); + if (currentGene->stop < pos) { + //printf("read is too far downstream\n"); + break; + } + + //printf("read is too far upstream\n"); + while (*(char*)linePtr != '\n') linePtr++; linePtr++; + goto label; } - - linePtr++; - goto label; - } gene_idx++; } + end: + + free(upstream_end); + free(upstream_overlap); + free(downstream_start); + free(downstream_overlap); + printf("skipped %d lines.\n",skippedLinesCounter); + status = munmap(reads_area,mmapAreaSize); + if(status != 0) + printf("munmap failed!\n"); + free(seq); free(prb); free(cal_prb); free(chastity); } -// int skippedLinesCounter = 0; -// while(1) { -// status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity); -// if(status == EOF) -// break; -// -// if (status < 12) { -// skippedLinesCounter++; -// continue; -// } -// -// label: -// -// if (pos < currentGene->start) -// continue; -// -// if (currentGene->stop < pos) { -// gene_idx++; -// currentGene = (*allGenes)[gene_idx]; -// goto label; -// } -// +void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) { + //printf("up_/down_size %d %d\n",up_size,down_size); + + if (up_size == 0 || down_size == 0 || exon_stop == -1) + return; + + int up_idx, down_idx, status; + char* upstream_line = malloc(sizeof(char)*256); + char* downstream_line = malloc(sizeof(char)*256); + + int buffer_size= 64; + + int up_chr = 0; + int up_pos = 0; + char* up_seq = malloc(sizeof(char)*buffer_size); + int up_id = 0; + char up_strand = ' '; + int up_mismatch = 0; + int up_repetition = 0; + int up_sz = 0; + int up_cut = 0; + char* up_prb = malloc(sizeof(char)*buffer_size); + char* up_cal_prb = malloc(sizeof(char)*buffer_size); + char* up_chastity = malloc(sizeof(char)*buffer_size); + + int down_chr = 0; + int down_pos = 0; + char* down_seq = malloc(sizeof(char)*buffer_size); + int down_id = 0; + char down_strand = ' '; + int down_mismatch = 0; + int down_repetition = 0; + int down_sz = 0; + int down_cut = 0; + char* down_prb = malloc(sizeof(char)*buffer_size); + char* down_cal_prb = malloc(sizeof(char)*buffer_size); + char* down_chastity = malloc(sizeof(char)*buffer_size); + + int new_chr = 0; + int new_pos = 0; + char* new_seq = malloc(sizeof(char)*buffer_size); + int new_id = 0; + char new_strand = ' '; + char* new_prb = malloc(sizeof(char)*buffer_size); + char* new_cal_prb = malloc(sizeof(char)*buffer_size); + char* new_chastity = malloc(sizeof(char)*buffer_size); + + up_idx=0; + down_idx=0; + while(1) { + //printf("up_/down_idx %d %d\n",up_idx,down_idx); + + if (up_idx == up_size || down_idx == down_size) + break; + + strncpy(upstream_line,upstream[up_idx],256); + status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz, + &up_cut,up_prb,up_cal_prb,up_chastity); + + strncpy(downstream_line,downstream[down_idx],256); + status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n", + &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz, + &down_cut,down_prb,down_cal_prb,down_chastity); + + new_prb[0] = '\0'; + new_cal_prb[0] = '\0'; + new_chastity[0] = '\0'; + + // is downstream overlapping + if (up_pos+35 == exon_stop) { + int overlap = exon_start - down_pos; + //printf("overlap is %d\n",overlap); + //printf("pos are: %d %d\n",up_pos,down_pos); + + strncat(new_prb,up_prb+(36-overlap),overlap); + strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap); + strncat(new_chastity,up_chastity+(36-overlap),overlap); + + strncat(new_prb,down_prb+overlap,36-overlap); + strncat(new_cal_prb,down_cal_prb+overlap,36-overlap); + strncat(new_chastity,down_chastity+overlap,36-overlap); + + //printf("up_prb: %s\n",up_prb); + //printf("down_prb: %s\n",down_prb); + //printf("new_prb: %s\n",new_prb); + + // is upstream overlapping + } else { + + } + + up_idx++; + down_idx++; + } +} + +/* + * TODO + * - Check strand + * - check repetition + * + */ -- 2.20.1