+ compacted parser code
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 08:18:56 +0000 (08:18 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 08:18:56 +0000 (08:18 +0000)
+ found conceptual error

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7191 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c

index 9398454..9a738e5 100644 (file)
@@ -7,9 +7,11 @@
 ////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
+#include <sys/stat.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 #include "datastructures.h"
 
@@ -28,8 +30,6 @@ int main(int argc, char* argv[]) {
       exit(EXIT_FAILURE);
    }
 
-   //static size_t page_size;
-   // page_size = (size_t) sysconf (_SC_PAGESIZE);
    int status;
    int filenameSize = 256;
    char *gff_filename = malloc(sizeof(char)*filenameSize);
@@ -88,7 +88,18 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    char* chastity = malloc(sizeof(char)*buffer_size);
 
    int reads_fid = fileno(reads_fs);
-   size_t mmapAreaSize = 20000;
+   struct stat* reads_stat = malloc(sizeof(struct stat));
+   status = fstat(reads_fid,reads_stat);
+   if(status != 0)
+      printf("could not stat reads file");
+
+   off_t reads_filesize = reads_stat->st_size;
+   free(reads_stat);
+
+   const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
+   printf("page_size is %d\n",page_size);
+   printf("reads file is of size %d bytes\n",reads_filesize);
+   size_t mmapAreaSize = (reads_filesize / page_size)+1 ;
 
    void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
    if((long int) reads_area == -1)
@@ -111,91 +122,106 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int prev_exon_stop = -1;
    int cur_exon_start = -1;
 
-   int gene_idx;
-   for (gene_idx = 0; gene_idx < numGenes;) {
-      struct gene* currentGene = (*allGenes)[gene_idx];
+   current_line = strncpy(current_line,linePtr,256);
+   int gene_idx = 0;
+   int exon_idx = 1;
+   struct gene* currentGene = (*allGenes)[gene_idx];
+   while(1) {
+      if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+         break;
 
-      int exon_idx;
-      for(exon_idx=1;exon_idx<currentGene->num_exons;exon_idx++) {
+      label:
+      status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+      if (status < 12) {
+         skippedLinesCounter++;
+      }
 
-         combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
-         combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+      printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
 
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
+      if (!(currentGene->start <= pos && pos+35 <= currentGene->stop)) {
 
-         if (cur_exon_start - prev_exon_stop < 6)
+         // go to next gene
+         if ( currentGene->stop < pos+35 ) {
+            gene_idx++;
+            exon_idx = 1;
+            currentGene = (*allGenes)[gene_idx];
             continue;
+         }
 
-         ue = uo = ds = dov = 0;
-
-         label:
+         // go to next line
+         if ( pos < currentGene->start ) {
+            while (*(char*)linePtr != '\n') linePtr++;
+            linePtr++;
 
-         current_line = strncpy(current_line,linePtr,256);
-         if (strcmp(current_line,"") == 0)
-            goto end;
-
-         status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-         &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
-         if (status < 12) {
-            //printf("skipped line is %s\n",current_line);
-            skippedLinesCounter++;
+            current_line = strncpy(current_line,linePtr,256);
+            continue;
          }
 
-         //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-         //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity);
-         //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop);
-
-         if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
-            //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr);
-            // upstream ending
-            if (pos+35 == prev_exon_stop) {
-               upstream_end[ue] = linePtr;
-               ue++;
-            } else {
-
-               // upstream overlapping
-               if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
-                  upstream_overlap[uo] = linePtr;
-                  uo++;
-               }
-            }
+      } else {
 
-            // downstream starting
-            if (pos == cur_exon_start) {
-               downstream_start[ds] = linePtr;
-               ds++;
-            } else {
-
-               // downstream overlapping
-               if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
-                  downstream_overlap[dov] = linePtr;
-                  dov++;
-               }
-            }
+         exon_label:
+         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         cur_exon_start = currentGene->exon_starts[exon_idx];
+         printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
+         printf("exon_idx %d\n",exon_idx);
+
+         if (exon_idx == currentGene->num_exons)
+            break;
+
+         if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start+36 < pos ) {
+            exon_idx++;
+            combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
+            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+            ue = uo = ds = dov = 0;
+            goto exon_label;
+         }
 
+         if ( pos < prev_exon_stop - 36 ) {
             while (*(char*)linePtr != '\n') linePtr++;
             linePtr++;
-            goto label;
 
+            current_line = strncpy(current_line,linePtr,256);
+            continue;
+         }
+
+         // upstream ending
+         if (pos+35 == prev_exon_stop) {
+            upstream_end[ue] = linePtr;
+            ue++;
          } else {
 
-            if (currentGene->stop < pos) {
-               //printf("read is too far downstream\n");
-               break;
+            // upstream overlapping
+            if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
+               upstream_overlap[uo] = linePtr;
+               uo++;
             }
+         }
 
-            //printf("read is too far upstream\n");
-            while (*(char*)linePtr != '\n') linePtr++;
-            linePtr++;
-            goto label;
+         // downstream starting
+         if (pos == cur_exon_start) {
+            downstream_start[ds] = linePtr;
+            ds++;
+         } else {
+
+            // downstream overlapping
+            if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
+               downstream_overlap[dov] = linePtr;
+               dov++;
+            }
          }
+
+         while (*(char*)linePtr != '\n') linePtr++;
+         linePtr++;
+         current_line = strncpy(current_line,linePtr,256);
+         continue;
       }
 
       gene_idx++;
    }
 
-   end:
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
 
    free(upstream_end);
    free(upstream_overlap);
@@ -215,7 +241,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
 }
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
-   //printf("up_/down_size %d %d\n",up_size,down_size);
+   printf("up_/down_size %d %d\n",up_size,down_size);
 
    if (up_size == 0 || down_size == 0 || exon_stop == -1)
       return;
@@ -296,15 +322,26 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_prb,down_prb+overlap,36-overlap);
          strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
          strncat(new_chastity,down_chastity+overlap,36-overlap);
-
-         //printf("up_prb: %s\n",up_prb);
-         //printf("down_prb: %s\n",down_prb);
-         //printf("new_prb: %s\n",new_prb);
-
       // is upstream overlapping
-      } else {
+      }
+      
+      if (down_pos == exon_start) {
+         int overlap = up_pos+36 - exon_stop;
+         //printf("overlap is %d\n",overlap);
+         //printf("pos are: %d %d\n",up_pos,down_pos);
 
+         strncat(new_prb,up_prb,(36-overlap));
+         strncat(new_cal_prb,up_cal_prb,(36-overlap));
+         strncat(new_chastity,up_chastity,(36-overlap));
+
+         strncat(new_prb,down_prb,overlap);
+         strncat(new_cal_prb,down_cal_prb,overlap);
+         strncat(new_chastity,down_chastity,overlap);
       }
+
+      printf("up_prb: %s\n",up_prb);
+      printf("down_prb: %s\n",down_prb);
+      printf("new_prb: %s\n",new_prb);
          
       up_idx++;
       down_idx++;