+ added processing function for reads
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 19 Dec 2007 11:43:54 +0000 (11:43 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 19 Dec 2007 11:43:54 +0000 (11:43 +0000)
TODO
- figure out how to use sscanf together with mmap

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7170 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile
tools/data_tools/datastructures.c
tools/data_tools/filterReads.c
tools/data_tools/parser.c

index 71b500f..618d9a2 100644 (file)
@@ -2,8 +2,10 @@
 SRCS= parser.c\
        datastructures.c
 
 SRCS= parser.c\
        datastructures.c
 
-                         
 OBJS = $(SRCS:%.cpp=%.o)
 
 all: $(OBJS)
 OBJS = $(SRCS:%.cpp=%.o)
 
 all: $(OBJS)
-       gcc -Wall -o filterReads $(OBJS) filterReads.c
+       gcc -O3 -Wall -o filterReads $(OBJS) filterReads.c
+
+clean:
+       rm -rf *.o
index 2f464c4..dd9e48e 100644 (file)
@@ -9,8 +9,6 @@ struct gene* gene_alloc(void) {
    newGene->exon_stops = malloc(2*sizeof(int));
    newGene->num_exons = 0;
    newGene->max_exons = 2;
    newGene->exon_stops = malloc(2*sizeof(int));
    newGene->num_exons = 0;
    newGene->max_exons = 2;
-   newGene->start = -1;
-
    return newGene;
 }
 
    return newGene;
 }
 
@@ -20,14 +18,15 @@ void free_gene(struct gene* oldGene) {
 }
 
 void add_exon(struct gene* currentGene,int start, int stop) {
 }
 
 void add_exon(struct gene* currentGene,int start, int stop) {
+   int idx = currentGene->num_exons;
 
 
-   if (currentGene->num_exons < currentGene->num_exons) {
-      int idx = currentGene->num_exons;
-      currentGene->exon_starts[idx] = start;
-      currentGene->exon_stops[idx] = stop;
-      currentGene->num_exons++;
-   } else {
+   if ( idx >= currentGene->max_exons) {
       currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
       currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
       currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
       currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
+      currentGene->max_exons *= 2;
    }
    }
+
+   currentGene->exon_starts[idx] = start;
+   currentGene->exon_stops[idx] = stop;
+   currentGene->num_exons++;
 }
 }
index 4e7ec9f..6044142 100644 (file)
@@ -4,7 +4,6 @@
 //
 //
 //
 //
 //
 //
-//
 ////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
 ////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
@@ -14,7 +13,9 @@
 
 #include "datastructures.h"
 
 
 #include "datastructures.h"
 
-void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
 
 static char *info = "Usage is:\n./filterReads gff reads";
 
 
 static char *info = "Usage is:\n./filterReads gff reads";
 
@@ -27,6 +28,7 @@ int main(int argc, char* argv[]) {
 
    //static size_t page_size;
    // page_size = (size_t) sysconf (_SC_PAGESIZE);
 
    //static size_t page_size;
    // page_size = (size_t) sysconf (_SC_PAGESIZE);
+   int status;
    int filenameSize = 256;
    char *gff_filename = malloc(sizeof(char)*filenameSize);
    char *reads_filename = malloc(sizeof(char)*filenameSize);
    int filenameSize = 256;
    char *gff_filename = malloc(sizeof(char)*filenameSize);
    char *reads_filename = malloc(sizeof(char)*filenameSize);
@@ -47,34 +49,188 @@ int main(int argc, char* argv[]) {
       exit(EXIT_FAILURE);
    }
 
       exit(EXIT_FAILURE);
    }
 
-   struct gene **allGenes;
-   parse_gff(gff_filename,gff_fs,allGenes);
+   struct gene** allGenes;
+   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+   status = fclose(gff_fs);
+   if(status != 0)
+      printf("closing of gff filestream failed!\n");
 
 
+   process_reads(reads_fs,&allGenes,numGenes);
+
+   status += fclose(reads_fs);
+   if(status != 0)
+      printf("closing of filestreams failed!\n");
+      
+   //free(allGenes);
+   free(gff_filename);
+   free(reads_filename);
+   return 0;
+}
+
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
+   int status;
+
+   int buffer_size= 64;
+   int chr        = 0;
+   int pos        = 0;
+   char* seq      = malloc(sizeof(char)*buffer_size);
+   int id         = 0;
+   char strand    = ' ';
+   int mismatch   = 0;
+   int repetition = 0;
+   int size       = 0;
+   int cut        = 0;
+   char* prb      = malloc(sizeof(char)*buffer_size);
+   char* cal_prb  = malloc(sizeof(char)*buffer_size);
+   char* chastity = malloc(sizeof(char)*buffer_size);
 
    int reads_fid = fileno(reads_fs);
 
    int reads_fid = fileno(reads_fs);
-   size_t mmapAreaSize = 16;
+   size_t mmapAreaSize = 20000;
+
    void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
    if((long int) reads_area == -1)
       printf("mmapping failed!\n");
 
    void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
    if((long int) reads_area == -1)
       printf("mmapping failed!\n");
 
+   void* linePtr = reads_area;
+
    char *test_string = malloc(sizeof(char)*10);
    strncpy(test_string,(char*)reads_area,10);
    char *test_string = malloc(sizeof(char)*10);
    strncpy(test_string,(char*)reads_area,10);
-   //printf("Reads area at %s\n",test_string);
    
    
-   int status = munmap(reads_area,mmapAreaSize);
+   status = munmap(reads_area,mmapAreaSize);
    if(status != 0)
       printf("munmap failed!\n");
 
    if(status != 0)
       printf("munmap failed!\n");
 
-   status = fclose(gff_fs);
-   status += fclose(reads_fs);
-   if(status != 0)
-      printf("closing of filestreams failed!\n");
+   void** upstream_end;
+   void** upstream_overlap;
+
+   void** downstream_start;
+   void** downstream_overlap;
+   
+   int ue,uo,ds,dov;
       
       
-   printf("sizeof(struct gene) %d\n",sizeof(struct gene));
-   printf("sizeof(struct read) %d\n",sizeof(struct read));
+   int SIZE = 50;
+
+   int skippedLinesCounter = 0;
+
+   printf("Entering loop...\n");
+
+   int gene_idx;
+   for (gene_idx = 0; gene_idx < numGenes;) {
+      struct gene* currentGene = (*allGenes)[gene_idx];
+
+      int exon_idx;
+      for(exon_idx=1;exon_idx<currentGene->num_exons-1;exon_idx++) {
+         int prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         int cur_exon_start = currentGene->exon_starts[exon_idx];
+
+         if (cur_exon_start - prev_exon_stop < 6)
+            continue;
+
+         // initialize boundary arrays
+         upstream_end      = malloc(sizeof(void*)*SIZE);
+         upstream_overlap  = malloc(sizeof(void*)*SIZE);
+         downstream_start  = malloc(sizeof(void*)*SIZE);
+         downstream_overlap= malloc(sizeof(void*)*SIZE);
+         ue = 0;
+         uo = 0;
+         ds = 0;
+         dov = 0;
+
+         printf("Fetching reads line...\n");
+
+         label:
+
+         status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+         if (status < 12) {
+            skippedLinesCounter++;
+            continue;
+         }
+
+         printf("read line!");
+
+         if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
+            printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+
+            // upstream ending
+            if (pos+35 == prev_exon_stop) {
+               upstream_end[ue] = linePtr;
+               ue++;
+            }
+
+            // upstream overlapping
+            if (prev_exon_stop - pos <= 30) {
+               upstream_overlap[uo] = linePtr;
+               uo++;
+            }
+
+            // downstream starting
+            if (pos == cur_exon_start) {
+               downstream_start[ds] = linePtr;
+               ds++;
+            }
+
+            // downstream overlapping
+            if (cur_exon_start - pos >= 6) {
+               downstream_overlap[dov] = linePtr;
+               dov++;
+            }
+
+            while (*(char*)linePtr != '\n') {
+               printf("linePtr is %d",linePtr);
+               linePtr++;
+            }
+
+            linePtr++;
+            goto label;
+
+            free(upstream_end);
+            free(upstream_overlap);
+            free(downstream_start);
+            free(downstream_overlap);
+         }
+
+         while (*(char*)linePtr != '\n') {
+            printf("linePtr is %d",linePtr);
+            linePtr++;
+         }
+
+         linePtr++;
+         goto label;
+         
+      }
+
+      gene_idx++;
+   }
 
 
+   printf("skipped %d lines.\n",skippedLinesCounter);
 
 
-   free(gff_filename);
-   free(reads_filename);
-   return 0;
+   free(seq);
+   free(prb);
+   free(cal_prb);
+   free(chastity);
 }
 }
+
+//      int skippedLinesCounter = 0;
+//      while(1) {
+//         status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+//         if(status == EOF)
+//            break;
+//
+//         if (status < 12) {
+//            skippedLinesCounter++;
+//            continue;
+//         }
+//
+//         label:
+//
+//         if (pos < currentGene->start)
+//            continue;
+//
+//         if (currentGene->stop < pos) {
+//            gene_idx++;
+//            currentGene = (*allGenes)[gene_idx];
+//            goto label;
+//         }
+//
index e686cf3..2783157 100644 (file)
@@ -4,7 +4,7 @@
 
 #include "datastructures.h"
 
 
 #include "datastructures.h"
 
-void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
+int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
 
    int buffer_size = 256;
    char* chr   = malloc(sizeof(char)*buffer_size);
 
    int buffer_size = 256;
    char* chr   = malloc(sizeof(char)*buffer_size);
@@ -32,13 +32,13 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
    }
    freopen(filename,"r",fid);
  
    }
    freopen(filename,"r",fid);
  
-   printf("Found %d genes!\n",numGenes);
+   //printf("Found %d genes!\n",numGenes);
 
 
-   allGenes = malloc(sizeof(struct gene)*numGenes);
-   struct gene *currentGene = gene_alloc();
+   int idx = 0;
+   (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
+   (*allGenes)[idx] = NULL;
 
    int skippedLinesCounter = 0;
 
    int skippedLinesCounter = 0;
-   int idx = 0;
    while(1) {
       status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
       if(status == EOF)
    while(1) {
       status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
       if(status == EOF)
@@ -50,35 +50,36 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
       }
 
       if (strcmp(id,"gene")==0) {
       }
 
       if (strcmp(id,"gene")==0) {
-         if ( currentGene->start != -1)
-            allGenes[idx] = currentGene;
+         if ( (*allGenes)[idx] !=NULL )
             idx++;
        
             idx++;
        
-         currentGene = gene_alloc();
-         currentGene->start = start;
-         currentGene->stop = stop;
-         currentGene->strand = (*strand);
+         (*allGenes)[idx] = gene_alloc();
+         (*allGenes)[idx]->start = start;
+         (*allGenes)[idx]->stop = stop;
+         (*allGenes)[idx]->strand = (*strand);
          //printf("gene start/stop: %d/%d\n",start,stop);
          continue;
       }
 
       if (strcmp(id,"exon")==0) {
          //printf("gene start/stop: %d/%d\n",start,stop);
          continue;
       }
 
       if (strcmp(id,"exon")==0) {
-         add_exon(currentGene,start,stop);
+         add_exon((*allGenes)[idx],start,stop);
          //printf("exon start/stop: %d/%d\n",start,stop);
          continue;
       }
 
       if (strcmp(id,"pseudogene")==0) {
          //printf("exon start/stop: %d/%d\n",start,stop);
          continue;
       }
 
       if (strcmp(id,"pseudogene")==0) {
-         if ( currentGene->start != -1)
-            allGenes[idx] = currentGene;
+         if ( (*allGenes)[idx] !=NULL )
             idx++;
       }
    }
 
             idx++;
       }
    }
 
-   if ( currentGene->start != -1)
-      allGenes[idx] = currentGene;
+   if ( (*allGenes)[idx] !=NULL )
       idx++;
 
       idx++;
 
+   //printf("allGenes[0] is %d\n",(*allGenes)[0]);
+   //printf("allGenes[1] is %d\n",(*allGenes)[1]);
+   //printf("Skipped %d lines.\n",skippedLinesCounter);
+
    free(chr);
    free(blah);
    free(id);
    free(chr);
    free(blah);
    free(id);
@@ -86,4 +87,7 @@ void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
    free(xy);
    free(strand);
    free(xz);
    free(xy);
    free(strand);
    free(xz);
+
+   return numGenes;
 }
 }
+