+ added support for writing in result file
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 16:29:37 +0000 (16:29 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 16:29:37 +0000 (16:29 +0000)
+ changed number of attributes which are stored in result file

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7200 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile
tools/data_tools/filterReads.c

index 618d9a2..1d4a721 100644 (file)
@@ -4,8 +4,10 @@ SRCS= parser.c\
 
 OBJS = $(SRCS:%.cpp=%.o)
 
+FLAGS=-O3 -Wall -Wshadow
+
 all: $(OBJS)
-       gcc -O3 -Wall -o filterReads $(OBJS) filterReads.c
+       gcc $(FLAGS) -o filterReads $(OBJS) filterReads.c
 
 clean:
        rm -rf *.o
index 786b3e6..669045e 100644 (file)
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
 
 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
 
-static char *info = "Usage is:\n./filterReads gff reads";
+static char *info = "Usage is:\n./filterReads gff reads output";
 
 const int read_size = 36;
 
 int main(int argc, char* argv[]) {
 
-   if(argc != 3) {
+   if(argc != 4) {
       printf("%s\n",info);
       exit(EXIT_FAILURE);
    }
@@ -40,12 +40,15 @@ int main(int argc, char* argv[]) {
    int filenameSize = 256;
    char *gff_filename = malloc(sizeof(char)*filenameSize);
    char *reads_filename = malloc(sizeof(char)*filenameSize);
+   char *output_filename = malloc(sizeof(char)*filenameSize);
 
    strncpy(gff_filename,argv[1],filenameSize);
    strncpy(reads_filename,argv[2],filenameSize);
+   strncpy(output_filename,argv[3],filenameSize);
 
    FILE *gff_fs = fopen(gff_filename,"r");
    FILE *reads_fs = fopen(reads_filename,"r");
+   FILE *out_fs = fopen(output_filename,"w");
 
    if(gff_fs == NULL) {
       printf("Error: Could not open file: %s",gff_filename);
@@ -57,6 +60,11 @@ int main(int argc, char* argv[]) {
       exit(EXIT_FAILURE);
    }
 
+   if(out_fs == NULL) {
+      printf("Error: Could not open file: %s",output_filename);
+      exit(EXIT_FAILURE);
+   }
+
    struct gene** allGenes;
    int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
    status = fclose(gff_fs);
@@ -65,9 +73,10 @@ int main(int argc, char* argv[]) {
 
    printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
-   process_reads(reads_fs,&allGenes,numGenes);
+   process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
    status = fclose(reads_fs);
+   status += fclose(out_fs);
    if(status != 0)
       printf("closing of filestreams failed!\n");
       
@@ -77,14 +86,14 @@ int main(int argc, char* argv[]) {
    return 0;
 }
 
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
    int status;
 
    int buffer_size= 64;
    int chr        = 0;
    int pos        = 0;
    char* seq      = malloc(sizeof(char)*buffer_size);
-   int id         = 0;
+   unsigned long id         = 0;
    char strand    = ' ';
    int mismatch   = 0;
    int occurrence = 0;
@@ -95,27 +104,23 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    char* chastity = malloc(sizeof(char)*buffer_size);
 
    int reads_fid = fileno(reads_fs);
-   struct stat* reads_stat = malloc(sizeof(struct stat));
-   status = fstat(reads_fid,reads_stat);
-   if(status != 0) {
-      fprintf(stderr,"could not stat reads file");
+   struct stat reads_stat;
+   if ( fstat(reads_fid,&reads_stat) == -1) {
+      perror("fstat");
       exit(EXIT_FAILURE);
    }
+   off_t reads_filesize = reads_stat.st_size;
 
-   off_t reads_filesize = reads_stat->st_size;
-   free(reads_stat);
-
-   const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
-   size_t mmapAreaSize = (reads_filesize / page_size)+1 ;
-   printf("Page_size is %d. Reads file is of size %d bytes\n",(int)page_size,reads_filesize);
+   printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
 
    void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
-   if((long int) reads_area == -1) {
-      fprintf(stderr,"mmapping failed!\n");
+   if (reads_area == MAP_FAILED) {
+      perror("mmap");
       exit(EXIT_FAILURE);
    }
-
-   printf("Successfully mapped %d bytes of reads file into memory\n",reads_filesize);
+   close(reads_fid);
+   
+   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
 
    void* linePtr = reads_area;
    char* current_line = malloc(sizeof(char)*256);
@@ -145,13 +150,13 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
-      if (readCtr % 10000 == 0)
+      if (readCtr != 0 && readCtr % 100000 == 0)
          printf("Processed %d reads.\n",readCtr);
 
       //if (readCtr == 10000000)
       //   exit(EXIT_FAILURE);
 
-      status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
@@ -204,8 +209,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
             
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
-            combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
-            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+            combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
             ue = uo = ds = dov = 0;
             goto exon_label;
          }
@@ -223,19 +228,19 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
             goto next_read;
          }
 
-         if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps previous exon boundary
+         if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
             upstream_overlap[uo] = linePtr;
             uo++;
             goto next_read;
          }
 
-         if (pos == cur_exon_start) { // reads start at current exon start
+         if (pos == cur_exon_start) { // read starts at current exon start
             downstream_start[ds] = linePtr;
             ds++;
             goto next_read;
          }
 
-         if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps previous exon boundary
+         if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
             downstream_overlap[dov] = linePtr;
             dov++;
             goto next_read;
@@ -245,17 +250,17 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
 
    free(upstream_end);
    free(upstream_overlap);
    free(downstream_start);
    free(downstream_overlap);
 
-   printf("skipped %d lines.\n",skippedLinesCounter);
+   printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
 
-   status = munmap(reads_area,mmapAreaSize);
+   status = munmap(reads_area,reads_filesize);
    if(status != 0)
       printf("munmap failed!\n");
 
@@ -265,7 +270,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
    //printf("up_/down_size %d %d\n",up_size,down_size);
 
    if (up_size == 0 || down_size == 0 || exon_stop == -1)
@@ -304,9 +309,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* down_chastity = malloc(sizeof(char)*buffer_size);
 
    int new_chr        = 0;
-   int new_pos        = 0;
    char* new_seq      = malloc(sizeof(char)*buffer_size);
-   int new_id         = 0;
    char new_strand    = ' ';
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
@@ -334,53 +337,68 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
       new_cal_prb[0] = '\0';
       new_chastity[0] = '\0';
          
+      int fit;
       int w_size = 5;
-      // is downstream overlapping
-      if (up_pos+35 == exon_stop) {
+      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
          int overlap = exon_start - down_pos;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
          
+         fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
+         if (fit != 1)
+            goto end;
 
-         fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
+         new_chr     = up_chr;
+         new_strand  = up_strand;
 
+         strncat(new_seq,up_seq+(36-overlap),overlap);
          strncat(new_prb,up_prb+(36-overlap),overlap);
          strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
          strncat(new_chastity,up_chastity+(36-overlap),overlap);
 
+         strncat(new_seq,down_seq+overlap,36-overlap);
          strncat(new_prb,down_prb+overlap,36-overlap);
          strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
          strncat(new_chastity,down_chastity+overlap,36-overlap);
-      // is upstream overlapping
-      }
+
+      } // merge with read which is upstream overlapping
       
       if (down_pos == exon_start) {
          int overlap = up_pos+36 - exon_stop;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
-         fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
+         fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
+         if (fit != 1)
+            goto end;
+
+         new_chr     = up_chr;
+         new_strand  = up_strand;
 
+         strncat(new_seq,up_seq,(36-overlap));
          strncat(new_prb,up_prb,(36-overlap));
          strncat(new_cal_prb,up_cal_prb,(36-overlap));
          strncat(new_chastity,up_chastity,(36-overlap));
 
+         strncat(new_seq,down_seq,overlap);
          strncat(new_prb,down_prb,overlap);
          strncat(new_cal_prb,down_cal_prb,overlap);
          strncat(new_chastity,down_chastity,overlap);
       }
 
-      //printf("up_prb: %s\n",up_prb);
-      //printf("down_prb: %s\n",down_prb);
-      //printf("new_prb: %s\n",new_prb);
-         
+      //printf("New entry!\n");
+      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
+      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+    
+      end:
+
       up_idx++;
       down_idx++;
    }
 }
 
 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
-   double epsilon_mean = 0.1;
-   double epsilon_var = 0.1;
+   double epsilon_mean = 10.0;
+   double epsilon_var = 10.0;
    double mean_up = 0;
    double variance_up = 0;
    double mean_down = 0;
@@ -426,6 +444,5 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
 /*
  * TODO
  * - Check strand
- * - check occurrence
- *
+ * - check for [AC] and similar entries
  */