+ optimized and fixed map file parser
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 15:05:19 +0000 (15:05 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 20 Dec 2007 15:05:19 +0000 (15:05 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7199 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c
tools/data_tools/parser.c

index 9a738e5..786b3e6 100644 (file)
@@ -12,7 +12,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <math.h>
 
+#include "common.h"
 #include "datastructures.h"
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
@@ -21,8 +23,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
 
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+
 static char *info = "Usage is:\n./filterReads gff reads";
 
+const int read_size = 36;
+
 int main(int argc, char* argv[]) {
 
    if(argc != 3) {
@@ -57,9 +63,11 @@ int main(int argc, char* argv[]) {
    if(status != 0)
       printf("closing of gff filestream failed!\n");
 
+   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
+
    process_reads(reads_fs,&allGenes,numGenes);
 
-   status += fclose(reads_fs);
+   status = fclose(reads_fs);
    if(status != 0)
       printf("closing of filestreams failed!\n");
       
@@ -69,7 +77,6 @@ int main(int argc, char* argv[]) {
    return 0;
 }
 
-
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int status;
 
@@ -80,7 +87,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int id         = 0;
    char strand    = ' ';
    int mismatch   = 0;
-   int repetition = 0;
+   int occurrence = 0;
    int size       = 0;
    int cut        = 0;
    char* prb      = malloc(sizeof(char)*buffer_size);
@@ -90,20 +97,25 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int reads_fid = fileno(reads_fs);
    struct stat* reads_stat = malloc(sizeof(struct stat));
    status = fstat(reads_fid,reads_stat);
-   if(status != 0)
-      printf("could not stat reads file");
+   if(status != 0) {
+      fprintf(stderr,"could not stat reads file");
+      exit(EXIT_FAILURE);
+   }
 
    off_t reads_filesize = reads_stat->st_size;
    free(reads_stat);
 
    const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
-   printf("page_size is %d\n",page_size);
-   printf("reads file is of size %d bytes\n",reads_filesize);
    size_t mmapAreaSize = (reads_filesize / page_size)+1 ;
+   printf("Page_size is %d. Reads file is of size %d bytes\n",(int)page_size,reads_filesize);
 
-   void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
-   if((long int) reads_area == -1)
-      printf("mmapping failed!\n");
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+   if((long int) reads_area == -1) {
+      fprintf(stderr,"mmapping failed!\n");
+      exit(EXIT_FAILURE);
+   }
+
+   printf("Successfully mapped %d bytes of reads file into memory\n",reads_filesize);
 
    void* linePtr = reads_area;
    char* current_line = malloc(sizeof(char)*256);
@@ -126,50 +138,71 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int gene_idx = 0;
    int exon_idx = 1;
    struct gene* currentGene = (*allGenes)[gene_idx];
+
+   int readCtr = 0;
+   // start of the parsing loop 
    while(1) {
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
-      label:
+      if (readCtr % 10000 == 0)
+         printf("Processed %d reads.\n",readCtr);
+
+      //if (readCtr == 10000000)
+      //   exit(EXIT_FAILURE);
+
       status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+      &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
       }
 
-      printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+      //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
 
-      if (!(currentGene->start <= pos && pos+35 <= currentGene->stop)) {
+      // if the read is occurring several times elsewhere then get rid of it 
+      if(occurrence != 1) {
+         while (*(char*)linePtr != '\n') linePtr++;
+         linePtr++;
+         readCtr += 1;
+         current_line = strncpy(current_line,linePtr,256);
+         continue;
+      }
+
+      if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
-         // go to next gene
-         if ( currentGene->stop < pos+35 ) {
+         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             continue;
          }
 
-         // go to next line
-         if ( pos < currentGene->start ) {
+         if ( pos < currentGene->start ) { // go to next read
+            next_read:
+
             while (*(char*)linePtr != '\n') linePtr++;
             linePtr++;
-
+            readCtr += 1;
             current_line = strncpy(current_line,linePtr,256);
             continue;
          }
 
-      } else {
+      } else { // read IS within gene borders
 
          exon_label:
          prev_exon_stop = currentGene->exon_stops[exon_idx-1];
          cur_exon_start = currentGene->exon_starts[exon_idx];
-         printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
-         printf("exon_idx %d\n",exon_idx);
-
-         if (exon_idx == currentGene->num_exons)
-            break;
+         //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
+         //printf("exon_idx %d\n",exon_idx);
 
-         if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start+36 < pos ) {
+         if (exon_idx == currentGene->num_exons) {
+            gene_idx++;
+            exon_idx = 1;
+            currentGene = (*allGenes)[gene_idx];
+            continue;
+         }
+            
+         if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
             combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
             combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
@@ -177,47 +210,39 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
             goto exon_label;
          }
 
-         if ( pos < prev_exon_stop - 36 ) {
-            while (*(char*)linePtr != '\n') linePtr++;
-            linePtr++;
-
-            current_line = strncpy(current_line,linePtr,256);
-            continue;
+         if ( pos < prev_exon_stop - read_size + 1  ) { // go to next read
+            goto next_read;
          }
 
-         // upstream ending
-         if (pos+35 == prev_exon_stop) {
+         // if this position is reached the read is somehow overlapping or
+         // exactly on the exon boundary. now determine where exactly:
+
+         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
             upstream_end[ue] = linePtr;
             ue++;
-         } else {
+            goto next_read;
+         }
 
-            // upstream overlapping
-            if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
-               upstream_overlap[uo] = linePtr;
-               uo++;
-            }
+         if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps previous exon boundary
+            upstream_overlap[uo] = linePtr;
+            uo++;
+            goto next_read;
          }
 
-         // downstream starting
-         if (pos == cur_exon_start) {
+         if (pos == cur_exon_start) { // reads start at current exon start
             downstream_start[ds] = linePtr;
             ds++;
-         } else {
+            goto next_read;
+         }
 
-            // downstream overlapping
-            if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
-               downstream_overlap[dov] = linePtr;
-               dov++;
-            }
+         if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps previous exon boundary
+            downstream_overlap[dov] = linePtr;
+            dov++;
+            goto next_read;
          }
 
-         while (*(char*)linePtr != '\n') linePtr++;
-         linePtr++;
-         current_line = strncpy(current_line,linePtr,256);
-         continue;
+         goto next_read;
       }
-
-      gene_idx++;
    }
 
    combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
@@ -241,7 +266,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
 }
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
-   printf("up_/down_size %d %d\n",up_size,down_size);
+   //printf("up_/down_size %d %d\n",up_size,down_size);
 
    if (up_size == 0 || down_size == 0 || exon_stop == -1)
       return;
@@ -258,7 +283,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int up_id         = 0;
    char up_strand    = ' ';
    int up_mismatch   = 0;
-   int up_repetition = 0;
+   int up_occurrence = 0;
    int up_sz       = 0;
    int up_cut        = 0;
    char* up_prb      = malloc(sizeof(char)*buffer_size);
@@ -271,7 +296,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int down_id         = 0;
    char down_strand    = ' ';
    int down_mismatch   = 0;
-   int down_repetition = 0;
+   int down_occurrence = 0;
    int down_sz         = 0;
    int down_cut        = 0;
    char* down_prb      = malloc(sizeof(char)*buffer_size);
@@ -297,23 +322,27 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 
       strncpy(upstream_line,upstream[up_idx],256);
       status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
+      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
       &up_cut,up_prb,up_cal_prb,up_chastity);
       
       strncpy(downstream_line,downstream[down_idx],256);
       status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
+      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
       &down_cut,down_prb,down_cal_prb,down_chastity);
 
       new_prb[0] = '\0';
       new_cal_prb[0] = '\0';
       new_chastity[0] = '\0';
          
+      int w_size = 5;
       // is downstream overlapping
       if (up_pos+35 == exon_stop) {
          int overlap = exon_start - down_pos;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
+         
+
+         fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
 
          strncat(new_prb,up_prb+(36-overlap),overlap);
          strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
@@ -329,6 +358,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          int overlap = up_pos+36 - exon_stop;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
+         fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
 
          strncat(new_prb,up_prb,(36-overlap));
          strncat(new_cal_prb,up_cal_prb,(36-overlap));
@@ -339,18 +369,63 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_chastity,down_chastity,overlap);
       }
 
-      printf("up_prb: %s\n",up_prb);
-      printf("down_prb: %s\n",down_prb);
-      printf("new_prb: %s\n",new_prb);
+      //printf("up_prb: %s\n",up_prb);
+      //printf("down_prb: %s\n",down_prb);
+      //printf("new_prb: %s\n",new_prb);
          
       up_idx++;
       down_idx++;
    }
 }
 
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+   double epsilon_mean = 0.1;
+   double epsilon_var = 0.1;
+   double mean_up = 0;
+   double variance_up = 0;
+   double mean_down = 0;
+   double variance_down = 0;
+
+   char *up_ptr = up_prb;
+   char *down_ptr = down_prb;
+
+   int w_size = 0;
+   while(up_ptr != up_prb_end) {
+      mean_up += (*up_ptr)-50;
+      mean_down += (*down_ptr)-50;
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   mean_up /= w_size;
+   mean_down /= w_size;
+
+   //printf("means: %f %f\n",mean_up,mean_down);
+
+   up_ptr = up_prb;
+   down_ptr = down_prb;
+   w_size = 0;
+   while(up_ptr != up_prb_end) {
+      variance_up += pow((*up_prb)-50 - mean_up,2);
+      variance_down += pow((*down_prb)-50 - mean_down,2);
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   variance_up /= (w_size-1);
+   variance_down /= (w_size-1);
+
+   //printf("variances: %f %f\n",variance_up,variance_down);
+
+   if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+      return 1;
+      
+   return 0;
+}
+
 /*
  * TODO
  * - Check strand
- * - check repetition
+ * - check occurrence
  *
  */
index 2783157..7b7b431 100644 (file)
@@ -2,6 +2,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "common.h"
 #include "datastructures.h"
 
 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
@@ -20,20 +21,18 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
    // do one pass through the gff line to determine the number of
    // genes
    int numGenes = 0;
-
    int status = 0;
    while(1) {
       status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
       if(status == EOF)
          break;
 
-      if ( status > 5 && strcmp(id,"gene")==0)
+      if ( status >= 5 && strcmp(id,"gene")==0)
          numGenes++;
+
    }
    freopen(filename,"r",fid);
  
-   //printf("Found %d genes!\n",numGenes);
-
    int idx = 0;
    (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
    (*allGenes)[idx] = NULL;