+ found bug which caused ~10 reads of overlap "0"
[qpalma.git] / tools / data_tools / filterReads.c
index 9398454..bbd54b4 100644 (file)
@@ -7,39 +7,50 @@
 ////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
+#include <sys/stat.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
+#include <math.h>
 
+#include "common.h"
 #include "datastructures.h"
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
 
-static char *info = "Usage is:\n./filterReads gff reads";
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
+
+static char *info = "Usage is:\n./filterReads gff reads output";
+
+const int read_size = 36;
 
 int main(int argc, char* argv[]) {
 
-   if(argc != 3) {
+   if(argc != 4) {
       printf("%s\n",info);
       exit(EXIT_FAILURE);
    }
 
-   //static size_t page_size;
-   // page_size = (size_t) sysconf (_SC_PAGESIZE);
    int status;
    int filenameSize = 256;
    char *gff_filename = malloc(sizeof(char)*filenameSize);
    char *reads_filename = malloc(sizeof(char)*filenameSize);
+   char *output_filename = malloc(sizeof(char)*filenameSize);
 
    strncpy(gff_filename,argv[1],filenameSize);
    strncpy(reads_filename,argv[2],filenameSize);
+   strncpy(output_filename,argv[3],filenameSize);
 
    FILE *gff_fs = fopen(gff_filename,"r");
    FILE *reads_fs = fopen(reads_filename,"r");
+   FILE *out_fs = fopen(output_filename,"w");
 
    if(gff_fs == NULL) {
       printf("Error: Could not open file: %s",gff_filename);
@@ -51,17 +62,25 @@ int main(int argc, char* argv[]) {
       exit(EXIT_FAILURE);
    }
 
+   if(out_fs == NULL) {
+      printf("Error: Could not open file: %s",output_filename);
+      exit(EXIT_FAILURE);
+   }
+
    struct gene** allGenes;
    int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
    status = fclose(gff_fs);
    if(status != 0)
       printf("closing of gff filestream failed!\n");
 
-   process_reads(reads_fs,&allGenes,numGenes);
+   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
+
+   process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
-   status += fclose(reads_fs);
+   status = fclose(reads_fs);
+   status = fclose(out_fs);
    if(status != 0)
-      printf("closing of filestreams failed!\n");
+      perror("fclose");
       
    //free(allGenes);
    free(gff_filename);
@@ -69,18 +88,17 @@ int main(int argc, char* argv[]) {
    return 0;
 }
 
-
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
    int status;
 
    int buffer_size= 64;
    int chr        = 0;
    int pos        = 0;
    char* seq      = malloc(sizeof(char)*buffer_size);
-   int id         = 0;
+   unsigned long id         = 0;
    char strand    = ' ';
    int mismatch   = 0;
-   int repetition = 0;
+   int occurrence = 0;
    int size       = 0;
    int cut        = 0;
    char* prb      = malloc(sizeof(char)*buffer_size);
@@ -88,11 +106,22 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    char* chastity = malloc(sizeof(char)*buffer_size);
 
    int reads_fid = fileno(reads_fs);
-   size_t mmapAreaSize = 20000;
+   struct stat reads_stat;
+   if ( fstat(reads_fid,&reads_stat) == -1) {
+      perror("fstat");
+      exit(EXIT_FAILURE);
+   }
+   off_t reads_filesize = reads_stat.st_size;
+   printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+   int numReads = reads_filesize / 178.0;
 
-   void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
-   if((long int) reads_area == -1)
-      printf("mmapping failed!\n");
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+   if (reads_area == MAP_FAILED) {
+      perror("mmap");
+      exit(EXIT_FAILURE);
+   }
+   close(reads_fid);
+   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
 
    void* linePtr = reads_area;
    char* current_line = malloc(sizeof(char)*256);
@@ -111,100 +140,160 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    int prev_exon_stop = -1;
    int cur_exon_start = -1;
 
-   int gene_idx;
-   for (gene_idx = 0; gene_idx < numGenes;) {
-      struct gene* currentGene = (*allGenes)[gene_idx];
+   current_line = strncpy(current_line,linePtr,256);
+   int gene_idx = 0;
+   int exon_idx = 1;
+   struct gene* currentGene = (*allGenes)[gene_idx];
+
+   int readCtr = 0;
+   int old_gene_stop = -1;
+   int old_pos = 0;
+   char* posPtr;
+   int new_chr = 0;
+   int new_pos = 0;
+   char* tmp_line = malloc(sizeof(char)*256);
+   // start of the parsing loop 
+   while(1) {
+      if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+         break;
 
-      int exon_idx;
-      for(exon_idx=1;exon_idx<currentGene->num_exons;exon_idx++) {
+      if (readCtr != 0 && readCtr % 1000000 == 0)
+         printf("Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
 
-         combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
-         combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+      //if (gene_idx >= 1833)
+      //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
 
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
+      old_pos = pos;
+      status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+      if (status < 12) {
+         skippedLinesCounter++;
+      }
+      //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+      // if the read is occurring several times elsewhere then get rid of it 
+      if(occurrence != 1) {
+         while (*(char*)linePtr != '\n') linePtr++;
+         linePtr++;
+         readCtr += 1;
+         current_line = strncpy(current_line,linePtr,256);
+         continue;
+      }
+
+      if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
-         if (cur_exon_start - prev_exon_stop < 6)
+         if ( currentGene->stop < (pos + read_size-1) || currentGene->start < old_gene_stop ) { // go to next gene
+            gene_idx++;
+            exon_idx = 1;
+            old_gene_stop = currentGene->stop ;
+            currentGene = (*allGenes)[gene_idx];
+            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+            ue = uo = ds = dov = 0;
             continue;
+         }
+
+         if ( pos < currentGene->start || pos < old_pos) { // go to next read
+            next_read:
+
+            //posPtr = linePtr;
+            //while (1) {
+            //   printf("posPtr points to %c\n",*(char*)posPtr);
+            //   if ((*(char*)posPtr) == '\n') {
+            //      posPtr++;
+            //      tmp_line = strncpy(tmp_line,posPtr,256);
+
+            //      if (strcmp(tmp_line,"") == 0)
+            //         break;
+
+            //      sscanf(tmp_line,"%d\t%d\t",new_chr,new_pos);
+            //      printf("new_pos %d\n",new_pos);
+            //      if (new_pos >= currentGene->start) {
+            //         linePtr = posPtr;
+            //         break;
+            //      }
+            //   }
+            //   posPtr++;
+            //}
+            //printf("Went out!\n");
 
-         ue = uo = ds = dov = 0;
+            while (*(char*)linePtr != '\n') linePtr++;
+            linePtr++;
+            readCtr += 1;
+            current_line = strncpy(current_line,linePtr,256);
+            
+            continue;
+         }
 
-         label:
+      } else { // read IS within gene borders
 
-         current_line = strncpy(current_line,linePtr,256);
-         if (strcmp(current_line,"") == 0)
-            goto end;
+         exon_label:
 
-         status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-         &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
-         if (status < 12) {
-            //printf("skipped line is %s\n",current_line);
-            skippedLinesCounter++;
+         if (exon_idx == currentGene->num_exons) {
+            gene_idx++;
+            exon_idx = 1;
+            currentGene = (*allGenes)[gene_idx];
+            continue;
          }
 
-         //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-         //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity);
-         //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop);
-
-         if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
-            //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr);
-            // upstream ending
-            if (pos+35 == prev_exon_stop) {
-               upstream_end[ue] = linePtr;
-               ue++;
-            } else {
-
-               // upstream overlapping
-               if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
-                  upstream_overlap[uo] = linePtr;
-                  uo++;
-               }
-            }
-
-            // downstream starting
-            if (pos == cur_exon_start) {
-               downstream_start[ds] = linePtr;
-               ds++;
-            } else {
-
-               // downstream overlapping
-               if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
-                  downstream_overlap[dov] = linePtr;
-                  dov++;
-               }
-            }
+         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         cur_exon_start = currentGene->exon_starts[exon_idx];
+            
+         if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
+            exon_idx++;
+
+            if (ue != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
 
-            while (*(char*)linePtr != '\n') linePtr++;
-            linePtr++;
-            goto label;
+            if (uo != 0 && ds != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+            ue = uo = ds = dov = 0;
+            goto exon_label;
+         }
 
-         } else {
+         if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
+            goto next_read;
 
-            if (currentGene->stop < pos) {
-               //printf("read is too far downstream\n");
-               break;
-            }
+         // if this position is reached the read is somehow overlapping or
+         // exactly on the exon boundary. now determine where exactly:
+         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
+            upstream_end[ue] = linePtr;
+            ue++;
+            goto next_read;
+         }
 
-            //printf("read is too far upstream\n");
-            while (*(char*)linePtr != '\n') linePtr++;
-            linePtr++;
-            goto label;
+         if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
+            upstream_overlap[uo] = linePtr;
+            uo++;
+            goto next_read;
          }
-      }
 
-      gene_idx++;
+         if ( pos == cur_exon_start ) { // read starts at current exon start
+            downstream_start[ds] = linePtr;
+            ds++;
+            goto next_read;
+         }
+
+         if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
+            downstream_overlap[dov] = linePtr;
+            dov++;
+            goto next_read;
+         }
+
+         goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+      }
    }
 
-   end:
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
 
    free(upstream_end);
    free(upstream_overlap);
    free(downstream_start);
    free(downstream_overlap);
 
-   printf("skipped %d lines.\n",skippedLinesCounter);
+   printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
 
-   status = munmap(reads_area,mmapAreaSize);
+   status = munmap(reads_area,reads_filesize);
    if(status != 0)
       printf("munmap failed!\n");
 
@@ -214,12 +303,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
-   //printf("up_/down_size %d %d\n",up_size,down_size);
-
-   if (up_size == 0 || down_size == 0 || exon_stop == -1)
-      return;
-
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
@@ -232,7 +316,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int up_id         = 0;
    char up_strand    = ' ';
    int up_mismatch   = 0;
-   int up_repetition = 0;
+   int up_occurrence = 0;
    int up_sz       = 0;
    int up_cut        = 0;
    char* up_prb      = malloc(sizeof(char)*buffer_size);
@@ -245,7 +329,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int down_id         = 0;
    char down_strand    = ' ';
    int down_mismatch   = 0;
-   int down_repetition = 0;
+   int down_occurrence = 0;
    int down_sz         = 0;
    int down_cut        = 0;
    char* down_prb      = malloc(sizeof(char)*buffer_size);
@@ -253,9 +337,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* down_chastity = malloc(sizeof(char)*buffer_size);
 
    int new_chr        = 0;
-   int new_pos        = 0;
    char* new_seq      = malloc(sizeof(char)*buffer_size);
-   int new_id         = 0;
    char new_strand    = ' ';
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
@@ -264,56 +346,165 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    up_idx=0;
    down_idx=0;
    while(1) {
-      //printf("up_/down_idx %d %d\n",up_idx,down_idx);
-
-      if (up_idx == up_size || down_idx == down_size)
+      if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
          break;
 
       strncpy(upstream_line,upstream[up_idx],256);
       status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
+      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
       &up_cut,up_prb,up_cal_prb,up_chastity);
       
       strncpy(downstream_line,downstream[down_idx],256);
       status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
+      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
       &down_cut,down_prb,down_cal_prb,down_chastity);
 
+      char* new_up_seq = malloc(sizeof(char)*read_size);
+      char* new_down_seq = malloc(sizeof(char)*read_size); 
+
+      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
+      remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
+
+      new_seq[0] = '\0';
       new_prb[0] = '\0';
       new_cal_prb[0] = '\0';
       new_chastity[0] = '\0';
          
-      // is downstream overlapping
-      if (up_pos+35 == exon_stop) {
-         int overlap = exon_start - down_pos;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
+      int fit;
+      int w_size = 6;
+      int overlap = 0;
+      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
+         overlap = exon_start - down_pos;
+         
+         fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
+         if (fit != 1)
+            goto end;
+
+         new_chr     = up_chr;
+         new_strand  = up_strand;
 
+         strncat(new_seq,new_up_seq+(36-overlap),overlap);
          strncat(new_prb,up_prb+(36-overlap),overlap);
          strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
          strncat(new_chastity,up_chastity+(36-overlap),overlap);
 
+         strncat(new_seq,new_down_seq+overlap,36-overlap);
          strncat(new_prb,down_prb+overlap,36-overlap);
          strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
          strncat(new_chastity,down_chastity+overlap,36-overlap);
 
-         //printf("up_prb: %s\n",up_prb);
-         //printf("down_prb: %s\n",down_prb);
-         //printf("new_prb: %s\n",new_prb);
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+
+      } // merge with read which is upstream overlapping
+      
+      if (down_pos == exon_start) {
+         overlap = up_pos+36 - exon_stop;
+         //printf("overlap is %d\n",overlap);
+         //printf("pos are: %d %d\n",up_pos,down_pos);
+         fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
+         if (fit != 1)
+            goto end;
+
+         new_chr     = up_chr;
+         new_strand  = up_strand;
+
+         strncat(new_seq,new_up_seq,(36-overlap));
+         strncat(new_prb,up_prb,(36-overlap));
+         strncat(new_cal_prb,up_cal_prb,(36-overlap));
+         strncat(new_chastity,up_chastity,(36-overlap));
 
-      // is upstream overlapping
-      } else {
+         strncat(new_seq,new_down_seq,overlap);
+         strncat(new_prb,down_prb,overlap);
+         strncat(new_cal_prb,down_cal_prb,overlap);
+         strncat(new_chastity,down_chastity,overlap);
 
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
       }
-         
+
+      if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
+         printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+
+      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
+      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+    
+      end:
+
       up_idx++;
       down_idx++;
+
+      free(new_up_seq);
+      free(new_down_seq);
+   }
+}
+
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+   double epsilon_mean = 15.0;
+   double epsilon_var = 10.0;
+   double mean_up = 0;
+   double variance_up = 0;
+   double mean_down = 0;
+   double variance_down = 0;
+
+   char *up_ptr = up_prb;
+   char *down_ptr = down_prb;
+
+   int w_size = 0;
+   while(up_ptr != up_prb_end) {
+      mean_up += (*up_ptr)-50;
+      mean_down += (*down_ptr)-50;
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   mean_up /= w_size;
+   mean_down /= w_size;
+
+
+   up_ptr = up_prb;
+   down_ptr = down_prb;
+   w_size = 0;
+   while(up_ptr != up_prb_end) {
+      variance_up += pow((*up_prb)-50 - mean_up,2);
+      variance_down += pow((*down_prb)-50 - mean_down,2);
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   variance_up /= (w_size-1);
+   variance_down /= (w_size-1);
+
+   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
+
+   if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+      return 1;
+      
+   return 0;
+}
+
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
+   //printf("old seq: %s\n",old_seq);
+   //printf("new seq: %s\n",new_seq);
+
+   int idx=0;
+   int new_idx = 0;
+   while(idx<old_seq_size) {
+      if (old_seq[idx] == '[') {
+         new_seq[new_idx] = old_seq[++idx];
+         new_idx++;
+         idx += 3;
+         continue;
+      }
+
+      new_seq[new_idx] = old_seq[idx++];
+      new_idx++;
    }
+   //printf("old seq: %s\n",old_seq);
+   //printf("new seq: %s\n",new_seq);
 }
 
 /*
- * TODO
- * - Check strand
- * - check repetition
- *
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ *   remove_ambiguities (exchanges [XY] by the first entry)
  */