+ restored old filtering functionality -> reads have to start/stop at exon boundaries
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jan 2008 11:44:33 +0000 (11:44 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jan 2008 11:44:33 +0000 (11:44 +0000)
+ added regex functionality to extract gene ids from gff files

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7265 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/datastructures.c
tools/data_tools/datastructures.h
tools/data_tools/filterReads.c
tools/data_tools/parser.c

index dd9e48e..2f6e6fd 100644 (file)
@@ -9,12 +9,16 @@ struct gene* gene_alloc(void) {
    newGene->exon_stops = malloc(2*sizeof(int));
    newGene->num_exons = 0;
    newGene->max_exons = 2;
+   newGene->id = 0;
    return newGene;
 }
 
 void free_gene(struct gene* oldGene) {
    free(oldGene->exon_starts);
    free(oldGene->exon_stops);
+   if( oldGene->id != 0) {
+      free(oldGene->id);
+   }
 }
 
 void add_exon(struct gene* currentGene,int start, int stop) {
index 64c48c1..4b295f3 100644 (file)
@@ -9,6 +9,7 @@ struct gene {
    int num_exons;
    int max_exons;
    char strand;
+   char* id;
 };
 
 struct read {
index 1204315..98a3e2b 100644 (file)
@@ -27,8 +27,9 @@ void sort_genes(struct gene*** allGenes, int numGenes);
 
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id);
 
+//int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
 int fitting(char* up_prb, char* down_prb);
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
@@ -38,9 +39,7 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 const int read_size = 36;
 
 int combined_reads = 0;
-
 int main(int argc, char* argv[]) {
-   combined_reads = 0;
 
    if(argc != 4) {
       printf("%s\n",info);
@@ -151,10 +150,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    int SIZE = 500;
    // initialize boundary arrays
+   void** upstream_end      = malloc(sizeof(void*)*SIZE);
    void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
+   void** downstream_start  = malloc(sizeof(void*)*SIZE);
    void** downstream_overlap= malloc(sizeof(void*)*SIZE);
-   int uo,dov;
-   uo = dov = 0;
+   int ue,uo,ds,dov;
+   ue = uo = ds = dov = 0;
 
    int skippedLinesCounter = 0;
 
@@ -166,6 +167,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int gene_idx = 0;
    int exon_idx = 1;
    struct gene* currentGene = (*allGenes)[gene_idx];
+   char* gene_id = currentGene->id;
 
    char* disamb_seq = malloc(sizeof(char)*read_size);
 
@@ -211,7 +213,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            uo = dov = 0;
+            gene_id = currentGene->id;
+            ue = uo = ds = dov = 0;
             continue;
          }
 
@@ -247,26 +250,39 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
 
-            if (uo != 0 && dov != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
+            if (ue != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+
+            if (uo != 0 && ds != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
 
-            uo = dov = 0;
+            ue = uo = ds = dov = 0;
             goto exon_label;
          }
 
          if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
             exonicReadCtr++;
-            remove_ambiguities(seq,strlen(seq),disamb_seq);
-            fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
+
+   // Removed exonic reads
+
+            //remove_ambiguities(seq,strlen(seq),disamb_seq);
+            //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
+
             goto next_read;
          }
 
          if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
             goto next_read;
 
-         // if this position is reached the read is somehow overlapping at 
-         // the exon boundary. now determine where exactly:
-         
+         // if this position is reached the read is somehow overlapping or
+         // exactly on the exon boundary. now determine where exactly:
+         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
+            endPrevCtr++;
+            upstream_end[ue] = linePtr;
+            ue++;
+            goto next_read;
+         }
+
          if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
             prevOverlapCtr++;
             upstream_overlap[uo] = linePtr;
@@ -274,6 +290,13 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto next_read;
          }
 
+         if ( pos == cur_exon_start ) { // read starts at current exon start
+            currentStartCtr++;
+            downstream_start[ds] = linePtr;
+            ds++;
+            goto next_read;
+         }
+
          if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
             currentOverlapCtr++;
             downstream_overlap[dov] = linePtr;
@@ -286,9 +309,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
 
+   free(upstream_end);
    free(upstream_overlap);
+   free(downstream_start);
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
@@ -310,11 +336,11 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
+/*
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
    //printf("up/down size is %d/%d\n",up_size,down_size);
 
    int up_idx, down_idx, status;
-
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
 
@@ -352,7 +378,160 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
    char* new_chastity   = malloc(sizeof(char)*buffer_size);
+   char* new_up_seq = malloc(sizeof(char)*read_size);
+   char* new_down_seq = malloc(sizeof(char)*read_size); 
 
+   up_idx=0;
+   down_idx=0;
+   while(1) {
+      if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
+         break;
+
+      strncpy(upstream_line,upstream[up_idx],256);
+      status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
+      &up_cut,up_prb,up_cal_prb,up_chastity);
+      
+      strncpy(downstream_line,downstream[down_idx],256);
+      status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
+      &down_cut,down_prb,down_cal_prb,down_chastity);
+
+      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
+      remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
+
+      new_seq[0] = '\0';
+      new_prb[0] = '\0';
+      new_cal_prb[0] = '\0';
+      new_chastity[0] = '\0';
+         
+      int fit;
+      int w_size = 6;
+      int overlap = 0;
+      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
+         overlap = exon_start - down_pos;
+         
+         //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
+         //if (fit != 1)
+         //   goto end;
+
+         new_chr     = up_chr;
+         new_strand  = up_strand;
+
+         strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
+         strncat(new_prb,up_prb+(read_size-overlap),overlap);
+         strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
+         strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
+
+         strncat(new_seq,new_down_seq+overlap,read_size-overlap);
+         strncat(new_prb,down_prb+overlap,read_size-overlap);
+         strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
+         strncat(new_chastity,down_chastity+overlap,read_size-overlap);
+
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+
+      } // merge with read which is upstream overlapping
+      
+      if (down_pos == exon_start) {
+         overlap = up_pos+read_size - exon_stop;
+         //printf("overlap is %d\n",overlap);
+         //printf("pos are: %d %d\n",up_pos,down_pos);
+         
+         fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
+         if (fit == -1)
+
+         //   goto end;
+
+         new_chr     = up_chr;
+         new_strand  = up_strand;
+
+         strncat(new_seq,new_up_seq,(read_size-overlap));
+         strncat(new_prb,up_prb,(read_size-overlap));
+         strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
+         strncat(new_chastity,up_chastity,(read_size-overlap));
+
+         strncat(new_seq,new_down_seq,overlap);
+         strncat(new_prb,down_prb,overlap);
+         strncat(new_cal_prb,down_cal_prb,overlap);
+         strncat(new_chastity,down_chastity,overlap);
+
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+      }
+
+      if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
+         printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+
+      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
+      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+    
+      end:
+
+      up_idx++;
+      down_idx++;
+   }
+
+   free(upstream_line);
+   free(downstream_line);
+  
+   free(new_up_seq);
+   free(new_down_seq); 
+
+   free(up_prb);
+   free(up_cal_prb);
+   free(up_chastity);
+
+   free(down_prb);
+   free(down_cal_prb);
+   free(down_chastity);
+
+   free(new_prb);
+   free(new_cal_prb);
+   free(new_chastity);
+
+}
+*/ 
+
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id) {
+   //printf("up/down size is %d/%d\n",up_size,down_size);
+
+   int up_idx, down_idx, status;
+   char* upstream_line = malloc(sizeof(char)*256);
+   char* downstream_line = malloc(sizeof(char)*256);
+
+   int buffer_size= 64;
+
+   int up_chr        = 0;
+   int up_pos        = 0;
+   char* up_seq      = malloc(sizeof(char)*buffer_size);
+   int up_id         = 0;
+   char up_strand    = ' ';
+   int up_mismatch   = 0;
+   int up_occurrence = 0;
+   int up_sz       = 0;
+   int up_cut        = 0;
+   char* up_prb      = malloc(sizeof(char)*buffer_size);
+   char* up_cal_prb  = malloc(sizeof(char)*buffer_size);
+   char* up_chastity = malloc(sizeof(char)*buffer_size);
+
+   int down_chr        = 0;
+   int down_pos        = 0;
+   char* down_seq      = malloc(sizeof(char)*buffer_size);
+   int down_id         = 0;
+   char down_strand    = ' ';
+   int down_mismatch   = 0;
+   int down_occurrence = 0;
+   int down_sz         = 0;
+   int down_cut        = 0;
+   char* down_prb      = malloc(sizeof(char)*buffer_size);
+   char* down_cal_prb  = malloc(sizeof(char)*buffer_size);
+   char* down_chastity = malloc(sizeof(char)*buffer_size);
+
+   int new_chr        = 0;
+   char* new_seq      = malloc(sizeof(char)*buffer_size);
+   char new_strand    = ' ';
+   char* new_prb        = malloc(sizeof(char)*buffer_size);
+   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
+   char* new_chastity   = malloc(sizeof(char)*buffer_size);
    char* new_up_seq = malloc(sizeof(char)*read_size);
    char* new_down_seq = malloc(sizeof(char)*read_size); 
 
@@ -386,6 +565,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_prb[0] = '\0';
          new_cal_prb[0] = '\0';
          new_chastity[0] = '\0';
+         int splitpos = 0;
          
          fit = fitting(up_prb+(36-overlap),down_prb);
          if (fit == -1)
@@ -407,8 +587,8 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
          strncat(new_chastity,down_chastity+fit,read_size-overlap);
 
-         fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
-         new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
+         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id);
 
          combined_reads++;
          used_flag[down_idx] = 1;
@@ -437,6 +617,55 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    free(new_chastity);
 }
 
+
+
+
+/*
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+   double epsilon_mean = 15.0;
+   double epsilon_var = 10.0;
+   double mean_up = 0;
+   double variance_up = 0;
+   double mean_down = 0;
+   double variance_down = 0;
+
+   char *up_ptr = up_prb;
+   char *down_ptr = down_prb;
+
+   int w_size = 0;
+   while(up_ptr != up_prb_end) {
+      mean_up += (*up_ptr)-50;
+      mean_down += (*down_ptr)-50;
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   mean_up /= w_size;
+   mean_down /= w_size;
+
+
+   up_ptr = up_prb;
+   down_ptr = down_prb;
+   w_size = 0;
+   while(up_ptr != up_prb_end) {
+      variance_up += pow((*up_prb)-50 - mean_up,2);
+      variance_down += pow((*down_prb)-50 - mean_down,2);
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   variance_up /= (w_size-1);
+   variance_down /= (w_size-1);
+
+   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
+
+   if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+      return 1;
+      
+   return 0;
+}
+*/
+
 int fitting(char* up_prb, char* down_prb) {
    double epsilon_mean = 30.0;
    double epsilon_var = 30.0;
@@ -520,9 +749,6 @@ int fitting(char* up_prb, char* down_prb) {
 }
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
-   //printf("old seq: %s\n",old_seq);
-   //printf("new seq: %s\n",new_seq);
-
    int idx=0;
    int new_idx = 0;
 
index 2e587ac..3c019d5 100644 (file)
@@ -1,10 +1,39 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <regex.h>
 
 #include "common.h"
 #include "datastructures.h"
 
+char* get_regerror (int errcode, regex_t *compiled) {
+     size_t length = regerror (errcode, compiled, NULL, 0);
+     char *buffer = malloc (length);
+     (void) regerror (errcode, compiled, buffer, length);
+     return buffer;
+}
+
+char* get_id(regex_t rx, const char* desc) {
+   size_t nmatch = 1;
+   regmatch_t* all_matches = malloc(sizeof(regmatch_t)*nmatch);
+   int regerr = regexec(&rx, desc, nmatch, all_matches, REG_NOTBOL);
+
+   if ( regerr != 0 ) {
+      char* mes = get_regerror(regerr,&rx);
+      perror(mes);
+      exit(EXIT_FAILURE);
+   }
+
+   int start = all_matches[0].rm_so+3;
+   int end = all_matches[0].rm_eo-1;
+
+   int id_size = end - start;
+   char* id = malloc(sizeof(char)*id_size);
+   strncpy(id,desc+start,id_size);
+
+   return id;
+}
+  
 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
 
    int buffer_size = 256;
@@ -18,6 +47,14 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
    char* strand = malloc(sizeof(char)*4);
    char* xz    = malloc(sizeof(char)*4);
 
+   regex_t rx;
+   const char* pattern = "ID=[^;]*;";
+   if ( regcomp(&rx, pattern, 0) != 0) {
+      perror("regcomp");
+      exit(EXIT_FAILURE);
+   }
+
+
    // do one pass through the gff line to determine the number of
    // genes
    int numGenes = 0;
@@ -56,6 +93,7 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
          (*allGenes)[idx]->start = start;
          (*allGenes)[idx]->stop = stop;
          (*allGenes)[idx]->strand = (*strand);
+         (*allGenes)[idx]->id = get_id(rx,desc);
          //printf("gene start/stop: %d/%d\n",start,stop);
          continue;
       }
@@ -79,6 +117,7 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
    //printf("allGenes[1] is %d\n",(*allGenes)[1]);
    //printf("Skipped %d lines.\n",skippedLinesCounter);
 
+   regfree(&rx);
    free(chr);
    free(blah);
    free(id);