+ fixed index of reads in filterReads
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 6 Mar 2008 17:00:55 +0000 (17:00 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 6 Mar 2008 17:00:55 +0000 (17:00 +0000)
+ added script to check for valid splice sites defined by gff files

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8083 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile
tools/data_tools/debug_tools.c [new file with mode: 0644]
tools/data_tools/filterReads.c
tools/data_tools/parser.c
tools/data_tools/tools.c [new file with mode: 0644]

index f950d55..5602972 100644 (file)
@@ -3,6 +3,8 @@ CFLAGS=-O3 -ggdb -Wall -Wshadow -lm
 
 SRCS= parser.c\
        sort_genes.c\
+       tools.c\
+       debug_tools.c\
        datastructures.c
 
 OBJS = $(SRCS:%.cpp=%.o)
diff --git a/tools/data_tools/debug_tools.c b/tools/data_tools/debug_tools.c
new file mode 100644 (file)
index 0000000..ed97af8
--- /dev/null
@@ -0,0 +1,10 @@
+#include "debug_tools.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void fassert(int expr,int line, char* file) {
+   if (! expr) {
+      printf("invalid fassert at line %d in file %s!\n",line,file);
+      exit(EXIT_FAILURE);
+   }
+}
index d9ae253..2df9192 100644 (file)
@@ -44,7 +44,7 @@ static char *info = "Usage is:\n./filterReads gff reads output";
  *
  */
 
-#define _FDEBUG_ 1
+#define _FDEBUG_ 0
 
 #define DEBUG_READ 38603
 
@@ -266,6 +266,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          goto next_read;
       }
 
+      if( strand != 'D' )
+         goto next_read;
+
       FA(strlen(seq) >= read_size);
 
       FA(currentGene != 0);
@@ -322,7 +325,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
          // if this position is reached the read is somehow overlapping or
          // exactly on the exon boundary.
-         if ( (prev_exon_stop - pos + 1) >= min_overlap && (prev_exon_stop - pos + 1) <= max_overlap ) { // read overlaps with previous exon boundary
+         if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap ) { // read overlaps with previous exon boundary
             //printf("%s\n",current_line);
             prevOverlapCtr++;
             currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
@@ -470,54 +473,33 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
 
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
    // range of possible sequence length on exon side
-   int up_range   = exon_stop - up_read->pos+1;
+   int up_range   = exon_stop - up_read->pos;
    int down_range = down_read->pos+(read_size-1) - exon_start+1;
    int retval;
 
-   int u_off, d_off, u_size, d_size;
+   int u_size, d_size;
    u_size = d_size = -1;
 
-   //printf("possible range %d %d\n",up_range,down_range);
-
    if(up_range+down_range < read_size)
       return 0;
 
-   // both overlap more than a half of the new read
-   //if(up_range >= half_read &&  down_range >= half_read) {
-
-   //   u_size = half_read;
-   //   d_size = half_read;
-
-   //} else {
-
-      //if(up_range < down_range) {
-      //   u_size = up_range;
-      //   d_size = read_size - u_size;
-      //}
-
-      //if(up_range > down_range) {
-      //   d_size = down_range;
-      //   u_size = read_size - d_size;
-      //}
-      
-      //if(up_range < down_range) {
-      if (read_nr % 2 != 0) {
-         d_size = down_range;
-         u_size = read_size - d_size;
-      }
+   //if(up_range < down_range) {
+   if (read_nr % 2 != 0) {
+      d_size = down_range;
+      u_size = read_size - d_size;
+   }
 
-      //if(up_range >= down_range) {
-      if (read_nr % 2 == 0) {
-         u_size = up_range;
-         d_size = read_size - u_size;
-      }
-   //}
+   //if(up_range >= down_range) {
+   if (read_nr % 2 == 0) {
+      u_size = up_range;
+      d_size = read_size - u_size;
+   }
 
-   int p_start  = exon_stop - u_size + 1;
-   int p_stop   = exon_start + d_size - 1;
+   const int p_start  = exon_stop - u_size;
+   const int p_stop   = exon_start + d_size - 1;
 
-   u_off = p_start - up_read->pos;
-   d_off = exon_start - down_read->pos;
+   const int u_off = p_start - up_read->pos;
+   const int d_off = exon_start - down_read->pos;
 
    FA(u_size != -1);
    FA(d_size != -1);
@@ -532,9 +514,6 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    memset(new_down_seq,0,sizeof(char)*buf_size);
    memset(new_seq,0,sizeof(char)*buf_size);
    
-   //remove_ambiguities(up_read->seq,new_up_seq);
-   //remove_ambiguities(down_read->seq,new_down_seq);
-   //
    strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
    strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
 
@@ -622,98 +601,6 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    return retval;
 }
 
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
-   double current_mean_up = 0;
-   double current_mean_down = 0;
-
-   int w_size = 2;
-
-   int idx;
-
-   //printf("prb %s\n",up_prb);
-   //printf("prb %s\n",down_prb);
-
-   for(idx=0;idx<w_size;idx++) {
-      current_mean_up += up_prb[u_off+u_size+idx]-50;
-      current_mean_up += up_prb[u_off+u_size-idx]-50;
-      current_mean_up -= up_prb[idx]-50;
-
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down -= up_prb[idx]-50;
-   }
-
-   current_mean_up /= (2*w_size+1);
-   current_mean_down /= (2*w_size+1);
-
-   double ratio;
-   if( current_mean_up  > current_mean_down)
-      ratio = current_mean_down / current_mean_up;
-   else
-      ratio = current_mean_up / current_mean_down;
-
-   //printf("ratio is %f\n",ratio);
-
-   if (ratio < 0.35)
-      return 0;
-
-   return 1;
-}
-
-/*
- * As we can have 3 kinds of ambiguities
- *
- * [AG]
- * [A-]
- * [-A]
- *
- * we want to remove/store these ambiguities in the filtered reads
- */
-
-void remove_ambiguities(char * old_seq, char* new_seq) {
-
-   FA(0);
-
-   int idx=0;
-   int new_idx = 0;
-   int old_seq_size = strlen(old_seq);
-
-   while(idx<old_seq_size) {
-      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
-      if (old_seq[idx] == ']') {
-         idx++;
-         continue;
-      }
-
-      if (old_seq[idx] == '[') {
-
-         // we have a indel either on the dna or read sequence
-         if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
-            
-
-            while(1) { 
-               new_seq[new_idx] = old_seq[idx];
-               if(old_seq[idx] == ']')
-                  break;
-               new_idx++;
-               idx++;
-            }
-
-         } else {
-            idx += 2;
-            continue;
-         }
-      }
-
-      new_seq[new_idx] = old_seq[idx];
-      idx++;
-      new_idx++;
-   }
-
-   new_seq[new_idx] = '\0';
-   //printf("new_seq is %d :: %s\n",new_idx,new_seq);
-}
-
 
 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
    int new_idx, idx;
@@ -740,7 +627,7 @@ Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq
       printf("u_off is %d\n",u_off);
       printf("u_idx is %d\n",u_idx);
    }
-   u_off = u_idx;
+   u_off = u_idx+1;
 
 
    if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
@@ -803,7 +690,7 @@ Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq
       printf("d_off is %d\n",d_off);
       printf("d_idx is %d\n",d_idx);
    }
-   d_off = d_idx;
+   d_off = d_idx+1;
    //if(read_nr == DEBUG_READ) {
    //   printf("%s\n",new_seq);
    //   printf("%c\n",down_seq[d_off+idx]);
@@ -909,32 +796,3 @@ void print_read(Read* cRead) {
  * - check for [AC] and similar entries -> done simple (see function
  * - remove_ambiguities (exchanges [XY] by the second entry)
  */
-
-   //if( read_nr == 13 || read_nr == 14 )  {
-   //   printf("read nr %d",read_nr);
-   //   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-   //   printf("u/d range: %d %d\n",up_range,down_range);
-   //   printf("u/d size: %d %d\n",u_size,d_size);
-   //   printf("u/d off: %d %d\n",u_off,d_off);
-   //   printf("******************\n");
-
-   //   printf("%s\n",up_read->seq);
-   //   printf("%s\n",down_read->seq);
-
-   //   printf("******************\n");
-
-   //   printf("%s\n",new_up_seq);
-   //   printf("%s\n",new_down_seq);
-
-   //   printf("******************\n");
-   //   printf("%s\n",new_seq);
-   //   printf("%s\n",new_prb);
-   //   printf("%s\n",new_cal_prb);
-   //   printf("%s\n",new_chastity);
-   //}
-
-   //// The four last integers specify the positions of 
-   //// p_start : the position in the dna where the first read starts
-   //// exons_stop  : the position in the dna where the first exons ends
-   //// exons_start : the position in the dna where the second exons starts
-   //// p_stop  : the position in the dna where the (truncated) second read ends
index d07172a..9a5b9f8 100644 (file)
@@ -6,6 +6,23 @@
 
 #include "datastructures.h"
 
+/* 
+ * Note: 
+ * In the current gff files from the TAIR repository
+ * the exons boundaries are defined as follows: 
+ *
+ * exon start points at:
+ *
+ * agxy
+ *    ^
+ * whereas exon stop points at:
+ *
+ * xygt
+ *   ^
+ *
+ *
+ */
+
 char* get_regerror (int errcode, regex_t *compiled) {
      size_t length = regerror (errcode, compiled, NULL, 0);
      char *buffer = malloc (length);
@@ -104,8 +121,8 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
       }
 
       if (strcmp(id,"exon")==0) {
-         add_exon((*allGenes)[idx],start,stop);
-         //printf("exon start/stop: %d/%d\n",start,stop);
+         add_exon((*allGenes)[idx],start-1,stop);
+         //printf("exon start/stop: %d/%d\n",start-1,stop);
          continue;
       }
 
diff --git a/tools/data_tools/tools.c b/tools/data_tools/tools.c
new file mode 100644 (file)
index 0000000..f1e28cb
--- /dev/null
@@ -0,0 +1,96 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "debug_tools.h"
+
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
+   double current_mean_up = 0;
+   double current_mean_down = 0;
+
+   int w_size = 2;
+
+   int idx;
+
+   //printf("prb %s\n",up_prb);
+   //printf("prb %s\n",down_prb);
+
+   for(idx=0;idx<w_size;idx++) {
+      current_mean_up += up_prb[u_off+u_size+idx]-50;
+      current_mean_up += up_prb[u_off+u_size-idx]-50;
+      current_mean_up -= up_prb[idx]-50;
+
+      current_mean_down += up_prb[d_off+idx]-50;
+      current_mean_down += up_prb[d_off+idx]-50;
+      current_mean_down -= up_prb[idx]-50;
+   }
+
+   current_mean_up /= (2*w_size+1);
+   current_mean_down /= (2*w_size+1);
+
+   double ratio;
+   if( current_mean_up  > current_mean_down)
+      ratio = current_mean_down / current_mean_up;
+   else
+      ratio = current_mean_up / current_mean_down;
+
+   //printf("ratio is %f\n",ratio);
+
+   if (ratio < 0.35)
+      return 0;
+
+   return 1;
+}
+
+/*
+ * As we can have 3 kinds of ambiguities
+ *
+ * [AG]
+ * [A-]
+ * [-A]
+ *
+ * we want to remove/store these ambiguities in the filtered reads
+ */
+
+void remove_ambiguities(char * old_seq, char* new_seq) {
+
+   FA(0);
+
+   int idx=0;
+   int new_idx = 0;
+   int old_seq_size = strlen(old_seq);
+
+   while(idx<old_seq_size) {
+      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
+      if (old_seq[idx] == ']') {
+         idx++;
+         continue;
+      }
+
+      if (old_seq[idx] == '[') {
+
+         // we have a indel either on the dna or read sequence
+         if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
+            
+
+            while(1) { 
+               new_seq[new_idx] = old_seq[idx];
+               if(old_seq[idx] == ']')
+                  break;
+               new_idx++;
+               idx++;
+            }
+
+         } else {
+            idx += 2;
+            continue;
+         }
+      }
+
+      new_seq[new_idx] = old_seq[idx];
+      idx++;
+      new_idx++;
+   }
+
+   new_seq[new_idx] = '\0';
+   //printf("new_seq is %d :: %s\n",new_idx,new_seq);
+}