+ extended filtering to take indels [A-] resp. [-A] into account
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 9 Feb 2008 12:14:26 +0000 (12:14 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sat, 9 Feb 2008 12:14:26 +0000 (12:14 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7729 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c

index 71679c4..1a05ae9 100644 (file)
@@ -27,8 +27,9 @@ void sort_genes(struct gene*** allGenes, int numGenes);
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off);
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
 void remove_ambiguities(char * old_seq, char* new_seq);
+void join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
 
@@ -457,25 +458,12 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
 }
 
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
-   int buffer_size = read_size+1;
-    
-   char* new_seq        = malloc(sizeof(char)*buffer_size);
-   char* new_prb        = malloc(sizeof(char)*buffer_size);
-   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
-   char* new_chastity   = malloc(sizeof(char)*buffer_size);
-
-   char* new_up_seq     = malloc(sizeof(char)*(read_size+1));
-   char* new_down_seq   = malloc(sizeof(char)*(read_size+1)); 
-   memset(new_up_seq,0,sizeof(char)*read_size);
-   memset(new_down_seq,0,sizeof(char)*read_size);
-
    // range of possible sequence length on exon side
    int up_range   = exon_stop - up_read->pos+1;
    int down_range = down_read->pos+(read_size-1) - exon_start+1;
 
    int u_off, d_off, u_size, d_size;
-   u_size = -1;
-   d_size = -1;
+   u_size = d_size = -1;
 
    //printf("possible range %d %d\n",up_range,down_range);
 
@@ -513,14 +501,25 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    FA(d_size != -1);
    FA(u_size + d_size == read_size);
 
-   //printf("%lu %lu\n",up_read->id,down_read->id);
+   int buf_size = 4*read_size;
+   char* new_up_seq     = malloc(sizeof(char)*buf_size);
+   char* new_down_seq   = malloc(sizeof(char)*buf_size); 
+   char* new_seq        = malloc(sizeof(char)*buf_size);
+
+   memset(new_up_seq,0,sizeof(char)*buf_size);
+   memset(new_down_seq,0,sizeof(char)*buf_size);
+   memset(new_seq,0,sizeof(char)*buf_size);
    
    remove_ambiguities(up_read->seq,new_up_seq);
    remove_ambiguities(down_read->seq,new_down_seq);
 
-   strncpy(new_seq,new_up_seq+u_off,u_size);
-   strncpy(new_seq+u_size,new_down_seq+d_off,d_size);
-   new_seq[read_size] = '\0';
+   join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size);
+
+   buf_size = read_size+1;
+
+   char* new_prb        = malloc(sizeof(char)*buf_size);
+   char* new_cal_prb    = malloc(sizeof(char)*buf_size);
+   char* new_chastity   = malloc(sizeof(char)*buf_size);
 
    strncpy(new_prb, up_read->prb+u_off, u_size);
    strncpy(new_prb+u_size, down_read->prb+d_off, d_size);
@@ -534,46 +533,23 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size);
    new_chastity[read_size] = '\0';
 
-   int status = fitting(up_read->prb,down_read->prb,u_off,d_off);
-
-   if(status != 1)
-      return status;
-
-   //if( read_nr == 13 || read_nr == 14 )  {
-   //   printf("read nr %d",read_nr);
-   //   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-   //   printf("u/d range: %d %d\n",up_range,down_range);
-   //   printf("u/d size: %d %d\n",u_size,d_size);
-   //   printf("u/d off: %d %d\n",u_off,d_off);
-   //   printf("******************\n");
-
-   //   printf("%s\n",up_read->seq);
-   //   printf("%s\n",down_read->seq);
+   int status = fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
+   int retval;
 
-   //   printf("******************\n");
-
-   //   printf("%s\n",new_up_seq);
-   //   printf("%s\n",new_down_seq);
+   if(status != 1) {
+      retval = 0;
+      goto free;
+   }
 
-   //   printf("******************\n");
-   //   printf("%s\n",new_seq);
-   //   printf("%s\n",new_prb);
-   //   printf("%s\n",new_cal_prb);
-   //   printf("%s\n",new_chastity);
-   //}
+   retval = status;
 
-   //// The four last integers specify the positions of 
-   //// p_start : the position in the dna where the first read starts
-   //// exons_stop  : the position in the dna where the first exons ends
-   //// exons_start : the position in the dna where the second exons starts
-   //// p_stop  : the position in the dna where the (truncated) second read ends
-   
    fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
       read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
 
    read_nr++;
    combined_reads++;
 
+   free:
    free(new_up_seq);
    free(new_down_seq); 
 
@@ -582,24 +558,27 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    free(new_cal_prb);
    free(new_chastity);
 
-   return 1;
+   return retval;
 }
 
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off) {
+int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
    double current_mean_up = 0;
    double current_mean_down = 0;
 
-   int w_size = 3;
+   int w_size = 2;
 
    int idx;
 
+   printf("prb %s\n",up_prb);
+   printf("prb %s\n",down_prb);
+
    for(idx=0;idx<w_size;idx++) {
-      current_mean_up += up_prb[u_off+idx]-50;
-      current_mean_up += up_prb[u_off-idx]-50;
+      current_mean_up += up_prb[u_off+u_size+idx]-50;
+      current_mean_up += up_prb[u_off+u_size-idx]-50;
       current_mean_up -= up_prb[idx]-50;
 
       current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down += up_prb[d_off-idx]-50;
+      current_mean_down += up_prb[d_off+idx]-50;
       current_mean_down -= up_prb[idx]-50;
    }
 
@@ -612,12 +591,24 @@ int fitting(char* up_prb, char* down_prb, int u_off, int d_off) {
    else
       ratio = current_mean_up / current_mean_down;
 
+   printf("ratio is %f\n",ratio);
+
    if (ratio < 0.5)
       return 0;
 
    return 1;
 }
 
+/*
+ * As we can have 3 kinds of ambiguities
+ *
+ * [AG]
+ * [A-]
+ * [-A]
+ *
+ * we want to remove/store these ambiguities in the filtered reads
+ */
+
 void remove_ambiguities(char * old_seq, char* new_seq) {
    int idx=0;
    int new_idx = 0;
@@ -625,32 +616,95 @@ void remove_ambiguities(char * old_seq, char* new_seq) {
 
    while(idx<old_seq_size) {
       //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
-      if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
+      if (old_seq[idx] == ']') {
          idx++;
          continue;
       }
 
       if (old_seq[idx] == '[') {
-         idx += 2;
-         //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
-         continue;
+
+         // we have a indel either on the dna or read sequence
+         if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
+
+            while(1) { 
+               new_seq[new_idx] = old_seq[idx];
+               if(old_seq[idx] == ']')
+                  break;
+               new_idx++;
+               idx++;
+            }
+
+         } else {
+            idx += 2;
+            continue;
+         }
       }
 
       new_seq[new_idx] = old_seq[idx];
       idx++;
       new_idx++;
-      //printf("%s\t%s\n",old_seq,new_seq);
-      //printf("%d\t%d\n",idx,new_idx);
    }
 
-   if (new_idx != read_size) {
-      printf("Error: Sequence is not of length 36!\n");
-      printf("old seq: %s\n",old_seq);
-      printf("new seq: %s\n",new_seq);
-      exit(EXIT_FAILURE);
+   new_seq[new_idx] = '\0';
+   //printf("new_seq is %d :: %s\n",new_idx,new_seq);
+}
+
+void join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size) {
+   int new_idx, idx;
+   new_idx = idx = 0;
+   int elem_ctr = 0;
+
+   printf("join_seq %d %d %d %d\n",u_off,u_size,d_off,d_size);
+
+   while(1) {
+      if (elem_ctr == u_size)
+         break;
+
+      if(up_seq[u_off+idx] == '[') {
+         while(1) { 
+            new_seq[new_idx] = up_seq[u_off+idx];
+            new_idx++;
+            idx++;
+            if(up_seq[u_off+idx-1] == ']')
+               break;
+         }
+         elem_ctr++;
+         continue;
+      }
+
+      new_seq[new_idx] = up_seq[u_off+idx];
+      idx++;
+      new_idx++;
+      elem_ctr++;
+   }
+
+   elem_ctr = 0;
+   idx = 0;
+
+   while(1) {
+      if (elem_ctr == d_size)
+         break;
+
+      if(down_seq[d_off+idx] == '[') {
+         while(1) { 
+            new_seq[new_idx] = down_seq[d_off+idx];
+            new_idx++;
+            idx++;
+            if(down_seq[d_off+idx-1] == ']')
+               break;
+         }
+         elem_ctr++;
+         continue;
+      }
+
+      new_seq[new_idx] = down_seq[d_off+idx];
+      idx++;
+      new_idx++;
+      elem_ctr++;
    }
 
-   new_seq[read_size] = '\0';
+   new_seq[new_idx+1] = '\0';
+   printf("result: %s\n",new_seq);
 }
 
 void print_read(Read* cRead) {
@@ -668,3 +722,32 @@ void print_read(Read* cRead) {
  * - check for [AC] and similar entries -> done simple (see function
  * - remove_ambiguities (exchanges [XY] by the second entry)
  */
+
+   //if( read_nr == 13 || read_nr == 14 )  {
+   //   printf("read nr %d",read_nr);
+   //   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+   //   printf("u/d range: %d %d\n",up_range,down_range);
+   //   printf("u/d size: %d %d\n",u_size,d_size);
+   //   printf("u/d off: %d %d\n",u_off,d_off);
+   //   printf("******************\n");
+
+   //   printf("%s\n",up_read->seq);
+   //   printf("%s\n",down_read->seq);
+
+   //   printf("******************\n");
+
+   //   printf("%s\n",new_up_seq);
+   //   printf("%s\n",new_down_seq);
+
+   //   printf("******************\n");
+   //   printf("%s\n",new_seq);
+   //   printf("%s\n",new_prb);
+   //   printf("%s\n",new_cal_prb);
+   //   printf("%s\n",new_chastity);
+   //}
+
+   //// The four last integers specify the positions of 
+   //// p_start : the position in the dna where the first read starts
+   //// exons_stop  : the position in the dna where the first exons ends
+   //// exons_start : the position in the dna where the second exons starts
+   //// p_stop  : the position in the dna where the (truncated) second read ends