+ added missing check for strand orientation
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 21 Dec 2007 15:44:52 +0000 (15:44 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 21 Dec 2007 15:44:52 +0000 (15:44 +0000)
+ added function remove_ambiguities, which resolves the differences between
annotation and reads (the [XY] pairs in the sequence string) by keeping only the first entry of each pair

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7207 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c

index 669045e..9bca1b1 100644 (file)
@@ -25,6 +25,8 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 
 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
 
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
+
 static char *info = "Usage is:\n./filterReads gff reads output";
 
 const int read_size = 36;
@@ -323,6 +325,9 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
       if (up_idx == up_size || down_idx == down_size)
          break;
 
+      if ( up_strand != down_strand )
+         break;
+
       strncpy(upstream_line,upstream[up_idx],256);
       status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
@@ -333,6 +338,13 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
       &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
       &down_cut,down_prb,down_cal_prb,down_chastity);
 
+      char* new_up_seq = malloc(sizeof(char)*read_size);
+      char* new_down_seq = malloc(sizeof(char)*read_size); 
+
+      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
+      remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
+
+      new_seq[0] = '\0';
       new_prb[0] = '\0';
       new_cal_prb[0] = '\0';
       new_chastity[0] = '\0';
@@ -351,12 +363,12 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,up_seq+(36-overlap),overlap);
+         strncat(new_seq,new_up_seq+(36-overlap),overlap);
          strncat(new_prb,up_prb+(36-overlap),overlap);
          strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
          strncat(new_chastity,up_chastity+(36-overlap),overlap);
 
-         strncat(new_seq,down_seq+overlap,36-overlap);
+         strncat(new_seq,new_down_seq+overlap,36-overlap);
          strncat(new_prb,down_prb+overlap,36-overlap);
          strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
          strncat(new_chastity,down_chastity+overlap,36-overlap);
@@ -374,12 +386,12 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,up_seq,(36-overlap));
+         strncat(new_seq,new_up_seq,(36-overlap));
          strncat(new_prb,up_prb,(36-overlap));
          strncat(new_cal_prb,up_cal_prb,(36-overlap));
          strncat(new_chastity,up_chastity,(36-overlap));
 
-         strncat(new_seq,down_seq,overlap);
+         strncat(new_seq,new_down_seq,overlap);
          strncat(new_prb,down_prb,overlap);
          strncat(new_cal_prb,down_cal_prb,overlap);
          strncat(new_chastity,down_chastity,overlap);
@@ -393,6 +405,9 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 
       up_idx++;
       down_idx++;
+
+      free(new_up_seq);
+      free(new_down_seq);
    }
 }
 
@@ -441,8 +456,30 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
    return 0;
 }
 
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
+   // Copy old_seq into new_seq, replacing every 4-character "[XY]" group by X alone.
+   // new_seq is not bounds-checked and no '\0' is appended: the caller sizes the buffer.
+
+   int idx=0;         // read position in old_seq
+   int new_idx = 0;   // write position in new_seq
+   while(idx<old_seq_size) {
+      if (old_seq[idx] == '[') {
+         new_seq[new_idx] = old_seq[++idx];   // keep the character right after '['
+         new_idx++;
+         idx += 3;   // step over the kept character, the alternative and ']'
+         continue;
+      }
+
+      new_seq[new_idx] = old_seq[idx++];
+      new_idx++;
+   }
+   // NOTE(review): new_idx (the number of characters written) is not returned, so the
+   // caller cannot learn the cleaned length - TODO confirm that this is intended.
+}
+
 /*
- * TODO
- * - Check strand
- * - check for [AC] and similar entries
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ *   remove_ambiguities, which replaces each [XY] by its first entry)
  */