+ changed reads combination in order to allow for overlaps at both exon borders
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 7 Jan 2008 16:03:15 +0000 (16:03 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 7 Jan 2008 16:03:15 +0000 (16:03 +0000)
+ fixed index bug in the remove_ambiguities function

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7245 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c

index 436168c..1204315 100644 (file)
@@ -29,7 +29,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
 
-int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+int fitting(char* up_prb, char* down_prb);
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
 
@@ -37,7 +37,10 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 
 const int read_size = 36;
 
+int combined_reads = 0;
+
 int main(int argc, char* argv[]) {
+   combined_reads = 0;
 
    if(argc != 4) {
       printf("%s\n",info);
@@ -148,12 +151,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    int SIZE = 500;
    // initialize boundary arrays
-   void** upstream_end      = malloc(sizeof(void*)*SIZE);
    void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
-   void** downstream_start  = malloc(sizeof(void*)*SIZE);
    void** downstream_overlap= malloc(sizeof(void*)*SIZE);
-   int ue,uo,ds,dov;
-   ue = uo = ds = dov = 0;
+   int uo,dov;
+   uo = dov = 0;
 
    int skippedLinesCounter = 0;
 
@@ -188,8 +189,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
       //if (gene_idx >= 1833)
       //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
-      //if (readCtr == 2000000)
-      //   exit(EXIT_SUCCESS);
 
       status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
@@ -197,17 +196,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          skippedLinesCounter++;
          goto next_read;
       }
-      //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+
       // if the read is occurring several times elsewhere then get rid of it 
-      if(!(occurrence >= 1 && occurrence <= 25)) {
-      //if ( occurrence != 1 ) {
+      //if(!(occurrence >= 1 && occurrence <= 25)) {
+      if ( occurrence != 1 ) {
          multioccurReadCtr++;
          goto next_read;
-         //while (*(char*)linePtr != '\n') linePtr++;
-         //linePtr++;
-         //readCtr += 1;
-         //current_line = strncpy(current_line,linePtr,256);
-         //continue;
       }
 
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
@@ -217,7 +211,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            ue = uo = ds = dov = 0;
+            uo = dov = 0;
             continue;
          }
 
@@ -253,13 +247,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
 
-            if (ue != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+            if (uo != 0 && dov != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
 
-            if (uo != 0 && ds != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
-
-            ue = uo = ds = dov = 0;
+            uo = dov = 0;
             goto exon_label;
          }
 
@@ -273,15 +264,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
             goto next_read;
 
-         // if this position is reached the read is somehow overlapping or
-         // exactly on the exon boundary. now determine where exactly:
-         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
-            endPrevCtr++;
-            upstream_end[ue] = linePtr;
-            ue++;
-            goto next_read;
-         }
-
+         // if this position is reached the read is somehow overlapping at 
+         // the exon boundary. now determine where exactly:
+         
          if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
             prevOverlapCtr++;
             upstream_overlap[uo] = linePtr;
@@ -289,13 +274,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto next_read;
          }
 
-         if ( pos == cur_exon_start ) { // read starts at current exon start
-            currentStartCtr++;
-            downstream_start[ds] = linePtr;
-            ds++;
-            goto next_read;
-         }
-
          if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
             currentOverlapCtr++;
             downstream_overlap[dov] = linePtr;
@@ -308,20 +286,17 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
 
-   free(upstream_end);
    free(upstream_overlap);
-   free(downstream_start);
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
    printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
    printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
    printf("%d reads were totally exonic\n",exonicReadCtr);
-   printf("%d reads end at prev. exon. %d reads overlap with prev. exon.\n", endPrevCtr, prevOverlapCtr);
-   printf("%d reads start at current exon. %d reads overlap with current exon.\n",currentStartCtr,currentOverlapCtr);
+   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
+   printf("%d reads where newly combined from two original reads\n",combined_reads);
    printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
 
    status = munmap(reads_area,reads_filesize);
@@ -339,6 +314,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    //printf("up/down size is %d/%d\n",up_size,down_size);
 
    int up_idx, down_idx, status;
+
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
 
@@ -376,95 +352,67 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
    char* new_chastity   = malloc(sizeof(char)*buffer_size);
+
    char* new_up_seq = malloc(sizeof(char)*read_size);
    char* new_down_seq = malloc(sizeof(char)*read_size); 
 
-   up_idx=0;
-   down_idx=0;
-   while(1) {
-      if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
-         break;
+   int overlap;
+   int fit;
+
+   char* used_flag = calloc(down_size,sizeof(char));
 
+   for(up_idx=0;up_idx<up_size;up_idx++) {
       strncpy(upstream_line,upstream[up_idx],256);
       status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
       &up_cut,up_prb,up_cal_prb,up_chastity);
-      
-      strncpy(downstream_line,downstream[down_idx],256);
-      status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
-      &down_cut,down_prb,down_cal_prb,down_chastity);
 
       remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
-      remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
-
-      new_seq[0] = '\0';
-      new_prb[0] = '\0';
-      new_cal_prb[0] = '\0';
-      new_chastity[0] = '\0';
-         
-      int fit;
-      int w_size = 6;
-      int overlap = 0;
-      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
-         overlap = exon_start - down_pos;
-         
-         //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
-         //if (fit != 1)
-         //   goto end;
 
-         new_chr     = up_chr;
-         new_strand  = up_strand;
+      overlap = exon_stop - up_pos;
 
-         strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
-         strncat(new_prb,up_prb+(read_size-overlap),overlap);
-         strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
-         strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
+      for(down_idx=0;down_idx<down_size;down_idx++) {
+         if( used_flag[down_idx] == 1)
+            continue;
 
-         strncat(new_seq,new_down_seq+overlap,read_size-overlap);
-         strncat(new_prb,down_prb+overlap,read_size-overlap);
-         strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
-         strncat(new_chastity,down_chastity+overlap,read_size-overlap);
+         strncpy(downstream_line,downstream[down_idx],256);
+         status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
+         &down_cut,down_prb,down_cal_prb,down_chastity);
 
-         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+         remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
 
-      } // merge with read which is upstream overlapping
-      
-      if (down_pos == exon_start) {
-         overlap = up_pos+read_size - exon_stop;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
+         new_seq[0] = '\0';
+         new_prb[0] = '\0';
+         new_cal_prb[0] = '\0';
+         new_chastity[0] = '\0';
          
-         //fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
-         //if (fit != 1)
-         //   goto end;
+         fit = fitting(up_prb+(36-overlap),down_prb);
+         if (fit == -1)
+            continue;
+
+         if (! (fit  < overlap ))
+            continue;
 
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,new_up_seq,(read_size-overlap));
-         strncat(new_prb,up_prb,(read_size-overlap));
-         strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
-         strncat(new_chastity,up_chastity,(read_size-overlap));
+         strncat(new_seq,new_up_seq,overlap);
+         strncat(new_prb,up_prb,overlap);
+         strncat(new_cal_prb,up_cal_prb,overlap);
+         strncat(new_chastity,up_chastity,overlap);
 
-         strncat(new_seq,new_down_seq,overlap);
-         strncat(new_prb,down_prb,overlap);
-         strncat(new_cal_prb,down_cal_prb,overlap);
-         strncat(new_chastity,down_chastity,overlap);
+         strncat(new_seq,new_down_seq+fit,read_size-overlap);
+         strncat(new_prb,down_prb+fit,read_size-overlap);
+         strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
+         strncat(new_chastity,down_chastity+fit,read_size-overlap);
 
-         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
-      }
+         fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
+         new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
 
-      if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
-         printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
-
-      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
-      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
-    
-      end:
-
-      up_idx++;
-      down_idx++;
+         combined_reads++;
+         used_flag[down_idx] = 1;
+      }
    }
 
    free(upstream_line);
@@ -473,62 +421,102 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    free(new_up_seq);
    free(new_down_seq); 
 
+   free(up_seq);
    free(up_prb);
    free(up_cal_prb);
    free(up_chastity);
 
+   free(down_seq);
    free(down_prb);
    free(down_cal_prb);
    free(down_chastity);
 
+   free(new_seq);
    free(new_prb);
    free(new_cal_prb);
    free(new_chastity);
-
 }
 
-int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
-   double epsilon_mean = 15.0;
-   double epsilon_var = 10.0;
-   double mean_up = 0;
-   double variance_up = 0;
-   double mean_down = 0;
-   double variance_down = 0;
-
-   char *up_ptr = up_prb;
-   char *down_ptr = down_prb;
-
-   int w_size = 0;
-   while(up_ptr != up_prb_end) {
-      mean_up += (*up_ptr)-50;
-      mean_down += (*down_ptr)-50;
-      w_size++;
-      up_ptr++;
-      down_ptr++;
+int fitting(char* up_prb, char* down_prb) {
+   double epsilon_mean = 30.0;
+   double epsilon_var = 30.0;
+   int w_size = 6;
+
+   double current_mean_up = 0;
+   double current_variance_up = 0;
+   double current_mean_down = 0;
+   double current_variance_down = 0;
+
+   double* mean_up       = malloc(sizeof(double)*read_size-2*w_size);
+   double* variance_up   = malloc(sizeof(double)*read_size-2*w_size);
+   double* mean_down     = malloc(sizeof(double)*read_size-2*w_size);
+   double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
+
+   int iidx = -1;
+   int uidx;
+   int didx;
+
+   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+      didx = uidx+w_size;
+      current_mean_up += up_prb[uidx]-50;
+      current_mean_down += up_prb[didx]-50;
+
    }
-   mean_up /= w_size;
-   mean_down /= w_size;
-
-
-   up_ptr = up_prb;
-   down_ptr = down_prb;
-   w_size = 0;
-   while(up_ptr != up_prb_end) {
-      variance_up += pow((*up_prb)-50 - mean_up,2);
-      variance_down += pow((*down_prb)-50 - mean_down,2);
-      w_size++;
-      up_ptr++;
-      down_ptr++;
+   current_mean_up /= w_size;
+   current_mean_down /= w_size;
+
+   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+      didx = uidx+w_size;
+      current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
+      current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
+      
    }
-   variance_up /= (w_size-1);
-   variance_down /= (w_size-1);
+   current_variance_up /= (w_size-1);
+   current_variance_down /= (w_size-1);
 
-   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
+   for(iidx=w_size;iidx<30;iidx++) {
+      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+         didx = uidx+w_size;
+         mean_up[iidx-w_size] += down_prb[uidx]-50;
+         mean_down[iidx-w_size] += down_prb[didx]-50;
 
-   if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
-      return 1;
-      
-   return 0;
+      }
+      mean_up[iidx-w_size] /= w_size;
+      mean_down[iidx-w_size] /= w_size;
+
+      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+         didx = uidx+w_size;
+         variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
+         variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
+      }
+      variance_up[iidx-w_size] /= (w_size-1);
+      variance_down[iidx-w_size] /= (w_size-1);
+
+      //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
+      //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var 
+      //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
+      //   return iidx;
+   }
+
+   int bidx;
+   int bestIdx = -1;
+   double min = 1000.0;
+   for(bidx=0;bidx<read_size-2*w_size;bidx++) {
+      if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var 
+      && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
+         if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
+            min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
+            bestIdx = bidx;
+         }
+      }
+   }
+
+   free(mean_up);
+   free(variance_up);
+   free(mean_down);
+   free(variance_down);
+
+   return bestIdx;
 }
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
@@ -537,24 +525,36 @@ void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
 
    int idx=0;
    int new_idx = 0;
+
    while(idx<old_seq_size) {
+      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
+      if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
+         idx++;
+         continue;
+      }
+
       if (old_seq[idx] == '[') {
-         new_seq[new_idx] = old_seq[++idx];
-         new_idx++;
-         idx += 3;
+         idx += 2;
+         //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
          continue;
       }
 
-      new_seq[new_idx] = old_seq[idx++];
+      new_seq[new_idx] = old_seq[idx];
+      idx++;
       new_idx++;
    }
-   //printf("old seq: %s\n",old_seq);
-   //printf("new seq: %s\n",new_seq);
+
+   if (new_idx != 36) {
+      printf("Error: Sequence is not of length 36!\n");
+      printf("old seq: %s\n",old_seq);
+      printf("new seq: %s\n",new_seq);
+      exit(EXIT_FAILURE);
+   }
 }
 
 /*
  * TODO:
  * - Check strand -> done simple (only if equal)
  * - check for [AC] and similar entries -> done simple (see function
- *   remove_ambiguities (exchanges [XY] by the first entry)
+ * - remove_ambiguities (exchanges [XY] by the second entry)
  */