+ small changes
[qpalma.git] / tools / data_tools / filterReads.c
index 669045e..aea1b25 100644 (file)
 #include <unistd.h>
 #include <math.h>
 
-#include "common.h"
 #include "datastructures.h"
 
+#define _FILE_OFFSET_BITS == 64
+
+int compare_gene_struct(struct gene* a, struct gene* b) {
+   return a->stop - b->start;
+}
+
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 
+void sort_genes(struct gene*** allGenes, int numGenes);
+
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id);
 
-int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+//int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+int fitting(char* up_prb, char* down_prb);
+
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
 
 const int read_size = 36;
 
+int combined_reads = 0;
 int main(int argc, char* argv[]) {
 
    if(argc != 4) {
@@ -38,9 +49,9 @@ int main(int argc, char* argv[]) {
 
    int status;
    int filenameSize = 256;
-   char *gff_filename = malloc(sizeof(char)*filenameSize);
-   char *reads_filename = malloc(sizeof(char)*filenameSize);
-   char *output_filename = malloc(sizeof(char)*filenameSize);
+   chargff_filename = malloc(sizeof(char)*filenameSize);
+   charreads_filename = malloc(sizeof(char)*filenameSize);
+   charoutput_filename = malloc(sizeof(char)*filenameSize);
 
    strncpy(gff_filename,argv[1],filenameSize);
    strncpy(reads_filename,argv[2],filenameSize);
@@ -68,21 +79,35 @@ int main(int argc, char* argv[]) {
    struct gene** allGenes;
    int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
    status = fclose(gff_fs);
+   free(gff_filename);
    if(status != 0)
       printf("closing of gff filestream failed!\n");
 
    printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
+   // some entries in the gff files are not in a sorted order
+   //printf("Sorting genes...\n");
+   //sort_genes(&allGenes,numGenes);
+   //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
+   ///printf("Genes were sorted!\n");
+   
+   int gidx, eidx;
+   int exon_cov = 0;
+   for(gidx=0;gidx<numGenes;gidx++) {
+      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
+         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
+   }}
+   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
+
    process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
    status = fclose(reads_fs);
-   status += fclose(out_fs);
+   status = fclose(out_fs);
    if(status != 0)
-      printf("closing of filestreams failed!\n");
+      perror("fclose");
       
-   //free(allGenes);
-   free(gff_filename);
    free(reads_filename);
+   free(output_filename);
    return 0;
 }
 
@@ -110,8 +135,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       exit(EXIT_FAILURE);
    }
    off_t reads_filesize = reads_stat.st_size;
-
    printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+   int numReads = reads_filesize / 178.0;
 
    void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
    if (reads_area == MAP_FAILED) {
@@ -119,7 +144,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       exit(EXIT_FAILURE);
    }
    close(reads_fid);
-   
    printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
 
    void* linePtr = reads_area;
@@ -136,6 +160,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    int skippedLinesCounter = 0;
 
+   int prev_exon_start = -1;
    int prev_exon_stop = -1;
    int cur_exon_start = -1;
 
@@ -143,6 +168,18 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int gene_idx = 0;
    int exon_idx = 1;
    struct gene* currentGene = (*allGenes)[gene_idx];
+   char* gene_id = currentGene->id;
+
+   char* disamb_seq = malloc(sizeof(char)*read_size);
+
+   int skippedReadCtr = 0;
+   int uselessReadCtr = 0;
+   int exonicReadCtr = 0;
+   int endPrevCtr = 0;
+   int prevOverlapCtr = 0;
+   int currentStartCtr = 0;
+   int currentOverlapCtr = 0;
+   int multioccurReadCtr = 0;
 
    int readCtr = 0;
    // start of the parsing loop 
@@ -150,27 +187,26 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
-      if (readCtr != 0 && readCtr % 100000 == 0)
-         printf("Processed %d reads.\n",readCtr);
+      gene_id = currentGene->id;
 
-      //if (readCtr == 10000000)
-      //   exit(EXIT_FAILURE);
+      if (readCtr != 0 && readCtr % 1000000 == 0)
+         printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+
+      //if (gene_idx >= 1833)
+      //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
 
       status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
+         goto next_read;
       }
 
-      //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
-
       // if the read is occurring several times elsewhere then get rid of it 
-      if(occurrence != 1) {
-         while (*(char*)linePtr != '\n') linePtr++;
-         linePtr++;
-         readCtr += 1;
-         current_line = strncpy(current_line,linePtr,256);
-         continue;
+      //if(!(occurrence >= 1 && occurrence <= 25)) {
+      if ( occurrence != 1 ) {
+         multioccurReadCtr++;
+         goto next_read;
       }
 
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
@@ -179,10 +215,14 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
+            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+            ue = uo = ds = dov = 0;
             continue;
          }
 
          if ( pos < currentGene->start ) { // go to next read
+            skippedReadCtr++;
+            
             next_read:
 
             while (*(char*)linePtr != '\n') linePtr++;
@@ -195,10 +235,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       } else { // read IS within gene borders
 
          exon_label:
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
-         //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
-         //printf("exon_idx %d\n",exon_idx);
 
          if (exon_idx == currentGene->num_exons) {
             gene_idx++;
@@ -206,52 +242,77 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             currentGene = (*allGenes)[gene_idx];
             continue;
          }
+
+         prev_exon_start = currentGene->exon_starts[exon_idx-1];
+         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         cur_exon_start = currentGene->exon_starts[exon_idx];
+
+         //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
             
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
-            combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
-            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+            if (ue != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+
+            if (uo != 0 && ds != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
+
             ue = uo = ds = dov = 0;
             goto exon_label;
          }
 
-         if ( pos < prev_exon_stop - read_size + 1  ) { // go to next read
+         if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
+            exonicReadCtr++;
+
+   // Removed exonic reads
+
+            //remove_ambiguities(seq,strlen(seq),disamb_seq);
+            //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
+
             goto next_read;
          }
 
+         if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
+            goto next_read;
+
          // if this position is reached the read is somehow overlapping or
          // exactly on the exon boundary. now determine where exactly:
-
          if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
+            endPrevCtr++;
             upstream_end[ue] = linePtr;
             ue++;
             goto next_read;
          }
 
-         if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
+         if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
+            prevOverlapCtr++;
             upstream_overlap[uo] = linePtr;
             uo++;
             goto next_read;
          }
 
-         if (pos == cur_exon_start) { // read starts at current exon start
+         if ( pos == cur_exon_start ) { // read starts at current exon start
+            currentStartCtr++;
             downstream_start[ds] = linePtr;
             ds++;
             goto next_read;
          }
 
-         if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
+         if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
+            currentOverlapCtr++;
             downstream_overlap[dov] = linePtr;
             dov++;
             goto next_read;
          }
 
-         goto next_read;
+         uselessReadCtr++;
+         goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
 
    free(upstream_end);
    free(upstream_overlap);
@@ -259,23 +320,27 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+   printf("%d reads were totally exonic\n",exonicReadCtr);
+   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
+   printf("%d reads where newly combined from two original reads\n",combined_reads);
+   printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
 
    status = munmap(reads_area,reads_filesize);
    if(status != 0)
-      printf("munmap failed!\n");
+      perror("munmap");
 
+   //free(current_line);
    free(seq);
    free(prb);
    free(cal_prb);
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
-   //printf("up_/down_size %d %d\n",up_size,down_size);
-
-   if (up_size == 0 || down_size == 0 || exon_stop == -1)
-      return;
-
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id) {
+   //printf("up/down size is %d/%d\n",up_size,down_size);
+   
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
@@ -314,90 +379,97 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
    char* new_chastity   = malloc(sizeof(char)*buffer_size);
+   char* new_up_seq = malloc(sizeof(char)*read_size);
+   char* new_down_seq = malloc(sizeof(char)*read_size); 
 
-   up_idx=0;
-   down_idx=0;
-   while(1) {
-      //printf("up_/down_idx %d %d\n",up_idx,down_idx);
+   int overlap;
+   int fit;
 
-      if (up_idx == up_size || down_idx == down_size)
-         break;
+   char* used_flag = calloc(down_size,sizeof(char));
 
+   for(up_idx=0;up_idx<up_size;up_idx++) {
       strncpy(upstream_line,upstream[up_idx],256);
       status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
       &up_cut,up_prb,up_cal_prb,up_chastity);
-      
-      strncpy(downstream_line,downstream[down_idx],256);
-      status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
-      &down_cut,down_prb,down_cal_prb,down_chastity);
-
-      new_prb[0] = '\0';
-      new_cal_prb[0] = '\0';
-      new_chastity[0] = '\0';
-         
-      int fit;
-      int w_size = 5;
-      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
-         int overlap = exon_start - down_pos;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
-         
-         fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
-         if (fit != 1)
-            goto end;
 
-         new_chr     = up_chr;
-         new_strand  = up_strand;
+      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
 
-         strncat(new_seq,up_seq+(36-overlap),overlap);
-         strncat(new_prb,up_prb+(36-overlap),overlap);
-         strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
-         strncat(new_chastity,up_chastity+(36-overlap),overlap);
+      overlap = exon_stop - up_pos;
 
-         strncat(new_seq,down_seq+overlap,36-overlap);
-         strncat(new_prb,down_prb+overlap,36-overlap);
-         strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
-         strncat(new_chastity,down_chastity+overlap,36-overlap);
+      for(down_idx=0;down_idx<down_size;down_idx++) {
+         if( used_flag[down_idx] == 1)
+            continue;
 
-      } // merge with read which is upstream overlapping
-      
-      if (down_pos == exon_start) {
-         int overlap = up_pos+36 - exon_stop;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
-         fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
-         if (fit != 1)
-            goto end;
+         strncpy(downstream_line,downstream[down_idx],256);
+         status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
+         &down_cut,down_prb,down_cal_prb,down_chastity);
+
+         remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
+
+         new_seq[0] = '\0';
+         new_prb[0] = '\0';
+         new_cal_prb[0] = '\0';
+         new_chastity[0] = '\0';
+         int splitpos = 0;
+         
+         fit = fitting(up_prb+(36-overlap),down_prb);
+         if (fit == -1)
+            continue;
+
+         if (! (fit  < overlap ))
+            continue;
 
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,up_seq,(36-overlap));
-         strncat(new_prb,up_prb,(36-overlap));
-         strncat(new_cal_prb,up_cal_prb,(36-overlap));
-         strncat(new_chastity,up_chastity,(36-overlap));
+         strncat(new_seq,new_up_seq,overlap);
+         strncat(new_prb,up_prb,overlap);
+         strncat(new_cal_prb,up_cal_prb,overlap);
+         strncat(new_chastity,up_chastity,overlap);
 
-         strncat(new_seq,down_seq,overlap);
-         strncat(new_prb,down_prb,overlap);
-         strncat(new_cal_prb,down_cal_prb,overlap);
-         strncat(new_chastity,down_chastity,overlap);
-      }
+         strncat(new_seq,new_down_seq+fit,read_size-overlap);
+         strncat(new_prb,down_prb+fit,read_size-overlap);
+         strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
+         strncat(new_chastity,down_chastity+fit,read_size-overlap);
 
-      //printf("New entry!\n");
-      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
-      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
-    
-      end:
+         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
+         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id);
 
-      up_idx++;
-      down_idx++;
+         combined_reads++;
+         used_flag[down_idx] = 1;
+      }
    }
+
+   free(upstream_line);
+   free(downstream_line);
+  
+   free(new_up_seq);
+   free(new_down_seq); 
+
+   free(up_seq);
+   free(up_prb);
+   free(up_cal_prb);
+   free(up_chastity);
+
+   free(down_seq);
+   free(down_prb);
+   free(down_cal_prb);
+   free(down_chastity);
+
+   free(new_seq);
+   free(new_prb);
+   free(new_cal_prb);
+   free(new_chastity);
 }
 
+
+
+
+/*
 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
-   double epsilon_mean = 10.0;
+   double epsilon_mean = 15.0;
    double epsilon_var = 10.0;
    double mean_up = 0;
    double variance_up = 0;
@@ -418,7 +490,6 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
    mean_up /= w_size;
    mean_down /= w_size;
 
-   //printf("means: %f %f\n",mean_up,mean_down);
 
    up_ptr = up_prb;
    down_ptr = down_prb;
@@ -433,16 +504,130 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
    variance_up /= (w_size-1);
    variance_down /= (w_size-1);
 
-   //printf("variances: %f %f\n",variance_up,variance_down);
+   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
 
    if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
       return 1;
       
    return 0;
 }
+*/
+
+int fitting(char* up_prb, char* down_prb) {
+   double epsilon_mean = 30.0;
+   double epsilon_var = 30.0;
+   int w_size = 6;
+
+   double current_mean_up = 0;
+   double current_variance_up = 0;
+   double current_mean_down = 0;
+   double current_variance_down = 0;
+
+   double* mean_up       = malloc(sizeof(double)*read_size-2*w_size);
+   double* variance_up   = malloc(sizeof(double)*read_size-2*w_size);
+   double* mean_down     = malloc(sizeof(double)*read_size-2*w_size);
+   double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
+
+   int iidx = -1;
+   int uidx;
+   int didx;
+
+   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+      didx = uidx+w_size;
+      current_mean_up += up_prb[uidx]-50;
+      current_mean_down += up_prb[didx]-50;
+
+   }
+   current_mean_up /= w_size;
+   current_mean_down /= w_size;
+
+   for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+      didx = uidx+w_size;
+      current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
+      current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
+      
+   }
+   current_variance_up /= (w_size-1);
+   current_variance_down /= (w_size-1);
+
+   for(iidx=w_size;iidx<30;iidx++) {
+      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+         didx = uidx+w_size;
+         mean_up[iidx-w_size] += down_prb[uidx]-50;
+         mean_down[iidx-w_size] += down_prb[didx]-50;
+
+      }
+      mean_up[iidx-w_size] /= w_size;
+      mean_down[iidx-w_size] /= w_size;
+
+      for(uidx=iidx-w_size;uidx<iidx;uidx++) {
+         didx = uidx+w_size;
+         variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
+         variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
+      }
+      variance_up[iidx-w_size] /= (w_size-1);
+      variance_down[iidx-w_size] /= (w_size-1);
+
+      //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
+      //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var 
+      //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
+      //   return iidx;
+   }
+
+   int bidx;
+   int bestIdx = -1;
+   double min = 1000.0;
+   for(bidx=0;bidx<read_size-2*w_size;bidx++) {
+      if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var 
+      && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
+         if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
+            min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
+            bestIdx = bidx;
+         }
+      }
+   }
+
+   free(mean_up);
+   free(variance_up);
+   free(mean_down);
+   free(variance_down);
+
+   return bestIdx;
+}
+
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
+   int idx=0;
+   int new_idx = 0;
+
+   while(idx<old_seq_size) {
+      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
+      if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
+         idx++;
+         continue;
+      }
+
+      if (old_seq[idx] == '[') {
+         idx += 2;
+         //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
+         continue;
+      }
+
+      new_seq[new_idx] = old_seq[idx];
+      idx++;
+      new_idx++;
+   }
+
+   if (new_idx != 36) {
+      printf("Error: Sequence is not of length 36!\n");
+      printf("old seq: %s\n",old_seq);
+      printf("new seq: %s\n",new_seq);
+      exit(EXIT_FAILURE);
+   }
+}
 
 /*
- * TODO
- * - Check strand
- * - check for [AC] and similar entries
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ * - remove_ambiguities (exchanges [XY] by the second entry)
  */