+ found bug which caused ~10 reads of overlap "0"
[qpalma.git] / tools / data_tools / filterReads.c
index 9bca1b1..bbd54b4 100644 (file)
@@ -78,9 +78,9 @@ int main(int argc, char* argv[]) {
    process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
    status = fclose(reads_fs);
-   status += fclose(out_fs);
+   status = fclose(out_fs);
    if(status != 0)
-      printf("closing of filestreams failed!\n");
+      perror("fclose");
       
    //free(allGenes);
    free(gff_filename);
@@ -112,8 +112,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       exit(EXIT_FAILURE);
    }
    off_t reads_filesize = reads_stat.st_size;
-
    printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+   int numReads = reads_filesize / 178.0;
 
    void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
    if (reads_area == MAP_FAILED) {
@@ -121,7 +121,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       exit(EXIT_FAILURE);
    }
    close(reads_fid);
-   
    printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
 
    void* linePtr = reads_area;
@@ -147,25 +146,30 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    struct gene* currentGene = (*allGenes)[gene_idx];
 
    int readCtr = 0;
+   int old_gene_stop = -1;
+   int old_pos = 0;
+   char* posPtr;
+   int new_chr = 0;
+   int new_pos = 0;
+   char* tmp_line = malloc(sizeof(char)*256);
    // start of the parsing loop 
    while(1) {
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
-      if (readCtr != 0 && readCtr % 100000 == 0)
-         printf("Processed %d reads.\n",readCtr);
+      if (readCtr != 0 && readCtr % 1000000 == 0)
+         printf("Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
 
-      //if (readCtr == 10000000)
-      //   exit(EXIT_FAILURE);
+      //if (gene_idx >= 1833)
+      //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
 
+      old_pos = pos;
       status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
       }
-
       //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
-
       // if the read is occurring several times elsewhere then get rid of it 
       if(occurrence != 1) {
          while (*(char*)linePtr != '\n') linePtr++;
@@ -177,30 +181,51 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
-         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+         if ( currentGene->stop < (pos + read_size-1) || currentGene->start < old_gene_stop ) { // go to next gene
             gene_idx++;
             exon_idx = 1;
+            old_gene_stop = currentGene->stop ;
             currentGene = (*allGenes)[gene_idx];
+            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+            ue = uo = ds = dov = 0;
             continue;
          }
 
-         if ( pos < currentGene->start ) { // go to next read
+         if ( pos < currentGene->start || pos < old_pos) { // go to next read
             next_read:
 
+            //posPtr = linePtr;
+            //while (1) {
+            //   printf("posPtr points to %c\n",*(char*)posPtr);
+            //   if ((*(char*)posPtr) == '\n') {
+            //      posPtr++;
+            //      tmp_line = strncpy(tmp_line,posPtr,256);
+
+            //      if (strcmp(tmp_line,"") == 0)
+            //         break;
+
+            //      sscanf(tmp_line,"%d\t%d\t",new_chr,new_pos);
+            //      printf("new_pos %d\n",new_pos);
+            //      if (new_pos >= currentGene->start) {
+            //         linePtr = posPtr;
+            //         break;
+            //      }
+            //   }
+            //   posPtr++;
+            //}
+            //printf("Went out!\n");
+
             while (*(char*)linePtr != '\n') linePtr++;
             linePtr++;
             readCtr += 1;
             current_line = strncpy(current_line,linePtr,256);
+            
             continue;
          }
 
       } else { // read IS within gene borders
 
          exon_label:
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
-         //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
-         //printf("exon_idx %d\n",exon_idx);
 
          if (exon_idx == currentGene->num_exons) {
             gene_idx++;
@@ -208,47 +233,53 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             currentGene = (*allGenes)[gene_idx];
             continue;
          }
+
+         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+         cur_exon_start = currentGene->exon_starts[exon_idx];
             
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
-            combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
-            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+            if (ue != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+
+            if (uo != 0 && ds != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
             ue = uo = ds = dov = 0;
             goto exon_label;
          }
 
-         if ( pos < prev_exon_stop - read_size + 1  ) { // go to next read
+         if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
             goto next_read;
-         }
 
          // if this position is reached the read is somehow overlapping or
          // exactly on the exon boundary. now determine where exactly:
-
          if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
             upstream_end[ue] = linePtr;
             ue++;
             goto next_read;
          }
 
-         if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
+         if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
             upstream_overlap[uo] = linePtr;
             uo++;
             goto next_read;
          }
 
-         if (pos == cur_exon_start) { // read starts at current exon start
+         if ( pos == cur_exon_start ) { // read starts at current exon start
             downstream_start[ds] = linePtr;
             ds++;
             goto next_read;
          }
 
-         if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
+         if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
             downstream_overlap[dov] = linePtr;
             dov++;
             goto next_read;
          }
 
-         goto next_read;
+         goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
       }
    }
 
@@ -273,11 +304,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 }
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
-   //printf("up_/down_size %d %d\n",up_size,down_size);
-
-   if (up_size == 0 || down_size == 0 || exon_stop == -1)
-      return;
-
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
@@ -320,12 +346,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    up_idx=0;
    down_idx=0;
    while(1) {
-      //printf("up_/down_idx %d %d\n",up_idx,down_idx);
-
-      if (up_idx == up_size || down_idx == down_size)
-         break;
-
-      if ( up_strand != down_strand )
+      if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
          break;
 
       strncpy(upstream_line,upstream[up_idx],256);
@@ -350,11 +371,10 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
       new_chastity[0] = '\0';
          
       int fit;
-      int w_size = 5;
+      int w_size = 6;
+      int overlap = 0;
       if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
-         int overlap = exon_start - down_pos;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
+         overlap = exon_start - down_pos;
          
          fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
          if (fit != 1)
@@ -373,10 +393,12 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
          strncat(new_chastity,down_chastity+overlap,36-overlap);
 
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+
       } // merge with read which is upstream overlapping
       
       if (down_pos == exon_start) {
-         int overlap = up_pos+36 - exon_stop;
+         overlap = up_pos+36 - exon_stop;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
          fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
@@ -395,9 +417,13 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_prb,down_prb,overlap);
          strncat(new_cal_prb,down_cal_prb,overlap);
          strncat(new_chastity,down_chastity,overlap);
+
+         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
       }
 
-      //printf("New entry!\n");
+      if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
+         printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+
       fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
       new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
     
@@ -412,7 +438,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 }
 
 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
-   double epsilon_mean = 10.0;
+   double epsilon_mean = 15.0;
    double epsilon_var = 10.0;
    double mean_up = 0;
    double variance_up = 0;
@@ -433,7 +459,6 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
    mean_up /= w_size;
    mean_down /= w_size;
 
-   //printf("means: %f %f\n",mean_up,mean_down);
 
    up_ptr = up_prb;
    down_ptr = down_prb;
@@ -448,7 +473,7 @@ int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end)
    variance_up /= (w_size-1);
    variance_down /= (w_size-1);
 
-   //printf("variances: %f %f\n",variance_up,variance_down);
+   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
 
    if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
       return 1;