+ small changes
[qpalma.git] / tools / data_tools / filterReads.c
index 1204315..aea1b25 100644 (file)
 #include <unistd.h>
 #include <math.h>
 
-#include "common.h"
 #include "datastructures.h"
 
+#define _FILE_OFFSET_BITS == 64
+
 int compare_gene_struct(struct gene* a, struct gene* b) {
    return a->stop - b->start;
 }
@@ -27,8 +28,9 @@ void sort_genes(struct gene*** allGenes, int numGenes);
 
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id);
 
+//int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
 int fitting(char* up_prb, char* down_prb);
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
@@ -38,9 +40,7 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 const int read_size = 36;
 
 int combined_reads = 0;
-
 int main(int argc, char* argv[]) {
-   combined_reads = 0;
 
    if(argc != 4) {
       printf("%s\n",info);
@@ -151,10 +151,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    int SIZE = 500;
    // initialize boundary arrays
+   void** upstream_end      = malloc(sizeof(void*)*SIZE);
    void** upstream_overlap  = malloc(sizeof(void*)*SIZE);
+   void** downstream_start  = malloc(sizeof(void*)*SIZE);
    void** downstream_overlap= malloc(sizeof(void*)*SIZE);
-   int uo,dov;
-   uo = dov = 0;
+   int ue,uo,ds,dov;
+   ue = uo = ds = dov = 0;
 
    int skippedLinesCounter = 0;
 
@@ -166,6 +168,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int gene_idx = 0;
    int exon_idx = 1;
    struct gene* currentGene = (*allGenes)[gene_idx];
+   char* gene_id = currentGene->id;
 
    char* disamb_seq = malloc(sizeof(char)*read_size);
 
@@ -184,6 +187,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
+      gene_id = currentGene->id;
+
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
@@ -211,7 +216,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            uo = dov = 0;
+            ue = uo = ds = dov = 0;
             continue;
          }
 
@@ -247,26 +252,39 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
             exon_idx++;
 
-            if (uo != 0 && dov != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
+            if (ue != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+
+            if (uo != 0 && ds != 0) 
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
 
-            uo = dov = 0;
+            ue = uo = ds = dov = 0;
             goto exon_label;
          }
 
          if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
             exonicReadCtr++;
-            remove_ambiguities(seq,strlen(seq),disamb_seq);
-            fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
+
+   // Removed exonic reads
+
+            //remove_ambiguities(seq,strlen(seq),disamb_seq);
+            //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
+
             goto next_read;
          }
 
          if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
             goto next_read;
 
-         // if this position is reached the read is somehow overlapping at 
-         // the exon boundary. now determine where exactly:
-         
+         // if this position is reached the read is somehow overlapping or
+         // exactly on the exon boundary. now determine where exactly:
+         if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
+            endPrevCtr++;
+            upstream_end[ue] = linePtr;
+            ue++;
+            goto next_read;
+         }
+
          if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
             prevOverlapCtr++;
             upstream_overlap[uo] = linePtr;
@@ -274,6 +292,13 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto next_read;
          }
 
+         if ( pos == cur_exon_start ) { // read starts at current exon start
+            currentStartCtr++;
+            downstream_start[ds] = linePtr;
+            ds++;
+            goto next_read;
+         }
+
          if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
             currentOverlapCtr++;
             downstream_overlap[dov] = linePtr;
@@ -286,9 +311,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       }
    }
 
-   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_overlap,dov,out_fs);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
+   combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
 
+   free(upstream_end);
    free(upstream_overlap);
+   free(downstream_start);
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
@@ -310,11 +338,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id) {
    //printf("up/down size is %d/%d\n",up_size,down_size);
-
+   
    int up_idx, down_idx, status;
-
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
 
@@ -352,7 +379,6 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    char* new_prb        = malloc(sizeof(char)*buffer_size);
    char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
    char* new_chastity   = malloc(sizeof(char)*buffer_size);
-
    char* new_up_seq = malloc(sizeof(char)*read_size);
    char* new_down_seq = malloc(sizeof(char)*read_size); 
 
@@ -386,6 +412,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_prb[0] = '\0';
          new_cal_prb[0] = '\0';
          new_chastity[0] = '\0';
+         int splitpos = 0;
          
          fit = fitting(up_prb+(36-overlap),down_prb);
          if (fit == -1)
@@ -407,8 +434,8 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
          strncat(new_chastity,down_chastity+fit,read_size-overlap);
 
-         fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
-         new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
+         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id);
 
          combined_reads++;
          used_flag[down_idx] = 1;
@@ -437,6 +464,55 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    free(new_chastity);
 }
 
+
+
+
+/*
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+   double epsilon_mean = 15.0;
+   double epsilon_var = 10.0;
+   double mean_up = 0;
+   double variance_up = 0;
+   double mean_down = 0;
+   double variance_down = 0;
+
+   char *up_ptr = up_prb;
+   char *down_ptr = down_prb;
+
+   int w_size = 0;
+   while(up_ptr != up_prb_end) {
+      mean_up += (*up_ptr)-50;
+      mean_down += (*down_ptr)-50;
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   mean_up /= w_size;
+   mean_down /= w_size;
+
+
+   up_ptr = up_prb;
+   down_ptr = down_prb;
+   w_size = 0;
+   while(up_ptr != up_prb_end) {
+      variance_up += pow((*up_prb)-50 - mean_up,2);
+      variance_down += pow((*down_prb)-50 - mean_down,2);
+      w_size++;
+      up_ptr++;
+      down_ptr++;
+   }
+   variance_up /= (w_size-1);
+   variance_down /= (w_size-1);
+
+   //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
+
+   if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+      return 1;
+      
+   return 0;
+}
+*/
+
 int fitting(char* up_prb, char* down_prb) {
    double epsilon_mean = 30.0;
    double epsilon_var = 30.0;
@@ -520,9 +596,6 @@ int fitting(char* up_prb, char* down_prb) {
 }
 
 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
-   //printf("old seq: %s\n",old_seq);
-   //printf("new seq: %s\n",new_seq);
-
    int idx=0;
    int new_idx = 0;