+ small changes
[qpalma.git] / tools / data_tools / filterReads.c
index 98a3e2b..aea1b25 100644 (file)
 #include <unistd.h>
 #include <math.h>
 
-#include "common.h"
 #include "datastructures.h"
 
+/*
+ * Request a 64-bit off_t so files larger than 2 GiB can be handled on
+ * 32-bit targets.  The macro must expand to the plain value 64; the former
+ * "== 64" replacement text made it expand to the tokens `== 64`.
+ *
+ * NOTE(review): feature-test macros only take effect when they are defined
+ * before the first system header is included.  This definition follows
+ * <unistd.h> and <math.h>, so it should be moved to the very top of the
+ * file (or passed as -D_FILE_OFFSET_BITS=64 on the compiler command line).
+ */
+#define _FILE_OFFSET_BITS 64
+
 int compare_gene_struct(struct gene* a, struct gene* b) {
    return a->stop - b->start;
 }
@@ -186,6 +187,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (gene_idx == numGenes || strcmp(current_line,"") == 0)
          break;
 
+      gene_id = currentGene->id;
+
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
@@ -213,7 +216,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            gene_id = currentGene->id;
             ue = uo = ds = dov = 0;
             continue;
          }
@@ -336,164 +338,9 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
-/*
-void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
-   //printf("up/down size is %d/%d\n",up_size,down_size);
-
-   int up_idx, down_idx, status;
-   char* upstream_line = malloc(sizeof(char)*256);
-   char* downstream_line = malloc(sizeof(char)*256);
-
-   int buffer_size= 64;
-
-   int up_chr        = 0;
-   int up_pos        = 0;
-   char* up_seq      = malloc(sizeof(char)*buffer_size);
-   int up_id         = 0;
-   char up_strand    = ' ';
-   int up_mismatch   = 0;
-   int up_occurrence = 0;
-   int up_sz       = 0;
-   int up_cut        = 0;
-   char* up_prb      = malloc(sizeof(char)*buffer_size);
-   char* up_cal_prb  = malloc(sizeof(char)*buffer_size);
-   char* up_chastity = malloc(sizeof(char)*buffer_size);
-
-   int down_chr        = 0;
-   int down_pos        = 0;
-   char* down_seq      = malloc(sizeof(char)*buffer_size);
-   int down_id         = 0;
-   char down_strand    = ' ';
-   int down_mismatch   = 0;
-   int down_occurrence = 0;
-   int down_sz         = 0;
-   int down_cut        = 0;
-   char* down_prb      = malloc(sizeof(char)*buffer_size);
-   char* down_cal_prb  = malloc(sizeof(char)*buffer_size);
-   char* down_chastity = malloc(sizeof(char)*buffer_size);
-
-   int new_chr        = 0;
-   char* new_seq      = malloc(sizeof(char)*buffer_size);
-   char new_strand    = ' ';
-   char* new_prb        = malloc(sizeof(char)*buffer_size);
-   char* new_cal_prb    = malloc(sizeof(char)*buffer_size);
-   char* new_chastity   = malloc(sizeof(char)*buffer_size);
-   char* new_up_seq = malloc(sizeof(char)*read_size);
-   char* new_down_seq = malloc(sizeof(char)*read_size); 
-
-   up_idx=0;
-   down_idx=0;
-   while(1) {
-      if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
-         break;
-
-      strncpy(upstream_line,upstream[up_idx],256);
-      status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
-      &up_cut,up_prb,up_cal_prb,up_chastity);
-      
-      strncpy(downstream_line,downstream[down_idx],256);
-      status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
-      &down_cut,down_prb,down_cal_prb,down_chastity);
-
-      remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
-      remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
-
-      new_seq[0] = '\0';
-      new_prb[0] = '\0';
-      new_cal_prb[0] = '\0';
-      new_chastity[0] = '\0';
-         
-      int fit;
-      int w_size = 6;
-      int overlap = 0;
-      if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
-         overlap = exon_start - down_pos;
-         
-         //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
-         //if (fit != 1)
-         //   goto end;
-
-         new_chr     = up_chr;
-         new_strand  = up_strand;
-
-         strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
-         strncat(new_prb,up_prb+(read_size-overlap),overlap);
-         strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
-         strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
-
-         strncat(new_seq,new_down_seq+overlap,read_size-overlap);
-         strncat(new_prb,down_prb+overlap,read_size-overlap);
-         strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
-         strncat(new_chastity,down_chastity+overlap,read_size-overlap);
-
-         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
-
-      } // merge with read which is upstream overlapping
-      
-      if (down_pos == exon_start) {
-         overlap = up_pos+read_size - exon_stop;
-         //printf("overlap is %d\n",overlap);
-         //printf("pos are: %d %d\n",up_pos,down_pos);
-         
-         fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
-         if (fit == -1)
-
-         //   goto end;
-
-         new_chr     = up_chr;
-         new_strand  = up_strand;
-
-         strncat(new_seq,new_up_seq,(read_size-overlap));
-         strncat(new_prb,up_prb,(read_size-overlap));
-         strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
-         strncat(new_chastity,up_chastity,(read_size-overlap));
-
-         strncat(new_seq,new_down_seq,overlap);
-         strncat(new_prb,down_prb,overlap);
-         strncat(new_cal_prb,down_cal_prb,overlap);
-         strncat(new_chastity,down_chastity,overlap);
-
-         //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
-      }
-
-      if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
-         printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
-
-      fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
-      new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
-    
-      end:
-
-      up_idx++;
-      down_idx++;
-   }
-
-   free(upstream_line);
-   free(downstream_line);
-  
-   free(new_up_seq);
-   free(new_down_seq); 
-
-   free(up_prb);
-   free(up_cal_prb);
-   free(up_chastity);
-
-   free(down_prb);
-   free(down_cal_prb);
-   free(down_chastity);
-
-   free(new_prb);
-   free(new_cal_prb);
-   free(new_chastity);
-
-}
-*/ 
-
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id) {
    //printf("up/down size is %d/%d\n",up_size,down_size);
-
+   
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);