+ some fixes of the filterReads and parser functions
[qpalma.git] / tools / data_tools / filterReads.c
index be348b7..69fb2bb 100644 (file)
@@ -41,6 +41,8 @@ static char *info = "Usage is:\n./filterReads gff reads output";
  *
  */
 
  *
  */
 
+const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
+
 const int read_size = 36;
 
 const int min_overlap = 6;
 const int read_size = 36;
 
 const int min_overlap = 6;
@@ -62,6 +64,7 @@ int main(int argc, char* argv[]) {
    char* reads_filename = malloc(sizeof(char)*filenameSize);
    char* output_filename = malloc(sizeof(char)*filenameSize);
 
    char* reads_filename = malloc(sizeof(char)*filenameSize);
    char* output_filename = malloc(sizeof(char)*filenameSize);
 
+
    strncpy(gff_filename,argv[1],filenameSize);
    strncpy(reads_filename,argv[2],filenameSize);
    strncpy(output_filename,argv[3],filenameSize);
    strncpy(gff_filename,argv[1],filenameSize);
    strncpy(reads_filename,argv[2],filenameSize);
    strncpy(output_filename,argv[3],filenameSize);
@@ -132,6 +135,14 @@ int main(int argc, char* argv[]) {
 
    process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
 
    process_reads(reads_fs,&allGenes,numGenes,out_fs);
 
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      if(allGenes[g_idx] != 0)
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
+
+   }
+   free(allGenes);
+
    status = fclose(reads_fs);
    status = fclose(out_fs);
    if(status != 0)
    status = fclose(reads_fs);
    status = fclose(out_fs);
    if(status != 0)
@@ -231,8 +242,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
       if (readCtr != 0 && readCtr % 1000000 == 0)
          printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
-      status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+      status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
          goto next_read;
       if (status < 12) {
          skippedLinesCounter++;
          goto next_read;
@@ -249,6 +259,11 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
          if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
 
          if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+
+            if (uov != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+            uov = dov = 0;
+
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
             gene_idx++;
             exon_idx = 1;
             currentGene = (*allGenes)[gene_idx];
@@ -257,7 +272,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
                gene_idx++;
             }
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
                gene_idx++;
             }
             //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            uov = dov = 0;
             continue;
          }
 
             continue;
          }
 
@@ -294,13 +308,15 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          prev_exon_stop = currentGene->exon_stops[exon_idx-1];
          cur_exon_start = currentGene->exon_starts[exon_idx];
 
          prev_exon_stop = currentGene->exon_stops[exon_idx-1];
          cur_exon_start = currentGene->exon_starts[exon_idx];
 
+         //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
+
          if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
 
             if (uov != 0 && dov != 0)
                combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
          if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
 
             if (uov != 0 && dov != 0)
                combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+            uov = dov = 0;
 
             exon_idx++;
 
             exon_idx++;
-            uov = dov = 0;
             goto exon_label;
          }
 
             goto exon_label;
          }
 
@@ -353,7 +369,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    if(status != 0)
       perror("munmap");
 
    if(status != 0)
       perror("munmap");
 
-   //free(current_line);
+   free(current_line);
    free(seq);
    free(prb);
    free(cal_prb);
    free(seq);
    free(prb);
    free(cal_prb);
@@ -362,12 +378,11 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
 
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
 
-   //printf("up/down size is %d/%d\n",up_size,down_size);
+   printf("up/down size is %d/%d\n",up_size,down_size);
 
 
-   /*
    int up_idx, down_idx, status;
    int up_idx, down_idx, status;
-   char* upstream_line = malloc(sizeof(char)*256);
-   char* downstream_line = malloc(sizeof(char)*256);
+   char* upstream_line = malloc(sizeof(char)*512);
+   char* downstream_line = malloc(sizeof(char)*512);
    int p_start, p_stop;
 
    int buffer_size= 64;
    int p_start, p_stop;
 
    int buffer_size= 64;
@@ -375,7 +390,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int up_chr        = 0;
    int up_pos        = 0;
    char* up_seq      = malloc(sizeof(char)*buffer_size);
    int up_chr        = 0;
    int up_pos        = 0;
    char* up_seq      = malloc(sizeof(char)*buffer_size);
-   int up_id         = 0;
+   unsigned long up_id         = 0;
    char up_strand    = ' ';
    int up_mismatch   = 0;
    int up_occurrence = 0;
    char up_strand    = ' ';
    int up_mismatch   = 0;
    int up_occurrence = 0;
@@ -388,7 +403,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int down_chr        = 0;
    int down_pos        = 0;
    char* down_seq      = malloc(sizeof(char)*buffer_size);
    int down_chr        = 0;
    int down_pos        = 0;
    char* down_seq      = malloc(sizeof(char)*buffer_size);
-   int down_id         = 0;
+   unsigned long down_id         = 0;
    char down_strand    = ' ';
    int down_mismatch   = 0;
    int down_occurrence = 0;
    char down_strand    = ' ';
    int down_mismatch   = 0;
    int down_occurrence = 0;
@@ -410,85 +425,114 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int overlap;
    int fit;
 
    int overlap;
    int fit;
 
-   char* used_flag = calloc(down_size,sizeof(char));
+   char* lineBeginPtr;
+   char* lineEndPtr;
+
+   char* up_used_flag = calloc(up_size,sizeof(char));
+   char* down_used_flag = calloc(down_size,sizeof(char));
 
    for(up_idx=0;up_idx<up_size;up_idx++) {
 
    for(up_idx=0;up_idx<up_size;up_idx++) {
-      strncpy(upstream_line,upstream[up_idx],256);
-      status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
-      &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
-      &up_cut,up_prb,up_cal_prb,up_chastity);
+      if( up_used_flag[up_idx] == 1)
+         continue;
+
+      lineBeginPtr = upstream[up_idx];
+      lineEndPtr = lineBeginPtr;
+      while (*(char*)lineEndPtr != '\n') lineEndPtr++;
+      lineEndPtr++;
+      strncpy(upstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+      upstream_line[lineEndPtr-lineBeginPtr] = '\0';
+
+      status = sscanf(upstream_line,line_format,&up_chr,&up_pos,up_seq,&up_id,
+      &up_strand,&up_mismatch,&up_occurrence,&up_sz,&up_cut,up_prb,up_cal_prb,up_chastity);
 
       remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
 
       overlap = exon_stop - up_pos;
 
       for(down_idx=0;down_idx<down_size;down_idx++) {
 
       remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
 
       overlap = exon_stop - up_pos;
 
       for(down_idx=0;down_idx<down_size;down_idx++) {
-         if( used_flag[down_idx] == 1)
+         if( down_used_flag[down_idx] == 1)
             continue;
 
             continue;
 
-         strncpy(downstream_line,downstream[down_idx],256);
-         status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+         lineBeginPtr = downstream[down_idx];
+         lineEndPtr = lineBeginPtr;
+         while (*(char*)lineEndPtr != '\n') lineEndPtr++;
+         lineEndPtr++;
+         strncpy(downstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+         downstream_line[lineEndPtr-lineBeginPtr] = '\0';
+
+         status = sscanf(downstream_line,line_format,
          &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
          &down_cut,down_prb,down_cal_prb,down_chastity);
 
          &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
          &down_cut,down_prb,down_cal_prb,down_chastity);
 
-         remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
-
          if(up_strand != down_strand)
             continue;
 
          if(up_strand != down_strand)
             continue;
 
-         new_seq[0] = '\0';
-         new_prb[0] = '\0';
-         new_cal_prb[0] = '\0';
-         new_chastity[0] = '\0';
-         int splitpos = 0;
+         remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
          
          
-         fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
-         if (fit == -1)
-            continue;
-
-         if (! (fit  < overlap ))
-            continue;
-
-         new_chr     = up_chr;
-         new_strand  = up_strand;
-         splitpos = overlap;
-
-         p_start = up_pos;
-         p_stop =  exon_start+(read_size-overlap);
-
-         strncat(new_seq,new_up_seq,overlap);
-         strncat(new_prb,up_prb,overlap);
-         strncat(new_cal_prb,up_cal_prb,overlap);
-         strncat(new_chastity,up_chastity,overlap);
-
-         //strncat(new_seq,new_down_seq+fit,read_size-overlap);
-         //strncat(new_prb,down_prb+fit,read_size-overlap);
-         //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
-         //strncat(new_chastity,down_chastity+fit,read_size-overlap);
+         printf(line_format,
+         up_chr,up_pos,up_seq,up_id,up_strand,up_mismatch,up_occurrence,up_sz,
+         up_cut,up_prb,up_cal_prb,up_chastity);
+
+         printf(line_format,
+         down_chr,down_pos,down_seq,down_id,down_strand,down_mismatch,down_occurrence,down_sz,
+         down_cut,down_prb,down_cal_prb,down_chastity);
+
+         //new_seq[0] = '\0';
+         //new_prb[0] = '\0';
+         //new_cal_prb[0] = '\0';
+         //new_chastity[0] = '\0';
+         //int splitpos = 0;
          
          
-         int offset = exon_start-down_pos;
-         strncat(new_seq,new_down_seq+offset,read_size-overlap);
-         strncat(new_prb,down_prb+offset,read_size-overlap);
-         strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
-         strncat(new_chastity,down_chastity+offset,read_size-overlap);
-
-         // The four last integers specify the positions of 
-         // p_start : the position in the dna where the first read starts
-         // exons_stop  : the position in the dna where the first exons ends
-         // exons_start : the position in the dna where the second exons starts
-         // p_stop  : the position in the dna where the (truncated) second read ends
-         fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
-         read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
-         read_nr++;
-
-         combined_reads++;
-         used_flag[down_idx] = 1;
+         //
+         //fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
+         //if (fit == -1)
+         //   continue;
+
+         //if (! (fit  < overlap ))
+         //   continue;
+
+         //new_chr     = up_chr;
+         //new_strand  = up_strand;
+         //splitpos = overlap;
+
+         //p_start = up_pos;
+         //p_stop =  exon_start+(read_size-overlap);
+
+         //strncat(new_seq,new_up_seq,overlap);
+         //strncat(new_prb,up_prb,overlap);
+         //strncat(new_cal_prb,up_cal_prb,overlap);
+         //strncat(new_chastity,up_chastity,overlap);
+
+         ////strncat(new_seq,new_down_seq+fit,read_size-overlap);
+         ////strncat(new_prb,down_prb+fit,read_size-overlap);
+         ////strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
+         ////strncat(new_chastity,down_chastity+fit,read_size-overlap);
+         //
+         //int offset = exon_start-down_pos;
+         //strncat(new_seq,new_down_seq+offset,read_size-overlap);
+         //strncat(new_prb,down_prb+offset,read_size-overlap);
+         //strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
+         //strncat(new_chastity,down_chastity+offset,read_size-overlap);
+
+         //// The four last integers specify the positions of 
+         //// p_start : the position in the dna where the first read starts
+         //// exons_stop  : the position in the dna where the first exons ends
+         //// exons_start : the position in the dna where the second exons starts
+         //// p_stop  : the position in the dna where the (truncated) second read ends
+         //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+         //read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+         //read_nr++;
+
+         //combined_reads++;
+         //used_flag[down_idx] = 1;
       }
    }
 
    free(upstream_line);
    free(downstream_line);
       }
    }
 
    free(upstream_line);
    free(downstream_line);
-   free(used_flag);
+
+   free(up_used_flag);
+   free(down_used_flag);
   
    free(new_up_seq);
    free(new_down_seq); 
   
    free(new_up_seq);
    free(new_down_seq); 
@@ -507,7 +551,6 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    free(new_prb);
    free(new_cal_prb);
    free(new_chastity);
    free(new_prb);
    free(new_cal_prb);
    free(new_chastity);
-   */
 }
 
 /*
 }
 
 /*