+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / tools / data_tools / filterReads.c
index ae84946..139c681 100644 (file)
@@ -2,13 +2,10 @@
 // The purpose of this program is to read a gff and a    
 // solexa reads file and create a data set used by QPalma. 
 //
-//
-//
 // Notes:
 //
 // - Both the read indices and the gff gene indices are one-based
 //
-//
 ////////////////////////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
@@ -46,6 +43,10 @@ Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq
 
 static char *info = "Usage is:\n./filterReads gff reads output";
 
+void reverse_complement(char** seq, int len);
+
+char current_strand;
+
 /*
  * Some constants specifying the exact behavior of the filter
  *
@@ -54,8 +55,6 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 //#define _FDEBUG_ 0
 //#define DEBUG_READ 38603
 
-
-//
 /*
  * TODO:
  * - Check strand -> done simple (only if equal)
@@ -91,8 +90,6 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 
 int combined_reads = 0;
 
-char current_strand;
-
 /*
  *
  *
@@ -123,7 +120,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    }
    off_t reads_filesize = reads_stat.st_size;
    printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
-   int numReads = reads_filesize / 178.0;
+   //int numReads = reads_filesize / 178.0;
 
    void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
    if (reads_area == MAP_FAILED) {
@@ -186,34 +183,28 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int read_start, read_stop;
    // start of the parsing loop 
   
-   //static char *all_strands = "+-";
-   //static char *all_strands = "DP";
-   //int strand_idx;
-   //for(strand_idx=0;strand_idx<2;strand_idx++) {
-   //   char current_strand = all_strands[strand_idx];
-
       while(1) {
          if (gene_idx == numGenes || strcmp(current_line,"") == 0)
             break;
 
          gene_id = currentGene->id;
 
-         if (readCtr != 0 && readCtr % 1000000 == 0) {
-            printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
-            printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
-            printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
-            printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
-            printf("\t%d useless reads\n",uselessReadCtr);
-            printf("\t%d skipped reads\n",skippedReadCtr);
-            printf("\t%d multioccurring\n",multioccurReadCtr);
-            printf("\t%d wrong_strand\n",wrong_strand_ctr);
-            printf("%d within gene\n",read_within_gene_ctr);
-            printf("%d outside gene\n",read_outside_gene_ctr);
-            printf("%d reads were totally exonic\n",exonicReadCtr);
-            printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
-            printf("%d reads where newly combined from two original reads\n",combined_reads);
-            printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
-         }
+         //if (readCtr != 0 && readCtr % 1000000 == 0) {
+         //   printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+         //   printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+         //   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+         //   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+         //   printf("\t%d useless reads\n",uselessReadCtr);
+         //   printf("\t%d skipped reads\n",skippedReadCtr);
+         //   printf("\t%d multioccurring\n",multioccurReadCtr);
+         //   printf("\t%d wrong_strand\n",wrong_strand_ctr);
+         //   printf("%d within gene\n",read_within_gene_ctr);
+         //   printf("%d outside gene\n",read_outside_gene_ctr);
+         //   printf("%d reads were totally exonic\n",exonicReadCtr);
+         //   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+         //   printf("%d reads where newly combined from two original reads\n",combined_reads);
+         //   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+         //}
 
          // pos of the reads is one-based
          status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
@@ -228,15 +219,17 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             goto next_read;
          }
 
+         //printf("before rc %s\n",seq);
+         if (current_strand == 'P')
+            reverse_complement(&seq,strlen(seq));
+         //printf("after rc %s\n",seq);
+
+         //printf("new read: %s at %d\n",seq,pos);
+   
          // define read start and stop positions
          read_start = pos;
          read_stop  = pos + read_size-1;
 
-         if( strand != current_strand ) {
-            wrong_strand_ctr++;
-            goto next_read;
-         }
-
          FA(strlen(seq) >= read_size);
 
          FA(currentGene != 0);
@@ -250,7 +243,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
                gene_idx++;
                exon_idx = 1;
                currentGene = (*allGenes)[gene_idx];
-               while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
+               while( currentGene == 0 && gene_idx < numGenes) {
                   currentGene = (*allGenes)[gene_idx];
                   gene_idx++;
                }
@@ -455,6 +448,7 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
       }
    }
 
+   exit(0);
    free(up_used_flag);
    free(down_used_flag);
 }
@@ -500,15 +494,13 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    if( u_size > up_range || d_size > down_range)
       return 0;
 
-   const int p_start  = exon_stop  - u_size + 1;
-   const int p_stop   = exon_start + d_size - 1;
+   int p_start  = exon_stop  - u_size + 1;
+   int p_stop   = exon_start + d_size - 1;
 
    int u_off = p_start - up_read_start;
    int d_off = exon_start - down_read_start;
 
-   FA(u_off >= 0);
-   FA(d_off >= 0);
-
+   FA( u_off >= 0 && d_off >= 0 );
    FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
    FA( u_size + d_size == read_size );
 
@@ -518,39 +510,84 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    char* new_seq = malloc(sizeof(char)*buf_size);
    memset(new_seq,'z',sizeof(char)*buf_size);
 
+   if ( current_strand == 'P' ) {
+      printf("flipping read sequences...\n");
+      printf("%s %s\n",up_read->seq,down_read->seq);
+      char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
+      strncpy(tmp,up_read->seq,strlen(up_read->seq));
+      tmp[strlen(up_read->seq)]='\0';
+
+      realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
+      strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
+      up_read->seq[strlen(down_read->seq)] = '\0';
+
+      if (up_read->seq == NULL)
+         perror("realloc\n");
+
+      realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
+      strncpy(down_read->seq,tmp,strlen(tmp));
+      down_read->seq[strlen(tmp)] = '\0';
+      
+      if (down_read->seq == NULL)
+         perror("realloc\n");
+      
+      free(tmp);
+      printf("flipping done...\n");
+      printf("%s %s\n",up_read->seq,down_read->seq);
+   }
+   
+   printf("start joining...\n");
    Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
+   printf("end of joining...\n");
+
+   printf("new_seq is %s\n",new_seq);
+
 
    int cut_pos = jinfo.first;
    int additional_pos = jinfo.second;
+   printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
 
    buf_size = read_size+1+additional_pos;
 
+   printf("allocating quality arrays (size=%d)...\n",buf_size);
    char* new_prb        = malloc(sizeof(char)*buf_size);
    char* new_cal_prb    = malloc(sizeof(char)*buf_size);
    char* new_chastity   = malloc(sizeof(char)*buf_size);
 
-   if( jinfo.first == -1 ) {
-      retval = 0;
-      goto free;
-   }
-
-   if( additional_pos > (down_range - d_size) ) {
+   if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
+      perror("malloc\n");
+   
+   if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
       retval = 0;
       goto free;
    }
 
+   printf("joining qualities...\n");
    strncpy(new_prb, up_read->prb+u_off, u_size);
    strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
-   new_prb[read_size+additional_pos] = '\0';
+   new_prb[buf_size-1] = '\0';
 
    strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
    strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
-   new_cal_prb[read_size+additional_pos] = '\0';
+   new_cal_prb[buf_size-1] = '\0';
 
    strncpy(new_chastity, up_read->chastity+u_off, u_size);
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
-   new_chastity[read_size+additional_pos] = '\0';
-
+   new_chastity[buf_size-1] = '\0';
+   printf("end of joining qualities...\n");
+
+   //printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
+   //printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
+   //if ( current_strand == 'P' ) {
+   //   int alpha   = read_size - u_off - 1;
+   //   int beta    = alpha - u_size ;
+   //   p_start     = up_read->pos + beta + 1;
+   //   exon_stop   = up_read->pos + alpha;
+   //   alpha   = read_size - d_off - 1;
+   //   beta    = alpha - (read_size - u_size);
+   //   exon_start  = down_read->pos + beta + 1; 
+   //   p_stop      = down_read->pos + alpha;
+   //}
 
    int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
 
@@ -561,12 +598,6 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
 
    retval = status;
 
-   //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
-   //   read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
-
-   //fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
-   //   read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
-
    fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
       read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
 
@@ -582,6 +613,36 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    return retval;
 }
 
+static char* translation = "T G   C            A      [ ]";
+
+void reverse_complement(char** seq, int len) {
+   int idx;
+   char *temp = malloc(sizeof(char)*len);
+   for(idx=0;idx<len;idx++)
+      temp[idx] = translation[(*seq)[idx]-65];
+
+   idx=0;
+   int temp_idx=len-1;
+   while(1) {
+      if (temp[temp_idx] == ']') {
+         (*seq)[idx] = '[';
+         (*seq)[idx+1] = temp[temp_idx-2];
+         (*seq)[idx+2] = temp[temp_idx-1];
+         (*seq)[idx+3] = ']';
+         idx      += 4;
+         temp_idx -= 4;
+      } else {
+         (*seq)[idx] = temp[temp_idx];
+         idx++;
+         temp_idx--;
+      }
+
+      if (idx == len || temp_idx == -1)
+         break;
+   }
+   free(temp);
+}
+
 
 void print_read(Read* cRead) {
    printf(line_format,
@@ -620,12 +681,13 @@ int main(int argc, char* argv[]) {
       exit(EXIT_FAILURE);
    }
 
+   current_strand = argv[1][0];
+   printf("Current strand is %c\n",current_strand);
+
    int status;
    int filenameSize = 256;
    char* gff_filename = malloc(sizeof(char)*filenameSize);
 
-   current_strand = argv[1][0];
-   printf("current strand is %c\n",current_strand);
    strncpy(gff_filename,argv[2],filenameSize);
 
    FILE *gff_fs;
@@ -668,7 +730,10 @@ int main(int argc, char* argv[]) {
          printf("Invalid positions for gene %s!\n",currentGene->id);
 
       if (! (old_gene_stop <  currentGene->start ) ) {
+         printf("Gene %s is overlapping\n",currentGene->id);
          old_gene_stop = currentGene->stop;
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
          allGenes[g_idx] = 0;
          nulled_genes++;
          continue;