+ changed filter criteria
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 28 Dec 2007 14:36:15 +0000 (14:36 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 28 Dec 2007 14:36:15 +0000 (14:36 +0000)
+ added some counters to infer why certain reads are excluded
+ disabled smoothness checks for now

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7216 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile
tools/data_tools/filterReads.c
tools/data_tools/parser.c
tools/data_tools/sort_genes.c [new file with mode: 0644]

index 1d4a721..2651c8e 100644 (file)
@@ -1,5 +1,6 @@
 
 SRCS= parser.c\
+       sort_genes.c\
        datastructures.c
 
 OBJS = $(SRCS:%.cpp=%.o)
index 49a0bf0..436168c 100644 (file)
 #include "common.h"
 #include "datastructures.h"
 
+int compare_gene_struct(struct gene* a, struct gene* b) {
+   return a->stop - b->start;
+}
+
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 
+void sort_genes(struct gene*** allGenes, int numGenes);
+
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
@@ -76,7 +82,22 @@ int main(int argc, char* argv[]) {
 
    printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
+   // some entries in the gff files are not in a sorted order
+   //printf("Sorting genes...\n");
+   //sort_genes(&allGenes,numGenes);
+   //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
+   ///printf("Genes were sorted!\n");
+   
+   int gidx, eidx;
+   int exon_cov = 0;
+   for(gidx=0;gidx<numGenes;gidx++) {
+      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
+         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
+   }}
+   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
+
    process_reads(reads_fs,&allGenes,numGenes,out_fs);
+
    status = fclose(reads_fs);
    status = fclose(out_fs);
    if(status != 0)
@@ -147,6 +168,15 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 
    char* disamb_seq = malloc(sizeof(char)*read_size);
 
+   int skippedReadCtr = 0;
+   int uselessReadCtr = 0;
+   int exonicReadCtr = 0;
+   int endPrevCtr = 0;
+   int prevOverlapCtr = 0;
+   int currentStartCtr = 0;
+   int currentOverlapCtr = 0;
+   int multioccurReadCtr = 0;
+
    int readCtr = 0;
    // start of the parsing loop 
    while(1) {
@@ -154,7 +184,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          break;
 
       if (readCtr != 0 && readCtr % 1000000 == 0)
-         printf("Processed %d reads. Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
+         printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
 
       //if (gene_idx >= 1833)
       //   printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
@@ -165,15 +195,19 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
       &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
       if (status < 12) {
          skippedLinesCounter++;
+         goto next_read;
       }
       //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
       // if the read is occurring several times elsewhere then get rid of it 
-      if(occurrence != 1) {
-         while (*(char*)linePtr != '\n') linePtr++;
-         linePtr++;
-         readCtr += 1;
-         current_line = strncpy(current_line,linePtr,256);
-         continue;
+      if(!(occurrence >= 1 && occurrence <= 25)) {
+      //if ( occurrence != 1 ) {
+         multioccurReadCtr++;
+         goto next_read;
+         //while (*(char*)linePtr != '\n') linePtr++;
+         //linePtr++;
+         //readCtr += 1;
+         //current_line = strncpy(current_line,linePtr,256);
+         //continue;
       }
 
       if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
@@ -188,6 +222,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          }
 
          if ( pos < currentGene->start ) { // go to next read
+            skippedReadCtr++;
+            
             next_read:
 
             while (*(char*)linePtr != '\n') linePtr++;
@@ -228,6 +264,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          }
 
          if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
+            exonicReadCtr++;
             remove_ambiguities(seq,strlen(seq),disamb_seq);
             fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
             goto next_read;
@@ -239,29 +276,34 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          // if this position is reached the read is somehow overlapping or
          // exactly on the exon boundary. now determine where exactly:
          if (pos + (read_size-1) == prev_exon_stop) {  // read ends at previous exon end
+            endPrevCtr++;
             upstream_end[ue] = linePtr;
             ue++;
             goto next_read;
          }
 
          if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
+            prevOverlapCtr++;
             upstream_overlap[uo] = linePtr;
             uo++;
             goto next_read;
          }
 
          if ( pos == cur_exon_start ) { // read starts at current exon start
+            currentStartCtr++;
             downstream_start[ds] = linePtr;
             ds++;
             goto next_read;
          }
 
          if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
+            currentOverlapCtr++;
             downstream_overlap[dov] = linePtr;
             dov++;
             goto next_read;
          }
 
+         uselessReadCtr++;
          goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
       }
    }
@@ -275,6 +317,12 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(downstream_overlap);
 
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+   printf("%d reads were totally exonic\n",exonicReadCtr);
+   printf("%d reads end at prev. exon. %d reads overlap with prev. exon.\n", endPrevCtr, prevOverlapCtr);
+   printf("%d reads start at current exon. %d reads overlap with current exon.\n",currentStartCtr,currentOverlapCtr);
+   printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
 
    status = munmap(reads_area,reads_filesize);
    if(status != 0)
@@ -288,6 +336,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 }
 
 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
+   //printf("up/down size is %d/%d\n",up_size,down_size);
+
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
@@ -359,42 +409,43 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
       if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
          overlap = exon_start - down_pos;
          
-         fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
-         if (fit != 1)
-            goto end;
+         //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
+         //if (fit != 1)
+         //   goto end;
 
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,new_up_seq+(36-overlap),overlap);
-         strncat(new_prb,up_prb+(36-overlap),overlap);
-         strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
-         strncat(new_chastity,up_chastity+(36-overlap),overlap);
+         strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
+         strncat(new_prb,up_prb+(read_size-overlap),overlap);
+         strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
+         strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
 
-         strncat(new_seq,new_down_seq+overlap,36-overlap);
-         strncat(new_prb,down_prb+overlap,36-overlap);
-         strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
-         strncat(new_chastity,down_chastity+overlap,36-overlap);
+         strncat(new_seq,new_down_seq+overlap,read_size-overlap);
+         strncat(new_prb,down_prb+overlap,read_size-overlap);
+         strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
+         strncat(new_chastity,down_chastity+overlap,read_size-overlap);
 
          //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
 
       } // merge with read which is upstream overlapping
       
       if (down_pos == exon_start) {
-         overlap = up_pos+36 - exon_stop;
+         overlap = up_pos+read_size - exon_stop;
          //printf("overlap is %d\n",overlap);
          //printf("pos are: %d %d\n",up_pos,down_pos);
-         fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
-         if (fit != 1)
-            goto end;
+         
+         //fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
+         //if (fit != 1)
+         //   goto end;
 
          new_chr     = up_chr;
          new_strand  = up_strand;
 
-         strncat(new_seq,new_up_seq,(36-overlap));
-         strncat(new_prb,up_prb,(36-overlap));
-         strncat(new_cal_prb,up_cal_prb,(36-overlap));
-         strncat(new_chastity,up_chastity,(36-overlap));
+         strncat(new_seq,new_up_seq,(read_size-overlap));
+         strncat(new_prb,up_prb,(read_size-overlap));
+         strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
+         strncat(new_chastity,up_chastity,(read_size-overlap));
 
          strncat(new_seq,new_down_seq,overlap);
          strncat(new_prb,down_prb,overlap);
index 7b7b431..2e587ac 100644 (file)
@@ -32,7 +32,7 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
 
    }
    freopen(filename,"r",fid);
+
    int idx = 0;
    (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
    (*allGenes)[idx] = NULL;
diff --git a/tools/data_tools/sort_genes.c b/tools/data_tools/sort_genes.c
new file mode 100644 (file)
index 0000000..bb50c1a
--- /dev/null
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "datastructures.h"
+
+void sort_genes(struct gene*** allGenes, int numGenes) {
+
+   int gstart = (*allGenes)[0]->start;
+   int gstop = (*allGenes)[0]->stop;
+   int pre_start;
+   int pre_stop;
+
+   struct gene* tmp;
+   int round;
+   int gidx;
+
+   for(round=0;round<1;round++) {
+      printf("Next round!\n");
+
+      for(gidx=1;gidx<numGenes;gidx++) {
+         pre_start = gstart;
+         pre_stop = gstop;
+         gstart = (*allGenes)[gidx]->start;
+         gstop  = (*allGenes)[gidx]->stop;
+
+         if ( pre_stop > gstart ) {
+            printf("Swapping genes!\n");
+            printf("idx is %d\n",gidx);
+            //printf("pos are %d %d - %d %d\n",pre_start,pre_stop,gstart,gstop);
+            //(*allGenes)[gidx-1] = (*allGenes)[gidx];
+            //(*allGenes)[gidx] = tmp;
+            //printf("pos with new indices: %d %d - %d %d\n",(*allGenes)[gidx-1]->start,(*allGenes)[gidx-1]->stop,(*allGenes)[gidx]->start,(*allGenes)[gidx]->stop);
+            gstart = (*allGenes)[gidx]->start;
+            gstop = (*allGenes)[gidx]->stop;
+            (*allGenes)[gidx] = NULL;
+         }
+      }
+   }
+   printf("End of swapping elements!\n");
+}