+ some optimization
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 2 Jul 2008 17:34:10 +0000 (17:34 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 2 Jul 2008 17:34:10 +0000 (17:34 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9837 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile
tools/data_tools/filterReads.c
tools/data_tools/parser.c

index 9fbe627..a956933 100644 (file)
@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS=-O3 -ggdb -Wall -Wshadow -lm
+CFLAGS=-ggdb -Wall -Wshadow -lm
 
 SRCS= parser.c\
        sort_genes.c\
index c0431aa..ae84946 100644 (file)
@@ -15,6 +15,7 @@
 #include <sys/stat.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <assert.h>
 #include <string.h>
 #include <unistd.h>
 #include <math.h>
@@ -41,7 +42,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
-int remove_ambiguities(char * old_seq, char* new_seq);
 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
@@ -54,8 +54,50 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 //#define _FDEBUG_ 0
 //#define DEBUG_READ 38603
 
+
+//
+/*
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ */
+
+#ifdef _FDEBUG_
+   if(read_nr == DEBUG_READ) {
+      printf("read nr %lu",read_nr);
+      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+      printf("u/d range: %d %d\n",up_range,down_range);
+      printf("u/d size: %d %d\n",u_size,d_size);
+      printf("u/d off: %d %d\n",u_off,d_off);
+      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
+
+      printf("******************\n");
+
+      printf("%s\n",up_read->seq);
+      //printf("%s\n",new_up_seq);
+
+      printf("******************\n");
+
+      printf("%s\n",down_read->seq);
+      //printf("%s\n",new_down_seq);
+
+      printf("******************\n");
+      printf("%s\n",new_seq);
+      printf("%s\n",new_prb);
+      printf("%s\n",new_cal_prb);
+      printf("%s\n",new_chastity);
+   }
+#endif // _FDEBUG_
+
 int combined_reads = 0;
 
+char current_strand;
+
+/*
+ *
+ *
+ */
+
 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
    int status;
 
@@ -101,7 +143,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    char* current_line = malloc(sizeof(char)*512);
    memset(current_line,0,512);
 
-   int SIZE = 500;
+   int SIZE = 5000;
    // initialize boundary arrays
    Read** upstream_overlap  = malloc(sizeof(Read*)*SIZE);
    Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
@@ -128,175 +170,214 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int skippedReadCtr = 0;
    int uselessReadCtr = 0;
    int exonicReadCtr = 0;
-   int endPrevCtr = 0;
-   int prevOverlapCtr = 0;
-   int currentStartCtr = 0;
+
    int currentOverlapCtr = 0;
+   int previousOverlapCtr = 0;
+
    int multioccurReadCtr = 0;
 
    Read* currentRead;
    int up_idx, down_idx;
 
    int readCtr = 0;
+   int wrong_strand_ctr = 0;
+   int read_within_gene_ctr = 0;
+   int read_outside_gene_ctr = 0;
    int read_start, read_stop;
    // start of the parsing loop 
-   while(1) {
-      if (gene_idx == numGenes || strcmp(current_line,"") == 0)
-         break;
-
-      gene_id = currentGene->id;
-
-      if (readCtr != 0 && readCtr % 1000000 == 0)
-         printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+  
+   //static char *all_strands = "+-";
+   //static char *all_strands = "DP";
+   //int strand_idx;
+   //for(strand_idx=0;strand_idx<2;strand_idx++) {
+   //   char current_strand = all_strands[strand_idx];
+
+      while(1) {
+         if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+            break;
+
+         gene_id = currentGene->id;
+
+         if (readCtr != 0 && readCtr % 1000000 == 0) {
+            printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+            printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+            printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+            printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+            printf("\t%d useless reads\n",uselessReadCtr);
+            printf("\t%d skipped reads\n",skippedReadCtr);
+            printf("\t%d multioccurring\n",multioccurReadCtr);
+            printf("\t%d wrong_strand\n",wrong_strand_ctr);
+            printf("%d within gene\n",read_within_gene_ctr);
+            printf("%d outside gene\n",read_outside_gene_ctr);
+            printf("%d reads were totally exonic\n",exonicReadCtr);
+            printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+            printf("%d reads where newly combined from two original reads\n",combined_reads);
+            printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+         }
 
-      // pos of the reads is one-based
-      status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
-      if (status < 12) {
-         skippedLinesCounter++;
-         goto next_read;
-      }
+         // pos of the reads is one-based
+         status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+         if (status < 12) {
+            skippedLinesCounter++;
+            goto next_read;
+         }
 
-      // if the read is occurring several times elsewhere then get rid of it 
-      if ( occurrence != 1 ) {
-         multioccurReadCtr++;
-         goto next_read;
-      }
+         // if the read is occurring several times elsewhere then get rid of it 
+         if ( occurrence != 1 ) {
+            multioccurReadCtr++;
+            goto next_read;
+         }
 
-      // define read start and stop positions
-      read_start = pos;
-      read_stop  = pos + read_size-1;
+         // define read start and stop positions
+         read_start = pos;
+         read_stop  = pos + read_size-1;
 
-      if( strand != 'D' )
-         goto next_read;
+         if( strand != current_strand ) {
+            wrong_strand_ctr++;
+            goto next_read;
+         }
 
-      FA(strlen(seq) >= read_size);
+         FA(strlen(seq) >= read_size);
 
-      FA(currentGene != 0);
+         FA(currentGene != 0);
 
-      if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
+         if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
+            read_within_gene_ctr++;
 
-         exon_label:
+            exon_label:
 
-         if (exon_idx == currentGene->num_exons) {
-            gene_idx++;
-            exon_idx = 1;
-            currentGene = (*allGenes)[gene_idx];
-            while( currentGene == 0 && gene_idx < numGenes) {
-               currentGene = (*allGenes)[gene_idx];
+            if (exon_idx == currentGene->num_exons) {
                gene_idx++;
+               exon_idx = 1;
+               currentGene = (*allGenes)[gene_idx];
+               while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
+                  currentGene = (*allGenes)[gene_idx];
+                  gene_idx++;
+               }
+               continue;
             }
-            continue;
-         }
 
-         prev_exon_start = currentGene->exon_starts[exon_idx-1];
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
+            prev_exon_start = currentGene->exon_starts[exon_idx-1];
+            prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+            cur_exon_start = currentGene->exon_starts[exon_idx];
 
-         //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
+            //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
 
-         if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
+            if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
 
-            if (uov != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+               if (uov != 0 && dov != 0)
+                  combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-            for(up_idx=0;up_idx<uov;up_idx++) {
-               free_read(upstream_overlap[up_idx]);
-               free(upstream_overlap[up_idx]);
-            }
+               for(up_idx=0;up_idx<uov;up_idx++) {
+                  free_read(upstream_overlap[up_idx]);
+                  free(upstream_overlap[up_idx]);
+               }
 
-            for(down_idx=0;down_idx<dov;down_idx++) {
-               free_read(downstream_overlap[down_idx]);
-               free(downstream_overlap[down_idx]);
-            }
+               for(down_idx=0;down_idx<dov;down_idx++) {
+                  free_read(downstream_overlap[down_idx]);
+                  free(downstream_overlap[down_idx]);
+               }
 
-            uov = dov = 0;
+               uov = dov = 0;
 
-            exon_idx++;
-            goto exon_label;
-         }
+               exon_idx++;
+               goto exon_label;
+            }
 
-         if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
+            if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
 
-            // output of unused i.e. unspliced reads
-            fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
-            unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
-            unfiltered_read_nr++;
-            
-            exonicReadCtr++;
-            goto next_read;
-         }
+               // output of unused i.e. unspliced reads
+               fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+               unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
+               unfiltered_read_nr++;
+               
+               exonicReadCtr++;
+               goto next_read;
+            }
 
-         if ( read_stop < prev_exon_stop ) // go to next read
-            goto next_read;
+            if ( read_stop < prev_exon_stop ) { // go to next read 
+               //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
+               //if( exon_idx > 1) {
+               //   printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
+               //   printf("---\n");
+               //}
 
-         // if this position is reached the read is somehow overlapping or
-         // exactly on the exon boundary.
-         if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
-            //printf("%s\n",current_line);
-            prevOverlapCtr++;
-            currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-            upstream_overlap[uov] = currentRead;
-            uov++;
-            goto next_read;
-         }
-      
-         if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
-            //printf("%s\n",current_line);
-            currentOverlapCtr++;
-            currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-            downstream_overlap[dov] = currentRead;
-            dov++;
-            goto next_read;
-         }
+               uselessReadCtr++;
+               goto next_read;
+            }
 
-         uselessReadCtr++;
-         goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+            // if this position is reached the read is somehow overlapping or
+            // exactly on the exon boundary.
+            if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
+               //printf("%s\n",current_line);
+               previousOverlapCtr++;
+               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+               assert (uov < SIZE);
+               upstream_overlap[uov] = currentRead;
+               uov++;
+               goto next_read;
+            }
+         
+            if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
+               //printf("%s\n",current_line);
+               currentOverlapCtr++;
+               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+               assert (dov < SIZE);
+               downstream_overlap[dov] = currentRead;
+               dov++;
+               goto next_read;
+            }
 
-      } else { // read is not within gene borders
+            uselessReadCtr++;
+            goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
 
+         } else { // read is not within gene borders
+            read_outside_gene_ctr++;
 
-         if (uov != 0 && dov != 0)
-            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+            if (uov != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-         for(up_idx=0;up_idx<uov;up_idx++) {
-            free_read(upstream_overlap[up_idx]);
-            free(upstream_overlap[up_idx]);
-         }
+            for(up_idx=0;up_idx<uov;up_idx++) {
+               free_read(upstream_overlap[up_idx]);
+               free(upstream_overlap[up_idx]);
+            }
 
-         for(down_idx=0;down_idx<dov;down_idx++) {
-            free_read(downstream_overlap[down_idx]);
-            free(downstream_overlap[down_idx]);
-         }
+            for(down_idx=0;down_idx<dov;down_idx++) {
+               free_read(downstream_overlap[down_idx]);
+               free(downstream_overlap[down_idx]);
+            }
 
-         uov = dov = 0;
+            uov = dov = 0;
 
-         if ( currentGene->stop < read_stop ) { // go to next gene
-            gene_idx++;
-            exon_idx = 1;
-            currentGene = (*allGenes)[gene_idx];
-            while( currentGene == 0 && gene_idx < numGenes) {
-               currentGene = (*allGenes)[gene_idx];
+            if ( currentGene->stop < read_stop ) { // go to next gene
                gene_idx++;
+               exon_idx = 1;
+               currentGene = (*allGenes)[gene_idx];
+               while( currentGene == 0 && gene_idx < numGenes) {
+                  currentGene = (*allGenes)[gene_idx];
+                  gene_idx++;
+               }
+               //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+               continue;
             }
-            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            continue;
-         }
 
-         if ( read_start < currentGene->start ) { // go to next read
-            skippedReadCtr++;
-            
-            next_read:
-            
-            lineBeginPtr = lineEndPtr;
-            while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
-            lineEndPtr++;
-            readCtr += 1;
-            current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-            current_line[lineEndPtr-lineBeginPtr] = '\0';
-            continue;
+            if ( read_start < currentGene->start ) { // go to next read
+               skippedReadCtr++;
+               
+               next_read:
+               
+               lineBeginPtr = lineEndPtr;
+               while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+               lineEndPtr++;
+               readCtr += 1;
+               current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+               current_line[lineEndPtr-lineBeginPtr] = '\0';
+               continue;
+            }
          }
       }
-   }
+
+   //} // end of for all strands
 
    if (uov != 0 && dov != 0)
       combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
@@ -319,10 +400,14 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
    printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
    printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+   printf("\t%d useless reads\n",uselessReadCtr);
+   printf("\t%d skipped reads\n",skippedReadCtr);
+   printf("\t%d multioccurring\n",multioccurReadCtr);
+   printf("\t%d wrong_strand\n",wrong_strand_ctr);
    printf("%d reads were totally exonic\n",exonicReadCtr);
-   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
+   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
    printf("%d reads where newly combined from two original reads\n",combined_reads);
-   printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
+   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
 
    status = munmap(reads_area,reads_filesize);
    if(status != 0)
@@ -375,13 +460,13 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
 }
 
 /*
- *
- *
+ * Now we join the candidate reads wherever possible according to the following
+ * scheme:
  *
  * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
  * p1       e1             e2       p2  
  *
- *
+ * 
  *
  *
  */
@@ -506,14 +591,31 @@ void print_read(Read* cRead) {
    cRead->chastity);
 }
 
+
+void open_file(const char* filename, const char* mode, FILE** fs) {
+   *fs = fopen(filename,mode);
+   if(*fs == NULL) {
+      printf("Error: Could not open file: %s",filename);
+      exit(EXIT_FAILURE);
+   }
+}
+
 /*
- * The mainloop 
+ * The program expects 7 arguments, namely:
+ *
+ * - The strand to be used
+ * - The filename of the gff file
+ * - The filename of the reads file
+ * - The name of the spliced reads ouput file
+ * - The name of the unspliced reads ouput file
+ * - The offset for counting new spliced reads
+ * - The offset for counting new unspliced reads
  *
  */
 
 int main(int argc, char* argv[]) {
 
-   if(argc != 7) {
+   if(argc != 8) {
       printf("%s\n",info);
       exit(EXIT_FAILURE);
    }
@@ -521,46 +623,29 @@ int main(int argc, char* argv[]) {
    int status;
    int filenameSize = 256;
    char* gff_filename = malloc(sizeof(char)*filenameSize);
-   char* reads_filename = malloc(sizeof(char)*filenameSize);
-   char* output_filename = malloc(sizeof(char)*filenameSize);
-   char* unfiltered_output_filename = malloc(sizeof(char)*filenameSize);
 
-   strncpy(gff_filename,argv[1],filenameSize);
-   strncpy(reads_filename,argv[2],filenameSize);
-   strncpy(output_filename,argv[3],filenameSize);
-   strncpy(unfiltered_output_filename,argv[4],filenameSize);
+   current_strand = argv[1][0];
+   printf("current strand is %c\n",current_strand);
+   strncpy(gff_filename,argv[2],filenameSize);
+
+   FILE *gff_fs;
+   FILE *reads_fs;
+   FILE *out_fs;
+   FILE *unfiltered_out_fs;
 
-   FILE *gff_fs = fopen(gff_filename,"r");
-   FILE *reads_fs = fopen(reads_filename,"r");
-   FILE *out_fs = fopen(output_filename,"w");
-   FILE *unfiltered_out_fs = fopen(unfiltered_output_filename,"w");
+   // open file streams for all needed input/output files
+   open_file(argv[2],"r",&gff_fs);
+   open_file(argv[3],"r",&reads_fs);
+   open_file(argv[4],"w",&out_fs);
+   open_file(argv[5],"w",&unfiltered_out_fs);
 
-   read_nr = strtoul(argv[5],NULL,10);
+   read_nr = strtoul(argv[6],NULL,10);
    read_nr++;
 
-   unfiltered_read_nr = strtoul(argv[6],NULL,10);
+   unfiltered_read_nr = strtoul(argv[7],NULL,10);
    unfiltered_read_nr++;
 
-   if(gff_fs == NULL) {
-      printf("Error: Could not open file: %s",gff_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(reads_fs == NULL) {
-      printf("Error: Could not open file: %s",reads_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(out_fs == NULL) {
-      printf("Error: Could not open file: %s",output_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(unfiltered_out_fs == NULL) {
-      printf("Error: Could not open file: %s",unfiltered_output_filename);
-      exit(EXIT_FAILURE);
-   }
-
+   // allocate and load all genes and then close the gff file stream
    struct gene** allGenes;
    int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
    status = fclose(gff_fs);
@@ -603,13 +688,15 @@ int main(int argc, char* argv[]) {
    }}
    printf("Exon coverage is %f\n",(double)exon_cov/30432563);
 
+   // now that we loaded all neccessary data we start to process the reads
    process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
 
+   // free all allocated ressources
    for(g_idx=0;g_idx<numGenes;g_idx++) {
-      if(allGenes[g_idx] != 0)
+      if(allGenes[g_idx] != 0) {
          free_gene(allGenes[g_idx]);
          free(allGenes[g_idx]);
-
+      }
    }
    free(allGenes);
 
@@ -618,42 +705,5 @@ int main(int argc, char* argv[]) {
    if(status != 0)
       perror("fclose");
       
-   free(reads_filename);
-   free(output_filename);
-
    return 0;
 }
-
-/*
- * TODO:
- * - Check strand -> done simple (only if equal)
- * - check for [AC] and similar entries -> done simple (see function
- * - remove_ambiguities (exchanges [XY] by the second entry)
- */
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read nr %lu",read_nr);
-      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-      printf("u/d range: %d %d\n",up_range,down_range);
-      printf("u/d size: %d %d\n",u_size,d_size);
-      printf("u/d off: %d %d\n",u_off,d_off);
-      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
-
-      printf("******************\n");
-
-      printf("%s\n",up_read->seq);
-      //printf("%s\n",new_up_seq);
-
-      printf("******************\n");
-
-      printf("%s\n",down_read->seq);
-      //printf("%s\n",new_down_seq);
-
-      printf("******************\n");
-      printf("%s\n",new_seq);
-      printf("%s\n",new_prb);
-      printf("%s\n",new_cal_prb);
-      printf("%s\n",new_chastity);
-   }
-#endif // _FDEBUG_
index 79eddf9..6a0d4eb 100644 (file)
@@ -41,6 +41,8 @@ char* get_id(regex_t rx, const char* desc) {
       exit(EXIT_FAILURE);
    }
 
+   //printf("%s\n",desc);
+
    int start = all_matches[0].rm_so+3;
    int end = all_matches[0].rm_eo-1;
    assert( start <= end);
@@ -114,7 +116,19 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
          (*allGenes)[idx] = gene_alloc();
          (*allGenes)[idx]->start = start;
          (*allGenes)[idx]->stop = stop;
-         (*allGenes)[idx]->strand = (*strand);
+
+         //printf("strand: %s %d\n",strand,strcmp(strand,"+"));
+
+         if (strcmp(strand,"+") == 0) {
+            (*allGenes)[idx]->strand = 'D';
+         } else {
+            if (strcmp(strand,"-") == 0)
+               (*allGenes)[idx]->strand = 'P';
+            else
+               (*allGenes)[idx]->strand = 'z';
+         }
+         assert( (*allGenes)[idx]->strand != 'z' );
+
          (*allGenes)[idx]->id = get_id(rx,desc);
          //printf("gene start/stop: %d/%d\n",start,stop);
          continue;
@@ -150,4 +164,3 @@ int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
 
    return numGenes;
 }
-