+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / tools / data_tools / filterReads.c
index 0a6c30f..139c681 100644 (file)
-////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
 // The purpose of this program is to read a gff and a    
 // solexa reads file and create a data set used by QPalma. 
 //
+// Notes:
 //
-////////////////////////////////////////////////////////////
+// - Both the read indices and the gff gene indices are one-based
+//
+////////////////////////////////////////////////////////////////////////////////
 
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <assert.h>
 #include <string.h>
 #include <unistd.h>
 #include <math.h>
 
 #include "debug_tools.h"
+#include "join.h"
 #include "datastructures.h"
 
 #define _FILE_OFFSET_BITS == 64
 
-typedef struct tuple {
-   int first;
-   int second;
-} Tuple;
+const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
 
+const int read_size = 36;
 
-int compare_gene_struct(struct gene* a, struct gene* b) {
-   return a->stop - b->start;
-}
+const int min_overlap = 1;
+const int max_overlap = 35;
+
+unsigned long read_nr = 1;
+unsigned long unfiltered_read_nr = 1;
 
 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
 void sort_genes(struct gene*** allGenes, int numGenes);
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
-void remove_ambiguities(char * old_seq, char* new_seq);
 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
 
 static char *info = "Usage is:\n./filterReads gff reads output";
 
+void reverse_complement(char** seq, int len);
+
+char current_strand;
+
 /*
  * Some constants specifying the exact behavior of the filter
  *
  */
 
-#define _FDEBUG_ 1
-
-#define DEBUG_READ 38603
-
-const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
-
-const int read_size = 36;
-
-const int min_overlap = 6;
-const int max_overlap = 30;
-
-int read_nr = 1;
-
-int combined_reads = 0;
-
-int main(int argc, char* argv[]) {
-
-   if(argc != 5) {
-      printf("%s\n",info);
-      exit(EXIT_FAILURE);
-   }
-
-   int status;
-   int filenameSize = 256;
-   char* gff_filename = malloc(sizeof(char)*filenameSize);
-   char* reads_filename = malloc(sizeof(char)*filenameSize);
-   char* output_filename = malloc(sizeof(char)*filenameSize);
-
-
-   strncpy(gff_filename,argv[1],filenameSize);
-   strncpy(reads_filename,argv[2],filenameSize);
-   strncpy(output_filename,argv[3],filenameSize);
-
-   FILE *gff_fs = fopen(gff_filename,"r");
-   FILE *reads_fs = fopen(reads_filename,"r");
-   FILE *out_fs = fopen(output_filename,"w");
+//#define _FDEBUG_ 0
+//#define DEBUG_READ 38603
 
-   read_nr = atoi(argv[4]);
-   read_nr++;
-
-   if(gff_fs == NULL) {
-      printf("Error: Could not open file: %s",gff_filename);
-      exit(EXIT_FAILURE);
-   }
-
-   if(reads_fs == NULL) {
-      printf("Error: Could not open file: %s",reads_filename);
-      exit(EXIT_FAILURE);
-   }
+/*
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ */
 
-   if(out_fs == NULL) {
-      printf("Error: Could not open file: %s",output_filename);
-      exit(EXIT_FAILURE);
-   }
+#ifdef _FDEBUG_
+   if(read_nr == DEBUG_READ) {
+      printf("read nr %lu",read_nr);
+      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+      printf("u/d range: %d %d\n",up_range,down_range);
+      printf("u/d size: %d %d\n",u_size,d_size);
+      printf("u/d off: %d %d\n",u_off,d_off);
+      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
 
-   struct gene** allGenes;
-   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
-   status = fclose(gff_fs);
-   free(gff_filename);
-   if(status != 0)
-      printf("closing of gff filestream failed!\n");
+      printf("******************\n");
 
-   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
+      printf("%s\n",up_read->seq);
+      //printf("%s\n",new_up_seq);
 
-   // check if allGenes is sorted. if not throw away those genes that do not
-   // occur in the sorted order
-   int g_idx;
-   struct gene* currentGene = 0;
-   int nulled_genes=0;
-   int old_gene_stop = -1;
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      currentGene = allGenes[g_idx];
+      printf("******************\n");
 
-      if (! (currentGene->start < currentGene->stop))
-         printf("Invalid positions for gene %s!\n",currentGene->id);
+      printf("%s\n",down_read->seq);
+      //printf("%s\n",new_down_seq);
 
-      if (! (old_gene_stop <  currentGene->start ) ) {
-         old_gene_stop = currentGene->stop;
-         allGenes[g_idx] = 0;
-         nulled_genes++;
-         continue;
-      }
-      old_gene_stop = currentGene->stop;
+      printf("******************\n");
+      printf("%s\n",new_seq);
+      printf("%s\n",new_prb);
+      printf("%s\n",new_cal_prb);
+      printf("%s\n",new_chastity);
    }
+#endif // _FDEBUG_
 
-   printf("Found %d unordered genes.\n",nulled_genes);
-   int gidx, eidx;
-   int exon_cov = 0;
-   for(gidx=0;gidx<numGenes;gidx++) {
-      if (allGenes[gidx] == 0)
-         continue;
-
-      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
-         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
-   }}
-   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
-
-   process_reads(reads_fs,&allGenes,numGenes,out_fs);
-
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      if(allGenes[g_idx] != 0)
-         free_gene(allGenes[g_idx]);
-         free(allGenes[g_idx]);
-
-   }
-   free(allGenes);
+int combined_reads = 0;
 
-   status = fclose(reads_fs);
-   status = fclose(out_fs);
-   if(status != 0)
-      perror("fclose");
-      
-   free(reads_filename);
-   free(output_filename);
-   return 0;
-}
+/*
+ *
+ *
+ */
 
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
    int status;
 
    int buffer_size= 64;
@@ -186,7 +120,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    }
    off_t reads_filesize = reads_stat.st_size;
    printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
-   int numReads = reads_filesize / 178.0;
+   //int numReads = reads_filesize / 178.0;
 
    void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
    if (reads_area == MAP_FAILED) {
@@ -206,7 +140,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    char* current_line = malloc(sizeof(char)*512);
    memset(current_line,0,512);
 
-   int SIZE = 500;
+   int SIZE = 5000;
    // initialize boundary arrays
    Read** upstream_overlap  = malloc(sizeof(Read*)*SIZE);
    Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
@@ -233,160 +167,210 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int skippedReadCtr = 0;
    int uselessReadCtr = 0;
    int exonicReadCtr = 0;
-   int endPrevCtr = 0;
-   int prevOverlapCtr = 0;
-   int currentStartCtr = 0;
+
    int currentOverlapCtr = 0;
+   int previousOverlapCtr = 0;
+
    int multioccurReadCtr = 0;
 
    Read* currentRead;
    int up_idx, down_idx;
 
    int readCtr = 0;
+   int wrong_strand_ctr = 0;
+   int read_within_gene_ctr = 0;
+   int read_outside_gene_ctr = 0;
+   int read_start, read_stop;
    // start of the parsing loop 
-   while(1) {
-      if (gene_idx == numGenes || strcmp(current_line,"") == 0)
-         break;
-
-      gene_id = currentGene->id;
+  
+      while(1) {
+         if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+            break;
+
+         gene_id = currentGene->id;
+
+         //if (readCtr != 0 && readCtr % 1000000 == 0) {
+         //   printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+         //   printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+         //   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+         //   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+         //   printf("\t%d useless reads\n",uselessReadCtr);
+         //   printf("\t%d skipped reads\n",skippedReadCtr);
+         //   printf("\t%d multioccurring\n",multioccurReadCtr);
+         //   printf("\t%d wrong_strand\n",wrong_strand_ctr);
+         //   printf("%d within gene\n",read_within_gene_ctr);
+         //   printf("%d outside gene\n",read_outside_gene_ctr);
+         //   printf("%d reads were totally exonic\n",exonicReadCtr);
+         //   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+         //   printf("%d reads where newly combined from two original reads\n",combined_reads);
+         //   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+         //}
+
+         // pos of the reads is one-based
+         status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+         if (status < 12) {
+            skippedLinesCounter++;
+            goto next_read;
+         }
 
-      if (readCtr != 0 && readCtr % 1000000 == 0)
-         printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+         // if the read is occurring several times elsewhere then get rid of it 
+         if ( occurrence != 1 ) {
+            multioccurReadCtr++;
+            goto next_read;
+         }
 
-      status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
-      if (status < 12) {
-         skippedLinesCounter++;
-         goto next_read;
-      }
+         //printf("before rc %s\n",seq);
+         if (current_strand == 'P')
+            reverse_complement(&seq,strlen(seq));
+         //printf("after rc %s\n",seq);
 
-      // if the read is occurring several times elsewhere then get rid of it 
-      if ( occurrence != 1 ) {
-         multioccurReadCtr++;
-         goto next_read;
-      }
+         //printf("new read: %s at %d\n",seq,pos);
+   
+         // define read start and stop positions
+         read_start = pos;
+         read_stop  = pos + read_size-1;
 
-      FA(strlen(seq) >= read_size);
+         FA(strlen(seq) >= read_size);
 
-      FA(currentGene != 0);
+         FA(currentGene != 0);
 
-      if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is within gene borders
+         if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
+            read_within_gene_ctr++;
 
-         exon_label:
+            exon_label:
 
-         if (exon_idx == currentGene->num_exons) {
-            gene_idx++;
-            exon_idx = 1;
-            currentGene = (*allGenes)[gene_idx];
-            while( currentGene == 0 && gene_idx < numGenes) {
-               currentGene = (*allGenes)[gene_idx];
+            if (exon_idx == currentGene->num_exons) {
                gene_idx++;
+               exon_idx = 1;
+               currentGene = (*allGenes)[gene_idx];
+               while( currentGene == 0 && gene_idx < numGenes) {
+                  currentGene = (*allGenes)[gene_idx];
+                  gene_idx++;
+               }
+               continue;
             }
-            continue;
-         }
 
-         prev_exon_start = currentGene->exon_starts[exon_idx-1];
-         prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-         cur_exon_start = currentGene->exon_starts[exon_idx];
+            prev_exon_start = currentGene->exon_starts[exon_idx-1];
+            prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+            cur_exon_start = currentGene->exon_starts[exon_idx];
 
-         //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
+            //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
 
-         if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
+            if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
 
-            if (uov != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+               if (uov != 0 && dov != 0)
+                  combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-            for(up_idx=0;up_idx<uov;up_idx++) {
-               free_read(upstream_overlap[up_idx]);
-               free(upstream_overlap[up_idx]);
-            }
+               for(up_idx=0;up_idx<uov;up_idx++) {
+                  free_read(upstream_overlap[up_idx]);
+                  free(upstream_overlap[up_idx]);
+               }
 
-            for(down_idx=0;down_idx<dov;down_idx++) {
-               free_read(downstream_overlap[down_idx]);
-               free(downstream_overlap[down_idx]);
+               for(down_idx=0;down_idx<dov;down_idx++) {
+                  free_read(downstream_overlap[down_idx]);
+                  free(downstream_overlap[down_idx]);
+               }
+
+               uov = dov = 0;
+
+               exon_idx++;
+               goto exon_label;
             }
 
-            uov = dov = 0;
+            if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
 
-            exon_idx++;
-            goto exon_label;
-         }
+               // output of unused i.e. unspliced reads
+               fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+               unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
+               unfiltered_read_nr++;
+               
+               exonicReadCtr++;
+               goto next_read;
+            }
 
-         if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
-            exonicReadCtr++;
-            goto next_read;
-         }
+            if ( read_stop < prev_exon_stop ) { // go to next read 
+               //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
+               //if( exon_idx > 1) {
+               //   printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
+               //   printf("---\n");
+               //}
 
-         if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
-            goto next_read;
+               uselessReadCtr++;
+               goto next_read;
+            }
 
-         // if this position is reached the read is somehow overlapping or
-         // exactly on the exon boundary.
-         if ( (prev_exon_stop - pos + 1) >= min_overlap && (prev_exon_stop - pos + 1) <= max_overlap ) { // read overlaps with previous exon boundary
-            //printf("%s\n",current_line);
-            prevOverlapCtr++;
-            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-            upstream_overlap[uov] = currentRead;
-            uov++;
-            goto next_read;
-         }
-      
-         int end_pos = pos+read_size-1;
-         if ( (end_pos - cur_exon_start + 1) >= min_overlap && (end_pos - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
-            //printf("%s\n",current_line);
-            currentOverlapCtr++;
-            currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-            downstream_overlap[dov] = currentRead;
-            dov++;
-            goto next_read;
-         }
+            // if this position is reached the read is somehow overlapping or
+            // exactly on the exon boundary.
+            if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
+               //printf("%s\n",current_line);
+               previousOverlapCtr++;
+               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+               assert (uov < SIZE);
+               upstream_overlap[uov] = currentRead;
+               uov++;
+               goto next_read;
+            }
+         
+            if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
+               //printf("%s\n",current_line);
+               currentOverlapCtr++;
+               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+               assert (dov < SIZE);
+               downstream_overlap[dov] = currentRead;
+               dov++;
+               goto next_read;
+            }
 
-         uselessReadCtr++;
-         goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+            uselessReadCtr++;
+            goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
 
-      } else { // read is not within gene borders
+         } else { // read is not within gene borders
+            read_outside_gene_ctr++;
 
-         if (uov != 0 && dov != 0)
-            combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+            if (uov != 0 && dov != 0)
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
 
-         for(up_idx=0;up_idx<uov;up_idx++) {
-            free_read(upstream_overlap[up_idx]);
-            free(upstream_overlap[up_idx]);
-         }
+            for(up_idx=0;up_idx<uov;up_idx++) {
+               free_read(upstream_overlap[up_idx]);
+               free(upstream_overlap[up_idx]);
+            }
 
-         for(down_idx=0;down_idx<dov;down_idx++) {
-            free_read(downstream_overlap[down_idx]);
-            free(downstream_overlap[down_idx]);
-         }
+            for(down_idx=0;down_idx<dov;down_idx++) {
+               free_read(downstream_overlap[down_idx]);
+               free(downstream_overlap[down_idx]);
+            }
 
-         uov = dov = 0;
+            uov = dov = 0;
 
-         if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
-            gene_idx++;
-            exon_idx = 1;
-            currentGene = (*allGenes)[gene_idx];
-            while( currentGene == 0 && gene_idx < numGenes) {
-               currentGene = (*allGenes)[gene_idx];
+            if ( currentGene->stop < read_stop ) { // go to next gene
                gene_idx++;
+               exon_idx = 1;
+               currentGene = (*allGenes)[gene_idx];
+               while( currentGene == 0 && gene_idx < numGenes) {
+                  currentGene = (*allGenes)[gene_idx];
+                  gene_idx++;
+               }
+               //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+               continue;
             }
-            //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-            continue;
-         }
 
-         if ( pos < currentGene->start ) { // go to next read
-            skippedReadCtr++;
-            
-            next_read:
-            
-            lineBeginPtr = lineEndPtr;
-            while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
-            lineEndPtr++;
-            readCtr += 1;
-            current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-            current_line[lineEndPtr-lineBeginPtr] = '\0';
-            continue;
+            if ( read_start < currentGene->start ) { // go to next read
+               skippedReadCtr++;
+               
+               next_read:
+               
+               lineBeginPtr = lineEndPtr;
+               while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+               lineEndPtr++;
+               readCtr += 1;
+               current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+               current_line[lineEndPtr-lineBeginPtr] = '\0';
+               continue;
+            }
          }
       }
-   }
+
+   //} // end of for all strands
 
    if (uov != 0 && dov != 0)
       combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
@@ -409,10 +393,14 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
    printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
    printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+   printf("\t%d useless reads\n",uselessReadCtr);
+   printf("\t%d skipped reads\n",skippedReadCtr);
+   printf("\t%d multioccurring\n",multioccurReadCtr);
+   printf("\t%d wrong_strand\n",wrong_strand_ctr);
    printf("%d reads were totally exonic\n",exonicReadCtr);
-   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
+   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
    printf("%d reads where newly combined from two original reads\n",combined_reads);
-   printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
+   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
 
    status = munmap(reads_area,reads_filesize);
    if(status != 0)
@@ -425,10 +413,8 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    free(chastity);
 }
 
-void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
-
-   //printf("up/down size is %d/%d\n",up_size,down_size);
 
+void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
    int up_idx, down_idx, success;
 
    char* up_used_flag = calloc(up_size,sizeof(char));
@@ -459,129 +445,151 @@ void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, R
             up_used_flag[up_idx] = 1;
             down_used_flag[down_idx] = 1;
          }
-
       }
    }
 
+   exit(0);
    free(up_used_flag);
    free(down_used_flag);
 }
 
+/*
+ * Now we join the candidate reads wherever possible according to the following
+ * scheme:
+ *
+ * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
+ * p1       e1             e2       p2  
+ *
+ * 
+ *
+ *
+ */
+
 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
    // range of possible sequence length on exon side
-   int up_range   = exon_stop - up_read->pos+1;
-   int down_range = down_read->pos+(read_size-1) - exon_start+1;
+   int up_read_start = up_read->pos;
+   //int up_read_stop  = up_read->pos+read_size-1;
+
+   int down_read_start = down_read->pos;
+   int down_read_stop  = down_read->pos+read_size-1;
+
+   int up_range   = exon_stop - up_read_start + 1;
+   int down_range = down_read_stop - exon_start + 1;
    int retval;
 
-   int u_off, d_off, u_size, d_size;
+   int u_size, d_size;
    u_size = d_size = -1;
 
-   //printf("possible range %d %d\n",up_range,down_range);
-
    if(up_range+down_range < read_size)
       return 0;
 
-   int half_read = read_size / 2;
-
-   // both overlap more than a half of the new read
-   if(up_range >= half_read &&  down_range >= half_read) {
-
-      u_size = half_read;
-      d_size = half_read;
-
+   if (read_nr % 2 != 0) {
+      d_size = down_range;
+      u_size = read_size - d_size;
    } else {
-
-      if(up_range < down_range) {
-         u_size = up_range;
-         d_size = read_size - u_size;
-      }
-
-      if(up_range > down_range) {
-         d_size = down_range;
-         u_size = read_size - d_size;
-      }
+      u_size = up_range;
+      d_size = read_size - u_size;
    }
 
-   int p_start  = exon_stop - u_size + 1;
+   if( u_size > up_range || d_size > down_range)
+      return 0;
+
+   int p_start  = exon_stop  - u_size + 1;
    int p_stop   = exon_start + d_size - 1;
 
-   u_off = p_start - up_read->pos;
-   d_off = exon_start - down_read->pos;
+   int u_off = p_start - up_read_start;
+   int d_off = exon_start - down_read_start;
 
-   FA(u_size != -1);
-   FA(d_size != -1);
-   FA(u_size + d_size == read_size);
+   FA( u_off >= 0 && d_off >= 0 );
+   FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
+   FA( u_size + d_size == read_size );
 
-   int buf_size = 4*read_size;
-   char* new_up_seq     = malloc(sizeof(char)*buf_size);
-   char* new_down_seq   = malloc(sizeof(char)*buf_size); 
-   char* new_seq        = malloc(sizeof(char)*buf_size);
+   // seems reasonable up to here
 
-   memset(new_up_seq,0,sizeof(char)*buf_size);
-   memset(new_down_seq,0,sizeof(char)*buf_size);
-   memset(new_seq,0,sizeof(char)*buf_size);
+   int buf_size = 4*read_size;
+   char* new_seq = malloc(sizeof(char)*buf_size);
+   memset(new_seq,'z',sizeof(char)*buf_size);
+
+   if ( current_strand == 'P' ) {
+      printf("flipping read sequences...\n");
+      printf("%s %s\n",up_read->seq,down_read->seq);
+      char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
+      strncpy(tmp,up_read->seq,strlen(up_read->seq));
+      tmp[strlen(up_read->seq)]='\0';
+
+      realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
+      strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
+      up_read->seq[strlen(down_read->seq)] = '\0';
+
+      if (up_read->seq == NULL)
+         perror("realloc\n");
+
+      realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
+      strncpy(down_read->seq,tmp,strlen(tmp));
+      down_read->seq[strlen(tmp)] = '\0';
+      
+      if (down_read->seq == NULL)
+         perror("realloc\n");
+      
+      free(tmp);
+      printf("flipping done...\n");
+      printf("%s %s\n",up_read->seq,down_read->seq);
+   }
    
-   //remove_ambiguities(up_read->seq,new_up_seq);
-   //remove_ambiguities(down_read->seq,new_down_seq);
-   //
-   strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
-   strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
+   printf("start joining...\n");
+   Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
+   printf("end of joining...\n");
+
+   printf("new_seq is %s\n",new_seq);
+
 
-   Tuple jinfo = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
    int cut_pos = jinfo.first;
    int additional_pos = jinfo.second;
+   printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
 
    buf_size = read_size+1+additional_pos;
 
+   printf("allocating quality arrays (size=%d)...\n",buf_size);
    char* new_prb        = malloc(sizeof(char)*buf_size);
    char* new_cal_prb    = malloc(sizeof(char)*buf_size);
    char* new_chastity   = malloc(sizeof(char)*buf_size);
 
-   if( additional_pos > (down_range - d_size) ) {
+   if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
+      perror("malloc\n");
+   
+   if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
       retval = 0;
       goto free;
    }
 
+   printf("joining qualities...\n");
    strncpy(new_prb, up_read->prb+u_off, u_size);
    strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
-   new_prb[read_size+additional_pos] = '\0';
+   new_prb[buf_size-1] = '\0';
 
    strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
    strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
-   new_cal_prb[read_size+additional_pos] = '\0';
+   new_cal_prb[buf_size-1] = '\0';
 
    strncpy(new_chastity, up_read->chastity+u_off, u_size);
    strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
-   new_chastity[read_size+additional_pos] = '\0';
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read nr %d",read_nr);
-      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-      printf("u/d range: %d %d\n",up_range,down_range);
-      printf("u/d size: %d %d\n",u_size,d_size);
-      printf("u/d off: %d %d\n",u_off,d_off);
-      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
-
-      printf("******************\n");
-
-      printf("%s\n",up_read->seq);
-      printf("%s\n",new_up_seq);
-
-      printf("******************\n");
-
-      printf("%s\n",down_read->seq);
-      printf("%s\n",new_down_seq);
-
-      printf("******************\n");
-      printf("%s\n",new_seq);
-      printf("%s\n",new_prb);
-      printf("%s\n",new_cal_prb);
-      printf("%s\n",new_chastity);
-   }
-#endif // _FDEBUG_
+   new_chastity[buf_size-1] = '\0';
+   printf("end of joining qualities...\n");
+
+   //printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
+   //printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
+   //if ( current_strand == 'P' ) {
+   //   int alpha   = read_size - u_off - 1;
+   //   int beta    = alpha - u_size ;
+   //   p_start     = up_read->pos + beta + 1;
+   //   exon_stop   = up_read->pos + alpha;
+   //   alpha   = read_size - d_off - 1;
+   //   beta    = alpha - (read_size - u_size);
+   //   exon_start  = down_read->pos + beta + 1; 
+   //   p_stop      = down_read->pos + alpha;
+   //}
 
-   int status = fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
+   int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
 
    if(status != 1) {
       retval = 0;
@@ -590,16 +598,13 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
 
    retval = status;
 
-   fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
-      read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
+   fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+      read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
 
    read_nr++;
    combined_reads++;
 
    free:
-   free(new_up_seq);
-   free(new_down_seq); 
-
    free(new_seq);
    free(new_prb);
    free(new_cal_prb);
@@ -608,319 +613,162 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    return retval;
 }
 
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
-   double current_mean_up = 0;
-   double current_mean_down = 0;
-
-   int w_size = 2;
+static char* translation = "T G   C            A      [ ]";
 
+void reverse_complement(char** seq, int len) {
    int idx;
+   char *temp = malloc(sizeof(char)*len);
+   for(idx=0;idx<len;idx++)
+      temp[idx] = translation[(*seq)[idx]-65];
 
-   //printf("prb %s\n",up_prb);
-   //printf("prb %s\n",down_prb);
-
-   for(idx=0;idx<w_size;idx++) {
-      current_mean_up += up_prb[u_off+u_size+idx]-50;
-      current_mean_up += up_prb[u_off+u_size-idx]-50;
-      current_mean_up -= up_prb[idx]-50;
+   idx=0;
+   int temp_idx=len-1;
+   while(1) {
+      if (temp[temp_idx] == ']') {
+         (*seq)[idx] = '[';
+         (*seq)[idx+1] = temp[temp_idx-2];
+         (*seq)[idx+2] = temp[temp_idx-1];
+         (*seq)[idx+3] = ']';
+         idx      += 4;
+         temp_idx -= 4;
+      } else {
+         (*seq)[idx] = temp[temp_idx];
+         idx++;
+         temp_idx--;
+      }
 
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down -= up_prb[idx]-50;
+      if (idx == len || temp_idx == -1)
+         break;
    }
+   free(temp);
+}
 
-   current_mean_up /= (2*w_size+1);
-   current_mean_down /= (2*w_size+1);
-
-   double ratio;
-   if( current_mean_up  > current_mean_down)
-      ratio = current_mean_down / current_mean_up;
-   else
-      ratio = current_mean_up / current_mean_down;
 
-   //printf("ratio is %f\n",ratio);
+void print_read(Read* cRead) {
+   printf(line_format,
+   cRead->chr, cRead->pos, cRead->seq, cRead->id,
+   cRead->strand, cRead->mismatch, cRead->occurrence,
+   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
+   cRead->chastity);
+}
 
-   if (ratio < 0.35)
-      return 0;
 
-   return 1;
+void open_file(const char* filename, const char* mode, FILE** fs) {
+   *fs = fopen(filename,mode);
+   if(*fs == NULL) {
+      printf("Error: Could not open file: %s",filename);
+      exit(EXIT_FAILURE);
+   }
 }
 
 /*
- * As we can have 3 kinds of ambiguities
+ * The program expects 7 arguments, namely:
  *
- * [AG]
- * [A-]
- * [-A]
+ * - The strand to be used
+ * - The filename of the gff file
+ * - The filename of the reads file
+ * - The name of the spliced reads ouput file
+ * - The name of the unspliced reads ouput file
+ * - The offset for counting new spliced reads
+ * - The offset for counting new unspliced reads
  *
- * we want to remove/store these ambiguities in the filtered reads
  */
 
-void remove_ambiguities(char * old_seq, char* new_seq) {
-
-   FA(0);
-
-   int idx=0;
-   int new_idx = 0;
-   int old_seq_size = strlen(old_seq);
-
-   while(idx<old_seq_size) {
-      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
-      if (old_seq[idx] == ']') {
-         idx++;
-         continue;
-      }
-
-      if (old_seq[idx] == '[') {
-
-         // we have a indel either on the dna or read sequence
-         if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
-            
-
-            while(1) { 
-               new_seq[new_idx] = old_seq[idx];
-               if(old_seq[idx] == ']')
-                  break;
-               new_idx++;
-               idx++;
-            }
-
-         } else {
-            idx += 2;
-            continue;
-         }
-      }
+int main(int argc, char* argv[]) {
 
-      new_seq[new_idx] = old_seq[idx];
-      idx++;
-      new_idx++;
+   if(argc != 8) {
+      printf("%s\n",info);
+      exit(EXIT_FAILURE);
    }
 
-   new_seq[new_idx] = '\0';
-   //printf("new_seq is %d :: %s\n",new_idx,new_seq);
-}
-
+   current_strand = argv[1][0];
+   printf("Current strand is %c\n",current_strand);
 
-Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
-   int new_idx, idx;
-   new_idx = idx = 0;
-   int elem_ctr = 0;
-   int read_del_ctr = 0;
+   int status;
+   int filenameSize = 256;
+   char* gff_filename = malloc(sizeof(char)*filenameSize);
 
-   int u_idx = 0;
-   int skipped_pos_ctr = 0;
-   while(1) {
-      if( u_idx >= read_size || skipped_pos_ctr == u_off) 
-         break;
-        
-      if(up_seq[u_idx] == '[') {
-         u_idx += 4;
-         skipped_pos_ctr++;
-      } else {
-         u_idx++;
-         skipped_pos_ctr++;
-      }
-   }
+   strncpy(gff_filename,argv[2],filenameSize);
 
-   if(read_nr == DEBUG_READ) {
-      printf("u_off is %d\n",u_off);
-      printf("u_idx is %d\n",u_idx);
-   }
-   u_off = u_idx;
+   FILE *gff_fs;
+   FILE *reads_fs;
+   FILE *out_fs;
+   FILE *unfiltered_out_fs;
 
+   // open file streams for all needed input/output files
+   open_file(argv[2],"r",&gff_fs);
+   open_file(argv[3],"r",&reads_fs);
+   open_file(argv[4],"w",&out_fs);
+   open_file(argv[5],"w",&unfiltered_out_fs);
 
-   if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
-      idx = -1;
+   read_nr = strtoul(argv[6],NULL,10);
+   read_nr++;
 
-   if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
-      idx = -2;
+   unfiltered_read_nr = strtoul(argv[7],NULL,10);
+   unfiltered_read_nr++;
 
-   if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
-      idx = -3;
+   // allocate and load all genes and then close the gff file stream
+   struct gene** allGenes;
+   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+   status = fclose(gff_fs);
+   free(gff_filename);
+   if(status != 0)
+      printf("closing of gff filestream failed!\n");
 
-   while(1) {
-      if (elem_ctr == u_size)
-         break;
+   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
 
-      if(up_seq[u_off+idx] == '[') {
+   // check if allGenes is sorted. if not throw away those genes that do not
+   // occur in the sorted order
+   int g_idx;
+   struct gene* currentGene = 0;
+   int nulled_genes=0;
+   int old_gene_stop = -1;
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      currentGene = allGenes[g_idx];
 
-         if(up_seq[u_off+idx+2] == '-') // del in read seq.
-            read_del_ctr++;
+      if (! (currentGene->start < currentGene->stop))
+         printf("Invalid positions for gene %s!\n",currentGene->id);
 
-         while(1) { 
-            new_seq[new_idx] = up_seq[u_off+idx];
-            new_idx++;
-            idx++;
-            if(up_seq[u_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
+      if (! (old_gene_stop <  currentGene->start ) ) {
+         printf("Gene %s is overlapping\n",currentGene->id);
+         old_gene_stop = currentGene->stop;
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
+         allGenes[g_idx] = 0;
+         nulled_genes++;
          continue;
       }
-
-      new_seq[new_idx] = up_seq[u_off+idx];
-      idx++;
-      new_idx++;
-      elem_ctr++;
-   }
-
-   elem_ctr = 0;
-   idx = 0;
-
-   Tuple tuple;
-   tuple.first = new_idx;
-
-   int d_idx = 0;
-   skipped_pos_ctr = 0;
-   while(1) {
-      if( d_idx >= read_size || skipped_pos_ctr == d_off) 
-         break;
-        
-      if(down_seq[d_idx] == '[') {
-         d_idx += 4;
-         skipped_pos_ctr++;
-      } else {
-         d_idx++;
-         skipped_pos_ctr++;
-      }
-   }
-
-   if(read_nr == DEBUG_READ) {
-      printf("d_off is %d\n",d_off);
-      printf("d_idx is %d\n",d_idx);
+      old_gene_stop = currentGene->stop;
    }
-   d_off = d_idx;
-   //if(read_nr == DEBUG_READ) {
-   //   printf("%s\n",new_seq);
-   //   printf("%c\n",down_seq[d_off+idx]);
-   //}
-
-   if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
-      idx = -1;
-
-   if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
-      idx = -2;
-
-   if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
-      idx = -3;
-
-   while(1) {
-      if (elem_ctr == d_size)
-         break;
-
-      if(down_seq[d_off+idx] == '[') {
 
-         if(down_seq[d_off+idx+2] == '-') // del in read seq.
-            read_del_ctr++;
-
-         while(1) { 
-            new_seq[new_idx] = down_seq[d_off+idx];
-            new_idx++;
-            idx++;
-            if(down_seq[d_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
+   printf("Found %d unordered genes.\n",nulled_genes);
+   int gidx, eidx;
+   int exon_cov = 0;
+   for(gidx=0;gidx<numGenes;gidx++) {
+      if (allGenes[gidx] == 0)
          continue;
-      }
-
-      new_seq[new_idx] = down_seq[d_off+idx];
-      idx++;
-      new_idx++;
-      elem_ctr++;
-   }
-
-   elem_ctr = 0;
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read_del_ctr %d\n",read_del_ctr);
-      printf("idx/new_idx: %d %d\n",idx,new_idx);
-      printf("next char: %c\n",down_seq[d_off+idx]);
-      printf("down_seq: %s\n",down_seq);
-   }
-#endif // _FDEBUG_
 
-   while(1) {
-      if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
-         break;
+      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
+         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
+   }}
+   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
 
-      if(down_seq[d_off+idx] == '[') {
+   // now that we loaded all neccessary data we start to process the reads
+   process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
 
-         while(1) { 
-            new_seq[new_idx] = down_seq[d_off+idx];
-            new_idx++;
-            idx++;
-            if(down_seq[d_off+idx-1] == ']')
-               break;
-         }
-         elem_ctr++;
-         continue;
+   // free all allocated ressources
+   for(g_idx=0;g_idx<numGenes;g_idx++) {
+      if(allGenes[g_idx] != 0) {
+         free_gene(allGenes[g_idx]);
+         free(allGenes[g_idx]);
       }
-
-#ifdef _FDEBUG_
-      if(read_nr == DEBUG_READ)
-         printf("current char: %c\n",down_seq[d_off+idx]);
-#endif // _FDEBUG_
-
-      new_seq[new_idx] = down_seq[d_off+idx];
-      new_idx++;
-      idx++;
-      elem_ctr++;
    }
+   free(allGenes);
 
-   new_seq[new_idx+1] = '\0';
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ)
-      printf("idx/new_idx: %d %d\n",idx,new_idx);
-#endif // _FDEBUG_
-
-   tuple.second = read_del_ctr;
-   return tuple;
-}
-
-void print_read(Read* cRead) {
-
-   printf(line_format,
-   cRead->chr, cRead->pos, cRead->seq, cRead->id,
-   cRead->strand, cRead->mismatch, cRead->occurrence,
-   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
-   cRead->chastity);
+   status = fclose(reads_fs);
+   status = fclose(out_fs);
+   if(status != 0)
+      perror("fclose");
+      
+   return 0;
 }
-
-/*
- * TODO:
- * - Check strand -> done simple (only if equal)
- * - check for [AC] and similar entries -> done simple (see function
- * - remove_ambiguities (exchanges [XY] by the second entry)
- */
-
-   //if( read_nr == 13 || read_nr == 14 )  {
-   //   printf("read nr %d",read_nr);
-   //   printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-   //   printf("u/d range: %d %d\n",up_range,down_range);
-   //   printf("u/d size: %d %d\n",u_size,d_size);
-   //   printf("u/d off: %d %d\n",u_off,d_off);
-   //   printf("******************\n");
-
-   //   printf("%s\n",up_read->seq);
-   //   printf("%s\n",down_read->seq);
-
-   //   printf("******************\n");
-
-   //   printf("%s\n",new_up_seq);
-   //   printf("%s\n",new_down_seq);
-
-   //   printf("******************\n");
-   //   printf("%s\n",new_seq);
-   //   printf("%s\n",new_prb);
-   //   printf("%s\n",new_cal_prb);
-   //   printf("%s\n",new_chastity);
-   //}
-
-   //// The four last integers specify the positions of 
-   //// p_start : the position in the dna where the first read starts
-   //// exons_stop  : the position in the dna where the first exons ends
-   //// exons_start : the position in the dna where the second exons starts
-   //// p_stop  : the position in the dna where the (truncated) second read ends