#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
+#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
-int remove_ambiguities(char * old_seq, char* new_seq);
Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
static char *info = "Usage is:\n./filterReads gff reads output";
//#define _FDEBUG_ 0
//#define DEBUG_READ 38603
+
+//
+/*
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ */
+
+#ifdef _FDEBUG_
+ if(read_nr == DEBUG_READ) {
+ printf("read nr %lu",read_nr);
+ printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
+ printf("u/d range: %d %d\n",up_range,down_range);
+ printf("u/d size: %d %d\n",u_size,d_size);
+ printf("u/d off: %d %d\n",u_off,d_off);
+ printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
+
+ printf("******************\n");
+
+ printf("%s\n",up_read->seq);
+ //printf("%s\n",new_up_seq);
+
+ printf("******************\n");
+
+ printf("%s\n",down_read->seq);
+ //printf("%s\n",new_down_seq);
+
+ printf("******************\n");
+ printf("%s\n",new_seq);
+ printf("%s\n",new_prb);
+ printf("%s\n",new_cal_prb);
+ printf("%s\n",new_chastity);
+ }
+#endif // _FDEBUG_
+
int combined_reads = 0;
+char current_strand;
+
+/*
+ *
+ *
+ */
+
void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
int status;
char* current_line = malloc(sizeof(char)*512);
memset(current_line,0,512);
- int SIZE = 500;
+ int SIZE = 5000;
// initialize boundary arrays
Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
int skippedReadCtr = 0;
int uselessReadCtr = 0;
int exonicReadCtr = 0;
- int endPrevCtr = 0;
- int prevOverlapCtr = 0;
- int currentStartCtr = 0;
+
int currentOverlapCtr = 0;
+ int previousOverlapCtr = 0;
+
int multioccurReadCtr = 0;
Read* currentRead;
int up_idx, down_idx;
int readCtr = 0;
+ int wrong_strand_ctr = 0;
+ int read_within_gene_ctr = 0;
+ int read_outside_gene_ctr = 0;
int read_start, read_stop;
// start of the parsing loop
- while(1) {
- if (gene_idx == numGenes || strcmp(current_line,"") == 0)
- break;
-
- gene_id = currentGene->id;
-
- if (readCtr != 0 && readCtr % 1000000 == 0)
- printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+
+ //static char *all_strands = "+-";
+ //static char *all_strands = "DP";
+ //int strand_idx;
+ //for(strand_idx=0;strand_idx<2;strand_idx++) {
+ // char current_strand = all_strands[strand_idx];
+
+ while(1) {
+ if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+ break;
+
+ gene_id = currentGene->id;
+
+ if (readCtr != 0 && readCtr % 1000000 == 0) {
+ printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+ printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+ printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+ printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+ printf("\t%d useless reads\n",uselessReadCtr);
+ printf("\t%d skipped reads\n",skippedReadCtr);
+ printf("\t%d multioccurring\n",multioccurReadCtr);
+ printf("\t%d wrong_strand\n",wrong_strand_ctr);
+ printf("%d within gene\n",read_within_gene_ctr);
+ printf("%d outside gene\n",read_outside_gene_ctr);
+ printf("%d reads were totally exonic\n",exonicReadCtr);
+ printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+ printf("%d reads where newly combined from two original reads\n",combined_reads);
+ printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+ }
- // pos of the reads is one-based
- status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
- if (status < 12) {
- skippedLinesCounter++;
- goto next_read;
- }
+ // pos of the reads is one-based
+ status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+ if (status < 12) {
+ skippedLinesCounter++;
+ goto next_read;
+ }
- // if the read is occurring several times elsewhere then get rid of it
- if ( occurrence != 1 ) {
- multioccurReadCtr++;
- goto next_read;
- }
+ // if the read is occurring several times elsewhere then get rid of it
+ if ( occurrence != 1 ) {
+ multioccurReadCtr++;
+ goto next_read;
+ }
- // define read start and stop positions
- read_start = pos;
- read_stop = pos + read_size-1;
+ // define read start and stop positions
+ read_start = pos;
+ read_stop = pos + read_size-1;
- if( strand != 'D' )
- goto next_read;
+ if( strand != current_strand ) {
+ wrong_strand_ctr++;
+ goto next_read;
+ }
- FA(strlen(seq) >= read_size);
+ FA(strlen(seq) >= read_size);
- FA(currentGene != 0);
+ FA(currentGene != 0);
- if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
+ if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
+ read_within_gene_ctr++;
- exon_label:
+ exon_label:
- if (exon_idx == currentGene->num_exons) {
- gene_idx++;
- exon_idx = 1;
- currentGene = (*allGenes)[gene_idx];
- while( currentGene == 0 && gene_idx < numGenes) {
- currentGene = (*allGenes)[gene_idx];
+ if (exon_idx == currentGene->num_exons) {
gene_idx++;
+ exon_idx = 1;
+ currentGene = (*allGenes)[gene_idx];
+ while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
+ currentGene = (*allGenes)[gene_idx];
+ gene_idx++;
+ }
+ continue;
}
- continue;
- }
- prev_exon_start = currentGene->exon_starts[exon_idx-1];
- prev_exon_stop = currentGene->exon_stops[exon_idx-1];
- cur_exon_start = currentGene->exon_starts[exon_idx];
+ prev_exon_start = currentGene->exon_starts[exon_idx-1];
+ prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+ cur_exon_start = currentGene->exon_starts[exon_idx];
- //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
+ //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
- if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
+ if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
- if (uov != 0 && dov != 0)
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+ if (uov != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
- for(up_idx=0;up_idx<uov;up_idx++) {
- free_read(upstream_overlap[up_idx]);
- free(upstream_overlap[up_idx]);
- }
+ for(up_idx=0;up_idx<uov;up_idx++) {
+ free_read(upstream_overlap[up_idx]);
+ free(upstream_overlap[up_idx]);
+ }
- for(down_idx=0;down_idx<dov;down_idx++) {
- free_read(downstream_overlap[down_idx]);
- free(downstream_overlap[down_idx]);
- }
+ for(down_idx=0;down_idx<dov;down_idx++) {
+ free_read(downstream_overlap[down_idx]);
+ free(downstream_overlap[down_idx]);
+ }
- uov = dov = 0;
+ uov = dov = 0;
- exon_idx++;
- goto exon_label;
- }
+ exon_idx++;
+ goto exon_label;
+ }
- if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
+ if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
- // output of unused i.e. unspliced reads
- fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
- unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
- unfiltered_read_nr++;
-
- exonicReadCtr++;
- goto next_read;
- }
+ // output of unused i.e. unspliced reads
+ fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+ unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
+ unfiltered_read_nr++;
+
+ exonicReadCtr++;
+ goto next_read;
+ }
- if ( read_stop < prev_exon_stop ) // go to next read
- goto next_read;
+ if ( read_stop < prev_exon_stop ) { // go to next read
+ //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
+ //if( exon_idx > 1) {
+ // printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
+ // printf("---\n");
+ //}
- // if this position is reached the read is somehow overlapping or
- // exactly on the exon boundary.
- if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
- //printf("%s\n",current_line);
- prevOverlapCtr++;
- currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
- upstream_overlap[uov] = currentRead;
- uov++;
- goto next_read;
- }
-
- if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
- //printf("%s\n",current_line);
- currentOverlapCtr++;
- currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
- downstream_overlap[dov] = currentRead;
- dov++;
- goto next_read;
- }
+ uselessReadCtr++;
+ goto next_read;
+ }
- uselessReadCtr++;
- goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+ // if this position is reached the read is somehow overlapping or
+ // exactly on the exon boundary.
+ if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
+ //printf("%s\n",current_line);
+ previousOverlapCtr++;
+ currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+ assert (uov < SIZE);
+ upstream_overlap[uov] = currentRead;
+ uov++;
+ goto next_read;
+ }
+
+ if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
+ //printf("%s\n",current_line);
+ currentOverlapCtr++;
+ currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
+ assert (dov < SIZE);
+ downstream_overlap[dov] = currentRead;
+ dov++;
+ goto next_read;
+ }
- } else { // read is not within gene borders
+ uselessReadCtr++;
+ goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+ } else { // read is not within gene borders
+ read_outside_gene_ctr++;
- if (uov != 0 && dov != 0)
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+ if (uov != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
- for(up_idx=0;up_idx<uov;up_idx++) {
- free_read(upstream_overlap[up_idx]);
- free(upstream_overlap[up_idx]);
- }
+ for(up_idx=0;up_idx<uov;up_idx++) {
+ free_read(upstream_overlap[up_idx]);
+ free(upstream_overlap[up_idx]);
+ }
- for(down_idx=0;down_idx<dov;down_idx++) {
- free_read(downstream_overlap[down_idx]);
- free(downstream_overlap[down_idx]);
- }
+ for(down_idx=0;down_idx<dov;down_idx++) {
+ free_read(downstream_overlap[down_idx]);
+ free(downstream_overlap[down_idx]);
+ }
- uov = dov = 0;
+ uov = dov = 0;
- if ( currentGene->stop < read_stop ) { // go to next gene
- gene_idx++;
- exon_idx = 1;
- currentGene = (*allGenes)[gene_idx];
- while( currentGene == 0 && gene_idx < numGenes) {
- currentGene = (*allGenes)[gene_idx];
+ if ( currentGene->stop < read_stop ) { // go to next gene
gene_idx++;
+ exon_idx = 1;
+ currentGene = (*allGenes)[gene_idx];
+ while( currentGene == 0 && gene_idx < numGenes) {
+ currentGene = (*allGenes)[gene_idx];
+ gene_idx++;
+ }
+ //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+ continue;
}
- //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
- continue;
- }
- if ( read_start < currentGene->start ) { // go to next read
- skippedReadCtr++;
-
- next_read:
-
- lineBeginPtr = lineEndPtr;
- while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
- lineEndPtr++;
- readCtr += 1;
- current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
- current_line[lineEndPtr-lineBeginPtr] = '\0';
- continue;
+ if ( read_start < currentGene->start ) { // go to next read
+ skippedReadCtr++;
+
+ next_read:
+
+ lineBeginPtr = lineEndPtr;
+ while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+ lineEndPtr++;
+ readCtr += 1;
+ current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+ current_line[lineEndPtr-lineBeginPtr] = '\0';
+ continue;
+ }
}
}
- }
+
+ //} // end of for all strands
if (uov != 0 && dov != 0)
combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+ printf("\t%d useless reads\n",uselessReadCtr);
+ printf("\t%d skipped reads\n",skippedReadCtr);
+ printf("\t%d multioccurring\n",multioccurReadCtr);
+ printf("\t%d wrong_strand\n",wrong_strand_ctr);
printf("%d reads were totally exonic\n",exonicReadCtr);
- printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
+ printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
printf("%d reads where newly combined from two original reads\n",combined_reads);
- printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
+ printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
status = munmap(reads_area,reads_filesize);
if(status != 0)
}
/*
- *
- *
+ * Now we join the candidate reads wherever possible according to the following
+ * scheme:
*
* ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
* p1 e1 e2 p2
*
- *
+ *
*
*
*/
cRead->chastity);
}
+
+void open_file(const char* filename, const char* mode, FILE** fs) {
+ *fs = fopen(filename,mode);
+ if(*fs == NULL) {
+ printf("Error: Could not open file: %s",filename);
+ exit(EXIT_FAILURE);
+ }
+}
+
/*
- * The mainloop
+ * The program expects 7 arguments, namely:
+ *
+ * - The strand to be used
+ * - The filename of the gff file
+ * - The filename of the reads file
+ * - The name of the spliced reads ouput file
+ * - The name of the unspliced reads ouput file
+ * - The offset for counting new spliced reads
+ * - The offset for counting new unspliced reads
*
*/
int main(int argc, char* argv[]) {
- if(argc != 7) {
+ if(argc != 8) {
printf("%s\n",info);
exit(EXIT_FAILURE);
}
int status;
int filenameSize = 256;
char* gff_filename = malloc(sizeof(char)*filenameSize);
- char* reads_filename = malloc(sizeof(char)*filenameSize);
- char* output_filename = malloc(sizeof(char)*filenameSize);
- char* unfiltered_output_filename = malloc(sizeof(char)*filenameSize);
- strncpy(gff_filename,argv[1],filenameSize);
- strncpy(reads_filename,argv[2],filenameSize);
- strncpy(output_filename,argv[3],filenameSize);
- strncpy(unfiltered_output_filename,argv[4],filenameSize);
+ current_strand = argv[1][0];
+ printf("current strand is %c\n",current_strand);
+ strncpy(gff_filename,argv[2],filenameSize);
+
+ FILE *gff_fs;
+ FILE *reads_fs;
+ FILE *out_fs;
+ FILE *unfiltered_out_fs;
- FILE *gff_fs = fopen(gff_filename,"r");
- FILE *reads_fs = fopen(reads_filename,"r");
- FILE *out_fs = fopen(output_filename,"w");
- FILE *unfiltered_out_fs = fopen(unfiltered_output_filename,"w");
+ // open file streams for all needed input/output files
+ open_file(argv[2],"r",&gff_fs);
+ open_file(argv[3],"r",&reads_fs);
+ open_file(argv[4],"w",&out_fs);
+ open_file(argv[5],"w",&unfiltered_out_fs);
- read_nr = strtoul(argv[5],NULL,10);
+ read_nr = strtoul(argv[6],NULL,10);
read_nr++;
- unfiltered_read_nr = strtoul(argv[6],NULL,10);
+ unfiltered_read_nr = strtoul(argv[7],NULL,10);
unfiltered_read_nr++;
- if(gff_fs == NULL) {
- printf("Error: Could not open file: %s",gff_filename);
- exit(EXIT_FAILURE);
- }
-
- if(reads_fs == NULL) {
- printf("Error: Could not open file: %s",reads_filename);
- exit(EXIT_FAILURE);
- }
-
- if(out_fs == NULL) {
- printf("Error: Could not open file: %s",output_filename);
- exit(EXIT_FAILURE);
- }
-
- if(unfiltered_out_fs == NULL) {
- printf("Error: Could not open file: %s",unfiltered_output_filename);
- exit(EXIT_FAILURE);
- }
-
+ // allocate and load all genes and then close the gff file stream
struct gene** allGenes;
int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
status = fclose(gff_fs);
}}
printf("Exon coverage is %f\n",(double)exon_cov/30432563);
+ // now that we loaded all neccessary data we start to process the reads
process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
+ // free all allocated ressources
for(g_idx=0;g_idx<numGenes;g_idx++) {
- if(allGenes[g_idx] != 0)
+ if(allGenes[g_idx] != 0) {
free_gene(allGenes[g_idx]);
free(allGenes[g_idx]);
-
+ }
}
free(allGenes);
if(status != 0)
perror("fclose");
- free(reads_filename);
- free(output_filename);
-
return 0;
}
-
-/*
- * TODO:
- * - Check strand -> done simple (only if equal)
- * - check for [AC] and similar entries -> done simple (see function
- * - remove_ambiguities (exchanges [XY] by the second entry)
- */
-
-#ifdef _FDEBUG_
- if(read_nr == DEBUG_READ) {
- printf("read nr %lu",read_nr);
- printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
- printf("u/d range: %d %d\n",up_range,down_range);
- printf("u/d size: %d %d\n",u_size,d_size);
- printf("u/d off: %d %d\n",u_off,d_off);
- printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
-
- printf("******************\n");
-
- printf("%s\n",up_read->seq);
- //printf("%s\n",new_up_seq);
-
- printf("******************\n");
-
- printf("%s\n",down_read->seq);
- //printf("%s\n",new_down_seq);
-
- printf("******************\n");
- printf("%s\n",new_seq);
- printf("%s\n",new_prb);
- printf("%s\n",new_cal_prb);
- printf("%s\n",new_chastity);
- }
-#endif // _FDEBUG_