//
//
//
-//
////////////////////////////////////////////////////////////
#include <sys/mman.h>
+#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "common.h"
#include "datastructures.h"
-void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
+
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
+
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
-static char *info = "Usage is:\n./filterReads gff reads";
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
+
+static char *info = "Usage is:\n./filterReads gff reads output";
+
+const int read_size = 36;
int main(int argc, char* argv[]) {
- if(argc != 3) {
+ if(argc != 4) {
printf("%s\n",info);
exit(EXIT_FAILURE);
}
- //static size_t page_size;
- // page_size = (size_t) sysconf (_SC_PAGESIZE);
+ int status;
int filenameSize = 256;
char *gff_filename = malloc(sizeof(char)*filenameSize);
char *reads_filename = malloc(sizeof(char)*filenameSize);
+ char *output_filename = malloc(sizeof(char)*filenameSize);
strncpy(gff_filename,argv[1],filenameSize);
strncpy(reads_filename,argv[2],filenameSize);
+ strncpy(output_filename,argv[3],filenameSize);
FILE *gff_fs = fopen(gff_filename,"r");
FILE *reads_fs = fopen(reads_filename,"r");
+ FILE *out_fs = fopen(output_filename,"w");
if(gff_fs == NULL) {
printf("Error: Could not open file: %s",gff_filename);
exit(EXIT_FAILURE);
}
- struct gene **allGenes;
- parse_gff(gff_filename,gff_fs,allGenes);
+ if(out_fs == NULL) {
+ printf("Error: Could not open file: %s",output_filename);
+ exit(EXIT_FAILURE);
+ }
+
+ struct gene** allGenes;
+ int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+ status = fclose(gff_fs);
+ if(status != 0)
+ printf("closing of gff filestream failed!\n");
+ printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
+
+ process_reads(reads_fs,&allGenes,numGenes,out_fs);
+
+ status = fclose(reads_fs);
+ status = fclose(out_fs);
+ if(status != 0)
+ perror("fclose");
+
+ //free(allGenes);
+ free(gff_filename);
+ free(reads_filename);
+ return 0;
+}
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
+ int status;
+
+ int buffer_size= 64;
+ int chr = 0;
+ int pos = 0;
+ char* seq = malloc(sizeof(char)*buffer_size);
+ unsigned long id = 0;
+ char strand = ' ';
+ int mismatch = 0;
+ int occurrence = 0;
+ int size = 0;
+ int cut = 0;
+ char* prb = malloc(sizeof(char)*buffer_size);
+ char* cal_prb = malloc(sizeof(char)*buffer_size);
+ char* chastity = malloc(sizeof(char)*buffer_size);
int reads_fid = fileno(reads_fs);
- size_t mmapAreaSize = 16;
- void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
- if((long int) reads_area == -1)
- printf("mmapping failed!\n");
-
- char *test_string = malloc(sizeof(char)*10);
- strncpy(test_string,(char*)reads_area,10);
- //printf("Reads area at %s\n",test_string);
-
- int status = munmap(reads_area,mmapAreaSize);
+ struct stat reads_stat;
+ if ( fstat(reads_fid,&reads_stat) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+ off_t reads_filesize = reads_stat.st_size;
+ printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+ int numReads = reads_filesize / 178.0;
+
+ void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+ if (reads_area == MAP_FAILED) {
+ perror("mmap");
+ exit(EXIT_FAILURE);
+ }
+ close(reads_fid);
+ printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+
+ void* linePtr = reads_area;
+ char* current_line = malloc(sizeof(char)*256);
+
+ int SIZE = 500;
+ // initialize boundary arrays
+ void** upstream_end = malloc(sizeof(void*)*SIZE);
+ void** upstream_overlap = malloc(sizeof(void*)*SIZE);
+ void** downstream_start = malloc(sizeof(void*)*SIZE);
+ void** downstream_overlap= malloc(sizeof(void*)*SIZE);
+ int ue,uo,ds,dov;
+ ue = uo = ds = dov = 0;
+
+ int skippedLinesCounter = 0;
+
+ int prev_exon_stop = -1;
+ int cur_exon_start = -1;
+
+ current_line = strncpy(current_line,linePtr,256);
+ int gene_idx = 0;
+ int exon_idx = 1;
+ struct gene* currentGene = (*allGenes)[gene_idx];
+
+ int readCtr = 0;
+ int old_gene_stop = -1;
+ int old_pos = 0;
+ char* posPtr;
+ int new_chr = 0;
+ int new_pos = 0;
+ char* tmp_line = malloc(sizeof(char)*256);
+ // start of the parsing loop
+ while(1) {
+ if (gene_idx == numGenes || strcmp(current_line,"") == 0)
+ break;
+
+ if (readCtr != 0 && readCtr % 1000000 == 0)
+ printf("Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
+
+ //if (gene_idx >= 1833)
+ // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
+
+ old_pos = pos;
+ status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
+ if (status < 12) {
+ skippedLinesCounter++;
+ }
+ //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+ // if the read is occurring several times elsewhere then get rid of it
+ if(occurrence != 1) {
+ while (*(char*)linePtr != '\n') linePtr++;
+ linePtr++;
+ readCtr += 1;
+ current_line = strncpy(current_line,linePtr,256);
+ continue;
+ }
+
+ if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
+
+ if ( currentGene->stop < (pos + read_size-1) || currentGene->start < old_gene_stop ) { // go to next gene
+ gene_idx++;
+ exon_idx = 1;
+ old_gene_stop = currentGene->stop ;
+ currentGene = (*allGenes)[gene_idx];
+ //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+ ue = uo = ds = dov = 0;
+ continue;
+ }
+
+ if ( pos < currentGene->start || pos < old_pos) { // go to next read
+ next_read:
+
+ //posPtr = linePtr;
+ //while (1) {
+ // printf("posPtr points to %c\n",*(char*)posPtr);
+ // if ((*(char*)posPtr) == '\n') {
+ // posPtr++;
+ // tmp_line = strncpy(tmp_line,posPtr,256);
+
+ // if (strcmp(tmp_line,"") == 0)
+ // break;
+
+ // sscanf(tmp_line,"%d\t%d\t",new_chr,new_pos);
+ // printf("new_pos %d\n",new_pos);
+ // if (new_pos >= currentGene->start) {
+ // linePtr = posPtr;
+ // break;
+ // }
+ // }
+ // posPtr++;
+ //}
+ //printf("Went out!\n");
+
+ while (*(char*)linePtr != '\n') linePtr++;
+ linePtr++;
+ readCtr += 1;
+ current_line = strncpy(current_line,linePtr,256);
+
+ continue;
+ }
+
+ } else { // read IS within gene borders
+
+ exon_label:
+
+ if (exon_idx == currentGene->num_exons) {
+ gene_idx++;
+ exon_idx = 1;
+ currentGene = (*allGenes)[gene_idx];
+ continue;
+ }
+
+ prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+ cur_exon_start = currentGene->exon_starts[exon_idx];
+
+ if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
+ exon_idx++;
+
+ if (ue != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+
+ if (uo != 0 && ds != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+ ue = uo = ds = dov = 0;
+ goto exon_label;
+ }
+
+ if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
+ goto next_read;
+
+ // if this position is reached the read is somehow overlapping or
+ // exactly on the exon boundary. now determine where exactly:
+ if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
+ upstream_end[ue] = linePtr;
+ ue++;
+ goto next_read;
+ }
+
+ if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
+ upstream_overlap[uo] = linePtr;
+ uo++;
+ goto next_read;
+ }
+
+ if ( pos == cur_exon_start ) { // read starts at current exon start
+ downstream_start[ds] = linePtr;
+ ds++;
+ goto next_read;
+ }
+
+ if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
+ downstream_overlap[dov] = linePtr;
+ dov++;
+ goto next_read;
+ }
+
+ goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
+ }
+ }
+
+ combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+ free(upstream_end);
+ free(upstream_overlap);
+ free(downstream_start);
+ free(downstream_overlap);
+
+ printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+
+ status = munmap(reads_area,reads_filesize);
if(status != 0)
printf("munmap failed!\n");
- status = fclose(gff_fs);
- status += fclose(reads_fs);
- if(status != 0)
- printf("closing of filestreams failed!\n");
+ free(seq);
+ free(prb);
+ free(cal_prb);
+ free(chastity);
+}
+
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
+ int up_idx, down_idx, status;
+ char* upstream_line = malloc(sizeof(char)*256);
+ char* downstream_line = malloc(sizeof(char)*256);
+
+ int buffer_size= 64;
+
+ int up_chr = 0;
+ int up_pos = 0;
+ char* up_seq = malloc(sizeof(char)*buffer_size);
+ int up_id = 0;
+ char up_strand = ' ';
+ int up_mismatch = 0;
+ int up_occurrence = 0;
+ int up_sz = 0;
+ int up_cut = 0;
+ char* up_prb = malloc(sizeof(char)*buffer_size);
+ char* up_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* up_chastity = malloc(sizeof(char)*buffer_size);
+
+ int down_chr = 0;
+ int down_pos = 0;
+ char* down_seq = malloc(sizeof(char)*buffer_size);
+ int down_id = 0;
+ char down_strand = ' ';
+ int down_mismatch = 0;
+ int down_occurrence = 0;
+ int down_sz = 0;
+ int down_cut = 0;
+ char* down_prb = malloc(sizeof(char)*buffer_size);
+ char* down_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* down_chastity = malloc(sizeof(char)*buffer_size);
+
+ int new_chr = 0;
+ char* new_seq = malloc(sizeof(char)*buffer_size);
+ char new_strand = ' ';
+ char* new_prb = malloc(sizeof(char)*buffer_size);
+ char* new_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* new_chastity = malloc(sizeof(char)*buffer_size);
+
+ up_idx=0;
+ down_idx=0;
+ while(1) {
+ if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
+ break;
+
+ strncpy(upstream_line,upstream[up_idx],256);
+ status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
+ &up_cut,up_prb,up_cal_prb,up_chastity);
- printf("sizeof(struct gene) %d\n",sizeof(struct gene));
- printf("sizeof(struct read) %d\n",sizeof(struct read));
+ strncpy(downstream_line,downstream[down_idx],256);
+ status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
+ &down_cut,down_prb,down_cal_prb,down_chastity);
+ char* new_up_seq = malloc(sizeof(char)*read_size);
+ char* new_down_seq = malloc(sizeof(char)*read_size);
- free(gff_filename);
- free(reads_filename);
+ remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
+ remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
+
+ new_seq[0] = '\0';
+ new_prb[0] = '\0';
+ new_cal_prb[0] = '\0';
+ new_chastity[0] = '\0';
+
+ int fit;
+ int w_size = 6;
+ int overlap = 0;
+ if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
+ overlap = exon_start - down_pos;
+
+ fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
+ if (fit != 1)
+ goto end;
+
+ new_chr = up_chr;
+ new_strand = up_strand;
+
+ strncat(new_seq,new_up_seq+(36-overlap),overlap);
+ strncat(new_prb,up_prb+(36-overlap),overlap);
+ strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
+ strncat(new_chastity,up_chastity+(36-overlap),overlap);
+
+ strncat(new_seq,new_down_seq+overlap,36-overlap);
+ strncat(new_prb,down_prb+overlap,36-overlap);
+ strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
+ strncat(new_chastity,down_chastity+overlap,36-overlap);
+
+ //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+
+ } // merge with read which is upstream overlapping
+
+ if (down_pos == exon_start) {
+ overlap = up_pos+36 - exon_stop;
+ //printf("overlap is %d\n",overlap);
+ //printf("pos are: %d %d\n",up_pos,down_pos);
+ fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
+ if (fit != 1)
+ goto end;
+
+ new_chr = up_chr;
+ new_strand = up_strand;
+
+ strncat(new_seq,new_up_seq,(36-overlap));
+ strncat(new_prb,up_prb,(36-overlap));
+ strncat(new_cal_prb,up_cal_prb,(36-overlap));
+ strncat(new_chastity,up_chastity,(36-overlap));
+
+ strncat(new_seq,new_down_seq,overlap);
+ strncat(new_prb,down_prb,overlap);
+ strncat(new_cal_prb,down_cal_prb,overlap);
+ strncat(new_chastity,down_chastity,overlap);
+
+ //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+ }
+
+ if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
+ printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+
+ fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
+ new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
+
+ end:
+
+ up_idx++;
+ down_idx++;
+
+ free(new_up_seq);
+ free(new_down_seq);
+ }
+}
+
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+ double epsilon_mean = 15.0;
+ double epsilon_var = 10.0;
+ double mean_up = 0;
+ double variance_up = 0;
+ double mean_down = 0;
+ double variance_down = 0;
+
+ char *up_ptr = up_prb;
+ char *down_ptr = down_prb;
+
+ int w_size = 0;
+ while(up_ptr != up_prb_end) {
+ mean_up += (*up_ptr)-50;
+ mean_down += (*down_ptr)-50;
+ w_size++;
+ up_ptr++;
+ down_ptr++;
+ }
+ mean_up /= w_size;
+ mean_down /= w_size;
+
+
+ up_ptr = up_prb;
+ down_ptr = down_prb;
+ w_size = 0;
+ while(up_ptr != up_prb_end) {
+ variance_up += pow((*up_prb)-50 - mean_up,2);
+ variance_down += pow((*down_prb)-50 - mean_down,2);
+ w_size++;
+ up_ptr++;
+ down_ptr++;
+ }
+ variance_up /= (w_size-1);
+ variance_down /= (w_size-1);
+
+ //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
+
+ if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+ return 1;
+
return 0;
}
+
+void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
+ //printf("old seq: %s\n",old_seq);
+ //printf("new seq: %s\n",new_seq);
+
+ int idx=0;
+ int new_idx = 0;
+ while(idx<old_seq_size) {
+ if (old_seq[idx] == '[') {
+ new_seq[new_idx] = old_seq[++idx];
+ new_idx++;
+ idx += 3;
+ continue;
+ }
+
+ new_seq[new_idx] = old_seq[idx++];
+ new_idx++;
+ }
+ //printf("old seq: %s\n",old_seq);
+ //printf("new seq: %s\n",new_seq);
+}
+
+/*
+ * TODO:
+ * - Check strand -> done simple (only if equal)
+ * - check for [AC] and similar entries -> done simple (see function
+ * remove_ambiguities (exchanges [XY] by the first entry)
+ */