#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <math.h>
+#include "common.h"
#include "datastructures.h"
int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
+
static char *info = "Usage is:\n./filterReads gff reads";
+const int read_size = 36;
+
int main(int argc, char* argv[]) {
if(argc != 3) {
if(status != 0)
printf("closing of gff filestream failed!\n");
+ printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
+
process_reads(reads_fs,&allGenes,numGenes);
- status += fclose(reads_fs);
+ status = fclose(reads_fs);
if(status != 0)
printf("closing of filestreams failed!\n");
return 0;
}
-
void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
int status;
int id = 0;
char strand = ' ';
int mismatch = 0;
- int repetition = 0;
+ int occurrence = 0;
int size = 0;
int cut = 0;
char* prb = malloc(sizeof(char)*buffer_size);
int reads_fid = fileno(reads_fs);
struct stat* reads_stat = malloc(sizeof(struct stat));
status = fstat(reads_fid,reads_stat);
- if(status != 0)
- printf("could not stat reads file");
+ if(status != 0) {
+ fprintf(stderr,"could not stat reads file");
+ exit(EXIT_FAILURE);
+ }
off_t reads_filesize = reads_stat->st_size;
free(reads_stat);
const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
- printf("page_size is %d\n",page_size);
- printf("reads file is of size %d bytes\n",reads_filesize);
size_t mmapAreaSize = (reads_filesize / page_size)+1 ;
+ printf("Page_size is %d. Reads file is of size %d bytes\n",(int)page_size,reads_filesize);
- void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
- if((long int) reads_area == -1)
- printf("mmapping failed!\n");
+ void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+ if((long int) reads_area == -1) {
+ fprintf(stderr,"mmapping failed!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ printf("Successfully mapped %d bytes of reads file into memory\n",reads_filesize);
void* linePtr = reads_area;
char* current_line = malloc(sizeof(char)*256);
int gene_idx = 0;
int exon_idx = 1;
struct gene* currentGene = (*allGenes)[gene_idx];
+
+ int readCtr = 0;
+ // start of the parsing loop
while(1) {
if (gene_idx == numGenes || strcmp(current_line,"") == 0)
break;
- label:
+ if (readCtr % 10000 == 0)
+ printf("Processed %d reads.\n",readCtr);
+
+ //if (readCtr == 10000000)
+ // exit(EXIT_FAILURE);
+
status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
- &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+ &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
if (status < 12) {
skippedLinesCounter++;
}
- printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+ //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
- if (!(currentGene->start <= pos && pos+35 <= currentGene->stop)) {
+ // if the read is occurring several times elsewhere then get rid of it
+ if(occurrence != 1) {
+ while (*(char*)linePtr != '\n') linePtr++;
+ linePtr++;
+ readCtr += 1;
+ current_line = strncpy(current_line,linePtr,256);
+ continue;
+ }
+
+ if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
- // go to next gene
- if ( currentGene->stop < pos+35 ) {
+ if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
gene_idx++;
exon_idx = 1;
currentGene = (*allGenes)[gene_idx];
continue;
}
- // go to next line
- if ( pos < currentGene->start ) {
+ if ( pos < currentGene->start ) { // go to next read
+ next_read:
+
while (*(char*)linePtr != '\n') linePtr++;
linePtr++;
-
+ readCtr += 1;
current_line = strncpy(current_line,linePtr,256);
continue;
}
- } else {
+ } else { // read IS within gene borders
exon_label:
prev_exon_stop = currentGene->exon_stops[exon_idx-1];
cur_exon_start = currentGene->exon_starts[exon_idx];
- printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
- printf("exon_idx %d\n",exon_idx);
-
- if (exon_idx == currentGene->num_exons)
- break;
+ //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
+ //printf("exon_idx %d\n",exon_idx);
- if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start+36 < pos ) {
+ if (exon_idx == currentGene->num_exons) {
+ gene_idx++;
+ exon_idx = 1;
+ currentGene = (*allGenes)[gene_idx];
+ continue;
+ }
+
+ if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
exon_idx++;
combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
goto exon_label;
}
- if ( pos < prev_exon_stop - 36 ) {
- while (*(char*)linePtr != '\n') linePtr++;
- linePtr++;
-
- current_line = strncpy(current_line,linePtr,256);
- continue;
+ if ( pos < prev_exon_stop - read_size + 1 ) { // go to next read
+ goto next_read;
}
- // upstream ending
- if (pos+35 == prev_exon_stop) {
+ // if this position is reached the read is somehow overlapping or
+ // exactly on the exon boundary. now determine where exactly:
+
+ if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
upstream_end[ue] = linePtr;
ue++;
- } else {
+ goto next_read;
+ }
- // upstream overlapping
- if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
- upstream_overlap[uo] = linePtr;
- uo++;
- }
+ if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps previous exon boundary
+ upstream_overlap[uo] = linePtr;
+ uo++;
+ goto next_read;
}
- // downstream starting
- if (pos == cur_exon_start) {
+ if (pos == cur_exon_start) { // reads start at current exon start
downstream_start[ds] = linePtr;
ds++;
- } else {
+ goto next_read;
+ }
- // downstream overlapping
- if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
- downstream_overlap[dov] = linePtr;
- dov++;
- }
+ if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps previous exon boundary
+ downstream_overlap[dov] = linePtr;
+ dov++;
+ goto next_read;
}
- while (*(char*)linePtr != '\n') linePtr++;
- linePtr++;
- current_line = strncpy(current_line,linePtr,256);
- continue;
+ goto next_read;
}
-
- gene_idx++;
}
combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
}
void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
- printf("up_/down_size %d %d\n",up_size,down_size);
+ //printf("up_/down_size %d %d\n",up_size,down_size);
if (up_size == 0 || down_size == 0 || exon_stop == -1)
return;
int up_id = 0;
char up_strand = ' ';
int up_mismatch = 0;
- int up_repetition = 0;
+ int up_occurrence = 0;
int up_sz = 0;
int up_cut = 0;
char* up_prb = malloc(sizeof(char)*buffer_size);
int down_id = 0;
char down_strand = ' ';
int down_mismatch = 0;
- int down_repetition = 0;
+ int down_occurrence = 0;
int down_sz = 0;
int down_cut = 0;
char* down_prb = malloc(sizeof(char)*buffer_size);
strncpy(upstream_line,upstream[up_idx],256);
status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
- &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
+ &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
&up_cut,up_prb,up_cal_prb,up_chastity);
strncpy(downstream_line,downstream[down_idx],256);
status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
- &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
+ &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
&down_cut,down_prb,down_cal_prb,down_chastity);
new_prb[0] = '\0';
new_cal_prb[0] = '\0';
new_chastity[0] = '\0';
+ int w_size = 5;
// is downstream overlapping
if (up_pos+35 == exon_stop) {
int overlap = exon_start - down_pos;
//printf("overlap is %d\n",overlap);
//printf("pos are: %d %d\n",up_pos,down_pos);
+
+
+ fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
strncat(new_prb,up_prb+(36-overlap),overlap);
strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
int overlap = up_pos+36 - exon_stop;
//printf("overlap is %d\n",overlap);
//printf("pos are: %d %d\n",up_pos,down_pos);
+ fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
strncat(new_prb,up_prb,(36-overlap));
strncat(new_cal_prb,up_cal_prb,(36-overlap));
strncat(new_chastity,down_chastity,overlap);
}
- printf("up_prb: %s\n",up_prb);
- printf("down_prb: %s\n",down_prb);
- printf("new_prb: %s\n",new_prb);
+ //printf("up_prb: %s\n",up_prb);
+ //printf("down_prb: %s\n",down_prb);
+ //printf("new_prb: %s\n",new_prb);
up_idx++;
down_idx++;
}
}
+int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
+ double epsilon_mean = 0.1;
+ double epsilon_var = 0.1;
+ double mean_up = 0;
+ double variance_up = 0;
+ double mean_down = 0;
+ double variance_down = 0;
+
+ char *up_ptr = up_prb;
+ char *down_ptr = down_prb;
+
+ int w_size = 0;
+ while(up_ptr != up_prb_end) {
+ mean_up += (*up_ptr)-50;
+ mean_down += (*down_ptr)-50;
+ w_size++;
+ up_ptr++;
+ down_ptr++;
+ }
+ mean_up /= w_size;
+ mean_down /= w_size;
+
+ //printf("means: %f %f\n",mean_up,mean_down);
+
+ up_ptr = up_prb;
+ down_ptr = down_prb;
+ w_size = 0;
+ while(up_ptr != up_prb_end) {
+ variance_up += pow((*up_prb)-50 - mean_up,2);
+ variance_down += pow((*down_prb)-50 - mean_down,2);
+ w_size++;
+ up_ptr++;
+ down_ptr++;
+ }
+ variance_up /= (w_size-1);
+ variance_down /= (w_size-1);
+
+ //printf("variances: %f %f\n",variance_up,variance_down);
+
+ if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
+ return 1;
+
+ return 0;
+}
+
/*
* TODO
* - Check strand
- * - check repetition
+ * - check occurrence
*
*/