//
//
//
-//
////////////////////////////////////////////////////////////
#include <sys/mman.h>
#include "datastructures.h"
-void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
static char *info = "Usage is:\n./filterReads gff reads";
//static size_t page_size;
// page_size = (size_t) sysconf (_SC_PAGESIZE);
+ int status;
int filenameSize = 256;
char *gff_filename = malloc(sizeof(char)*filenameSize);
char *reads_filename = malloc(sizeof(char)*filenameSize);
exit(EXIT_FAILURE);
}
- struct gene **allGenes;
- parse_gff(gff_filename,gff_fs,allGenes);
+ struct gene** allGenes;
+ int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+ status = fclose(gff_fs);
+ if(status != 0)
+ printf("closing of gff filestream failed!\n");
+ process_reads(reads_fs,&allGenes,numGenes);
+
+ status += fclose(reads_fs);
+ if(status != 0)
+ printf("closing of filestreams failed!\n");
+
+ //free(allGenes);
+ free(gff_filename);
+ free(reads_filename);
+ return 0;
+}
+
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
+ int status;
+
+ int buffer_size= 64;
+ int chr = 0;
+ int pos = 0;
+ char* seq = malloc(sizeof(char)*buffer_size);
+ int id = 0;
+ char strand = ' ';
+ int mismatch = 0;
+ int repetition = 0;
+ int size = 0;
+ int cut = 0;
+ char* prb = malloc(sizeof(char)*buffer_size);
+ char* cal_prb = malloc(sizeof(char)*buffer_size);
+ char* chastity = malloc(sizeof(char)*buffer_size);
int reads_fid = fileno(reads_fs);
- size_t mmapAreaSize = 16;
+ size_t mmapAreaSize = 20000;
+
void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
if((long int) reads_area == -1)
printf("mmapping failed!\n");
+ void* linePtr = reads_area;
+
char *test_string = malloc(sizeof(char)*10);
strncpy(test_string,(char*)reads_area,10);
- //printf("Reads area at %s\n",test_string);
- int status = munmap(reads_area,mmapAreaSize);
+ status = munmap(reads_area,mmapAreaSize);
if(status != 0)
printf("munmap failed!\n");
- status = fclose(gff_fs);
- status += fclose(reads_fs);
- if(status != 0)
- printf("closing of filestreams failed!\n");
+ void** upstream_end;
+ void** upstream_overlap;
+
+ void** downstream_start;
+ void** downstream_overlap;
+
+ int ue,uo,ds,dov;
- printf("sizeof(struct gene) %d\n",sizeof(struct gene));
- printf("sizeof(struct read) %d\n",sizeof(struct read));
+ int SIZE = 50;
+
+ int skippedLinesCounter = 0;
+
+ printf("Entering loop...\n");
+
+ int gene_idx;
+ for (gene_idx = 0; gene_idx < numGenes;) {
+ struct gene* currentGene = (*allGenes)[gene_idx];
+
+ int exon_idx;
+ for(exon_idx=1;exon_idx<currentGene->num_exons-1;exon_idx++) {
+ int prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+ int cur_exon_start = currentGene->exon_starts[exon_idx];
+
+ if (cur_exon_start - prev_exon_stop < 6)
+ continue;
+
+ // initialize boundary arrays
+ upstream_end = malloc(sizeof(void*)*SIZE);
+ upstream_overlap = malloc(sizeof(void*)*SIZE);
+ downstream_start = malloc(sizeof(void*)*SIZE);
+ downstream_overlap= malloc(sizeof(void*)*SIZE);
+ ue = 0;
+ uo = 0;
+ ds = 0;
+ dov = 0;
+
+ printf("Fetching reads line...\n");
+
+ label:
+
+ status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+ if (status < 12) {
+ skippedLinesCounter++;
+ continue;
+ }
+
+ printf("read line!");
+
+ if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
+ printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos);
+
+ // upstream ending
+ if (pos+35 == prev_exon_stop) {
+ upstream_end[ue] = linePtr;
+ ue++;
+ }
+
+ // upstream overlapping
+ if (prev_exon_stop - pos <= 30) {
+ upstream_overlap[uo] = linePtr;
+ uo++;
+ }
+
+ // downstream starting
+ if (pos == cur_exon_start) {
+ downstream_start[ds] = linePtr;
+ ds++;
+ }
+
+ // downstream overlapping
+ if (cur_exon_start - pos >= 6) {
+ downstream_overlap[dov] = linePtr;
+ dov++;
+ }
+
+ while (*(char*)linePtr != '\n') {
+ printf("linePtr is %d",linePtr);
+ linePtr++;
+ }
+
+ linePtr++;
+ goto label;
+
+ free(upstream_end);
+ free(upstream_overlap);
+ free(downstream_start);
+ free(downstream_overlap);
+ }
+
+ while (*(char*)linePtr != '\n') {
+ printf("linePtr is %d",linePtr);
+ linePtr++;
+ }
+
+ linePtr++;
+ goto label;
+
+ }
+
+ gene_idx++;
+ }
+ printf("skipped %d lines.\n",skippedLinesCounter);
- free(gff_filename);
- free(reads_filename);
- return 0;
+ free(seq);
+ free(prb);
+ free(cal_prb);
+ free(chastity);
}
+
+// int skippedLinesCounter = 0;
+// while(1) {
+// status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+// if(status == EOF)
+// break;
+//
+// if (status < 12) {
+// skippedLinesCounter++;
+// continue;
+// }
+//
+// label:
+//
+// if (pos < currentGene->start)
+// continue;
+//
+// if (currentGene->stop < pos) {
+// gene_idx++;
+// currentGene = (*allGenes)[gene_idx];
+// goto label;
+// }
+//
#include "datastructures.h"
-void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
+int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
int buffer_size = 256;
char* chr = malloc(sizeof(char)*buffer_size);
}
freopen(filename,"r",fid);
- printf("Found %d genes!\n",numGenes);
+ //printf("Found %d genes!\n",numGenes);
- allGenes = malloc(sizeof(struct gene)*numGenes);
- struct gene *currentGene = gene_alloc();
+ int idx = 0;
+ (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
+ (*allGenes)[idx] = NULL;
int skippedLinesCounter = 0;
- int idx = 0;
while(1) {
status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
if(status == EOF)
}
if (strcmp(id,"gene")==0) {
- if ( currentGene->start != -1)
- allGenes[idx] = currentGene;
+ if ( (*allGenes)[idx] !=NULL )
idx++;
- currentGene = gene_alloc();
- currentGene->start = start;
- currentGene->stop = stop;
- currentGene->strand = (*strand);
+ (*allGenes)[idx] = gene_alloc();
+ (*allGenes)[idx]->start = start;
+ (*allGenes)[idx]->stop = stop;
+ (*allGenes)[idx]->strand = (*strand);
//printf("gene start/stop: %d/%d\n",start,stop);
continue;
}
if (strcmp(id,"exon")==0) {
- add_exon(currentGene,start,stop);
+ add_exon((*allGenes)[idx],start,stop);
//printf("exon start/stop: %d/%d\n",start,stop);
continue;
}
if (strcmp(id,"pseudogene")==0) {
- if ( currentGene->start != -1)
- allGenes[idx] = currentGene;
+ if ( (*allGenes)[idx] !=NULL )
idx++;
}
}
- if ( currentGene->start != -1)
- allGenes[idx] = currentGene;
+ if ( (*allGenes)[idx] !=NULL )
idx++;
+ //printf("allGenes[0] is %d\n",(*allGenes)[0]);
+ //printf("allGenes[1] is %d\n",(*allGenes)[1]);
+ //printf("Skipped %d lines.\n",skippedLinesCounter);
+
free(chr);
free(blah);
free(id);
free(xy);
free(strand);
free(xz);
+
+ return numGenes;
}
+