//
//
//
-//
////////////////////////////////////////////////////////////
#include <sys/mman.h>
#include "datastructures.h"
-void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
+
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
static char *info = "Usage is:\n./filterReads gff reads";
//static size_t page_size;
// page_size = (size_t) sysconf (_SC_PAGESIZE);
+ int status;
int filenameSize = 256;
char *gff_filename = malloc(sizeof(char)*filenameSize);
char *reads_filename = malloc(sizeof(char)*filenameSize);
exit(EXIT_FAILURE);
}
- struct gene **allGenes;
- parse_gff(gff_filename,gff_fs,allGenes);
+ struct gene** allGenes;
+ int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
+ status = fclose(gff_fs);
+ if(status != 0)
+ printf("closing of gff filestream failed!\n");
+
+ process_reads(reads_fs,&allGenes,numGenes);
+
+ status += fclose(reads_fs);
+ if(status != 0)
+ printf("closing of filestreams failed!\n");
+
+ //free(allGenes);
+ free(gff_filename);
+ free(reads_filename);
+ return 0;
+}
+
+void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
+ int status;
+
+ int buffer_size= 64;
+ int chr = 0;
+ int pos = 0;
+ char* seq = malloc(sizeof(char)*buffer_size);
+ int id = 0;
+ char strand = ' ';
+ int mismatch = 0;
+ int repetition = 0;
+ int size = 0;
+ int cut = 0;
+ char* prb = malloc(sizeof(char)*buffer_size);
+ char* cal_prb = malloc(sizeof(char)*buffer_size);
+ char* chastity = malloc(sizeof(char)*buffer_size);
int reads_fid = fileno(reads_fs);
- size_t mmapAreaSize = 16;
+ size_t mmapAreaSize = 20000;
+
void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
if((long int) reads_area == -1)
printf("mmapping failed!\n");
- char *test_string = malloc(sizeof(char)*10);
- strncpy(test_string,(char*)reads_area,10);
- //printf("Reads area at %s\n",test_string);
-
- int status = munmap(reads_area,mmapAreaSize);
+ void* linePtr = reads_area;
+ char* current_line = malloc(sizeof(char)*256);
+
+ int SIZE = 500;
+ // initialize boundary arrays
+ void** upstream_end = malloc(sizeof(void*)*SIZE);
+ void** upstream_overlap = malloc(sizeof(void*)*SIZE);
+ void** downstream_start = malloc(sizeof(void*)*SIZE);
+ void** downstream_overlap= malloc(sizeof(void*)*SIZE);
+ int ue,uo,ds,dov;
+ ue = uo = ds = dov = 0;
+
+ int skippedLinesCounter = 0;
+
+ int prev_exon_stop = -1;
+ int cur_exon_start = -1;
+
+ int gene_idx;
+ for (gene_idx = 0; gene_idx < numGenes;) {
+ struct gene* currentGene = (*allGenes)[gene_idx];
+
+ int exon_idx;
+ for(exon_idx=1;exon_idx<currentGene->num_exons;exon_idx++) {
+
+ combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
+
+ prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+ cur_exon_start = currentGene->exon_starts[exon_idx];
+
+ if (cur_exon_start - prev_exon_stop < 6)
+ continue;
+
+ ue = uo = ds = dov = 0;
+
+ label:
+
+ current_line = strncpy(current_line,linePtr,256);
+ if (strcmp(current_line,"") == 0)
+ goto end;
+
+ status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
+ if (status < 12) {
+ //printf("skipped line is %s\n",current_line);
+ skippedLinesCounter++;
+ }
+
+ //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity);
+ //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop);
+
+ if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
+ //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr);
+ // upstream ending
+ if (pos+35 == prev_exon_stop) {
+ upstream_end[ue] = linePtr;
+ ue++;
+ } else {
+
+ // upstream overlapping
+ if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
+ upstream_overlap[uo] = linePtr;
+ uo++;
+ }
+ }
+
+ // downstream starting
+ if (pos == cur_exon_start) {
+ downstream_start[ds] = linePtr;
+ ds++;
+ } else {
+
+ // downstream overlapping
+ if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
+ downstream_overlap[dov] = linePtr;
+ dov++;
+ }
+ }
+
+ while (*(char*)linePtr != '\n') linePtr++;
+ linePtr++;
+ goto label;
+
+ } else {
+
+ if (currentGene->stop < pos) {
+ //printf("read is too far downstream\n");
+ break;
+ }
+
+ //printf("read is too far upstream\n");
+ while (*(char*)linePtr != '\n') linePtr++;
+ linePtr++;
+ goto label;
+ }
+ }
+
+ gene_idx++;
+ }
+
+ end:
+
+ free(upstream_end);
+ free(upstream_overlap);
+ free(downstream_start);
+ free(downstream_overlap);
+
+ printf("skipped %d lines.\n",skippedLinesCounter);
+
+ status = munmap(reads_area,mmapAreaSize);
if(status != 0)
printf("munmap failed!\n");
- status = fclose(gff_fs);
- status += fclose(reads_fs);
- if(status != 0)
- printf("closing of filestreams failed!\n");
+ free(seq);
+ free(prb);
+ free(cal_prb);
+ free(chastity);
+}
+
+void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
+ //printf("up_/down_size %d %d\n",up_size,down_size);
+
+ if (up_size == 0 || down_size == 0 || exon_stop == -1)
+ return;
+
+ int up_idx, down_idx, status;
+ char* upstream_line = malloc(sizeof(char)*256);
+ char* downstream_line = malloc(sizeof(char)*256);
+
+ int buffer_size= 64;
+
+ int up_chr = 0;
+ int up_pos = 0;
+ char* up_seq = malloc(sizeof(char)*buffer_size);
+ int up_id = 0;
+ char up_strand = ' ';
+ int up_mismatch = 0;
+ int up_repetition = 0;
+ int up_sz = 0;
+ int up_cut = 0;
+ char* up_prb = malloc(sizeof(char)*buffer_size);
+ char* up_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* up_chastity = malloc(sizeof(char)*buffer_size);
+
+ int down_chr = 0;
+ int down_pos = 0;
+ char* down_seq = malloc(sizeof(char)*buffer_size);
+ int down_id = 0;
+ char down_strand = ' ';
+ int down_mismatch = 0;
+ int down_repetition = 0;
+ int down_sz = 0;
+ int down_cut = 0;
+ char* down_prb = malloc(sizeof(char)*buffer_size);
+ char* down_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* down_chastity = malloc(sizeof(char)*buffer_size);
+
+ int new_chr = 0;
+ int new_pos = 0;
+ char* new_seq = malloc(sizeof(char)*buffer_size);
+ int new_id = 0;
+ char new_strand = ' ';
+ char* new_prb = malloc(sizeof(char)*buffer_size);
+ char* new_cal_prb = malloc(sizeof(char)*buffer_size);
+ char* new_chastity = malloc(sizeof(char)*buffer_size);
+
+ up_idx=0;
+ down_idx=0;
+ while(1) {
+ //printf("up_/down_idx %d %d\n",up_idx,down_idx);
+
+ if (up_idx == up_size || down_idx == down_size)
+ break;
+
+ strncpy(upstream_line,upstream[up_idx],256);
+ status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
+ &up_cut,up_prb,up_cal_prb,up_chastity);
- printf("sizeof(struct gene) %d\n",sizeof(struct gene));
- printf("sizeof(struct read) %d\n",sizeof(struct read));
+ strncpy(downstream_line,downstream[down_idx],256);
+ status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
+ &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
+ &down_cut,down_prb,down_cal_prb,down_chastity);
+ new_prb[0] = '\0';
+ new_cal_prb[0] = '\0';
+ new_chastity[0] = '\0';
+
+ // is downstream overlapping
+ if (up_pos+35 == exon_stop) {
+ int overlap = exon_start - down_pos;
+ //printf("overlap is %d\n",overlap);
+ //printf("pos are: %d %d\n",up_pos,down_pos);
- free(gff_filename);
- free(reads_filename);
- return 0;
+ strncat(new_prb,up_prb+(36-overlap),overlap);
+ strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
+ strncat(new_chastity,up_chastity+(36-overlap),overlap);
+
+ strncat(new_prb,down_prb+overlap,36-overlap);
+ strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
+ strncat(new_chastity,down_chastity+overlap,36-overlap);
+
+ //printf("up_prb: %s\n",up_prb);
+ //printf("down_prb: %s\n",down_prb);
+ //printf("new_prb: %s\n",new_prb);
+
+ // is upstream overlapping
+ } else {
+
+ }
+
+ up_idx++;
+ down_idx++;
+ }
}
+
+/*
+ * TODO
+ * - Check strand
+ * - check repetition
+ *
+ */