--- /dev/null
+
+SRCS= parser.c\
+ datastructures.c
+
+
+OBJS = $(SRCS:%.cpp=%.o)
+
+all: $(OBJS)
+ gcc -Wall -o filterReads $(OBJS) filterReads.c
class Gene:
- chromosome = ''
- strand = ''
- start = -1
- stop = -1
- exons = []
- fiveUTR = []
- threeUTR = []
-
-
def __init__(self,chr,begin,end,strand):
assert chr != ''
assert begin >= 0 and begin <= end and end >= 0
self.start = begin
self.stop = end
self.strand = strand
-
+ self.exons = []
+ self.fiveUTR = []
+ self.threeUTR = []
def addExon(self,start,stop):
self.exons.append((start,stop))
You have to supply two files. One containing the gff information and the other
one containing the information of the Solexa(R) reads.
-Usage: ./createTestSet.py gff.pickle reads.pickle
+Usage: ./createTestSet.py gff.pickle reads.txt
"""
doc=\
this method does not check the chromosome info
"""
-def check(annot,reads):
- print len(annot)
- print len(reads)
+def printRow(row):
+ print "".join([elem+' ' for elem in row]).strip()
-
+def check(annotation,reads):
+ skippedReadsCounter = 0
+ reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
+ for pos,row in enumerate(reader):
+ if len(row) != 12:
+ skippedReadsCounter += 1
+ continue
+
+ if pos % 10000 == 0:
+ print >> sys.stderr, pos
+
+ chr = row[0]
+ pos = int(row[1])
+ readStart = pos
+ readStop = pos+35
+
+ for gene in annotation:
+ # read inside current gene positions
+
+ if gene.start <= readStart and pos+35 <= gene.stop:
+ overlapping = True
+
+ for exonStart,exonStop in gene.exons:
+ if exonStart <= readStart and readStop <= exonStop:
+ overlapping = False
+ break
+
+ if overlapping:
+ printRow(row)
+ continue
+
+ # read is overlapping with gene boundaries
+ if pos < gene.start and pos+35 > gene.start:
+ printRow(row)
+ continue
+
+ if pos < gene.stop and pos+35 > gene.stop:
+ printRow(row)
+ continue
+
+ print skippedReadsCounter
if __name__ == '__main__':
assert len(sys.argv) >= 3, info
readsFile = sys.argv[2]
assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
- print doc
+ print >> sys.stderr, doc
annotation = cPickle.load(open(annotFile))
- reads = cPickle.load(open(readsFile))
+ reads = open(readsFile)
check(annotation,reads)
-
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+struct gene* gene_alloc(void) {
+ struct gene* newGene = (struct gene*) malloc(sizeof(struct gene));
+ newGene->exon_starts = malloc(2*sizeof(int));
+ newGene->exon_stops = malloc(2*sizeof(int));
+ newGene->num_exons = 0;
+ newGene->max_exons = 2;
+ newGene->start = -1;
+
+ return newGene;
+}
+
+void free_gene(struct gene* oldGene) {
+ free(oldGene->exon_starts);
+ free(oldGene->exon_stops);
+}
+
+void add_exon(struct gene* currentGene,int start, int stop) {
+
+ if (currentGene->num_exons < currentGene->num_exons) {
+ int idx = currentGene->num_exons;
+ currentGene->exon_starts[idx] = start;
+ currentGene->exon_stops[idx] = stop;
+ currentGene->num_exons++;
+ } else {
+ currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
+ currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
+ }
+}
--- /dev/null
+#ifndef __DATA_STRUCTURES_H__
+#define __DATA_STRUCTURES_H__
+
+struct gene {
+ int start;
+ int stop;
+ int* exon_starts;
+ int* exon_stops;
+ int num_exons;
+ int max_exons;
+ char strand;
+};
+
+struct read {
+ int start;
+ int stop;
+ char* seq;
+ char strand;
+
+ int* prb;
+ int* cprb;
+ int* chastity;
+};
+
+struct gene* gene_alloc(void);
+void free_gene(struct gene* oldGene);
+void add_exon(struct gene* currentGene,int start, int stop);
+
+#endif // __DATA_STRUCTURES_H__
--- /dev/null
+////////////////////////////////////////////////////////////
+// The purpose of this program is to read a gff and a
+// solexa reads file and create a data set used by QPalma.
+//
+//
+//
+//
+////////////////////////////////////////////////////////////
+
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+
+static char *info = "Usage is:\n./filterReads gff reads";
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 3) {
+ printf("%s\n",info);
+ exit(EXIT_FAILURE);
+ }
+
+ //static size_t page_size;
+ // page_size = (size_t) sysconf (_SC_PAGESIZE);
+ int filenameSize = 256;
+ char *gff_filename = malloc(sizeof(char)*filenameSize);
+ char *reads_filename = malloc(sizeof(char)*filenameSize);
+
+ strncpy(gff_filename,argv[1],filenameSize);
+ strncpy(reads_filename,argv[2],filenameSize);
+
+ FILE *gff_fs = fopen(gff_filename,"r");
+ FILE *reads_fs = fopen(reads_filename,"r");
+
+ if(gff_fs == NULL) {
+ printf("Error: Could not open file: %s",gff_filename);
+ exit(EXIT_FAILURE);
+ }
+
+ if(reads_fs == NULL) {
+ printf("Error: Could not open file: %s",reads_filename);
+ exit(EXIT_FAILURE);
+ }
+
+ struct gene **allGenes;
+ parse_gff(gff_filename,gff_fs,allGenes);
+
+
+ int reads_fid = fileno(reads_fs);
+ size_t mmapAreaSize = 16;
+ void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+ if((long int) reads_area == -1)
+ printf("mmapping failed!\n");
+
+ char *test_string = malloc(sizeof(char)*10);
+ strncpy(test_string,(char*)reads_area,10);
+ //printf("Reads area at %s\n",test_string);
+
+ int status = munmap(reads_area,mmapAreaSize);
+ if(status != 0)
+ printf("munmap failed!\n");
+
+ status = fclose(gff_fs);
+ status += fclose(reads_fs);
+ if(status != 0)
+ printf("closing of filestreams failed!\n");
+
+ printf("sizeof(struct gene) %d\n",sizeof(struct gene));
+ printf("sizeof(struct read) %d\n",sizeof(struct read));
+
+
+ free(gff_filename);
+ free(reads_filename);
+ return 0;
+}
import csv
from PyGff import *
import cPickle
+import copy
def parse(gff_fid,pickle_fid):
reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
if id == 'gene':
if currentGene != None:
allGenes.append(currentGene)
- currentGene = None
currentGene = Gene(chr,start,stop,strand)
pass
elif id == 'exon':
+ assert currentGene != None
currentGene.addExon(start,stop)
elif id == 'CDS':
elif id == 'pseudogenic_transcript':
pass
- elif id == 'snoRNA':
+ elif id == 'miRNA':
pass
- elif id == 'snRNA':
+ elif id == 'rRNA':
pass
- elif id == 'miRNA':
+ elif id == 'snoRNA':
+ pass
+
+ elif id == 'snRNA':
pass
elif id == 'tRNA':
if currentGene != None:
allGenes.append(currentGene)
currentGene = None
-
else:
assert False, 'Error: Unknown identifier \'%s\'' % id
- cPickle.dump(allGenes,pickle_fid)
+ if currentGene != None:
+ allGenes.append(currentGene)
+ cPickle.dump(allGenes,pickle_fid)
if __name__ == '__main__':
assert len(sys.argv) >= 3
solexa_fid = open(solexaFile)
pickle_fid = open(pickleFile,'w+')
parse(solexa_fid,pickle_fid)
-
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
+
+ int buffer_size = 256;
+ char* chr = malloc(sizeof(char)*buffer_size);
+ char* blah = malloc(sizeof(char)*buffer_size);
+ char* id = malloc(sizeof(char)*buffer_size);
+ char* desc = malloc(sizeof(char)*buffer_size);
+ int start = 0;
+ int stop = 0;
+ char* xy = malloc(sizeof(char)*4);
+ char* strand = malloc(sizeof(char)*4);
+ char* xz = malloc(sizeof(char)*4);
+
+ // do one pass through the gff line to determine the number of
+ // genes
+ int numGenes = 0;
+
+ int status = 0;
+ while(1) {
+ status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+ if(status == EOF)
+ break;
+
+ if ( status > 5 && strcmp(id,"gene")==0)
+ numGenes++;
+ }
+ freopen(filename,"r",fid);
+
+ printf("Found %d genes!\n",numGenes);
+
+ allGenes = malloc(sizeof(struct gene)*numGenes);
+ struct gene *currentGene = gene_alloc();
+
+ int skippedLinesCounter = 0;
+ int idx = 0;
+ while(1) {
+ status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+ if(status == EOF)
+ break;
+
+ if (status < 7) {
+ skippedLinesCounter++;
+ continue;
+ }
+
+ if (strcmp(id,"gene")==0) {
+ if ( currentGene->start != -1)
+ allGenes[idx] = currentGene;
+ idx++;
+
+ currentGene = gene_alloc();
+ currentGene->start = start;
+ currentGene->stop = stop;
+ currentGene->strand = (*strand);
+ //printf("gene start/stop: %d/%d\n",start,stop);
+ continue;
+ }
+
+ if (strcmp(id,"exon")==0) {
+ add_exon(currentGene,start,stop);
+ //printf("exon start/stop: %d/%d\n",start,stop);
+ continue;
+ }
+
+ if (strcmp(id,"pseudogene")==0) {
+ if ( currentGene->start != -1)
+ allGenes[idx] = currentGene;
+ idx++;
+ }
+ }
+
+ if ( currentGene->start != -1)
+ allGenes[idx] = currentGene;
+ idx++;
+
+ free(chr);
+ free(blah);
+ free(id);
+ free(desc);
+ free(xy);
+ free(strand);
+ free(xz);
+}