+ wrote faster parser / file processing tools in C
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 18 Dec 2007 14:44:40 +0000 (14:44 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 18 Dec 2007 14:44:40 +0000 (14:44 +0000)
TODO
+ finish solexa reads parser

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7164 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile [new file with mode: 0644]
tools/data_tools/PyGff.py
tools/data_tools/createTestSet.py
tools/data_tools/datastructures.c [new file with mode: 0644]
tools/data_tools/datastructures.h [new file with mode: 0644]
tools/data_tools/filterReads.c [new file with mode: 0644]
tools/data_tools/parseGff.py
tools/data_tools/parseSolexa.py
tools/data_tools/parser.c [new file with mode: 0644]

diff --git a/tools/data_tools/Makefile b/tools/data_tools/Makefile
new file mode 100644 (file)
index 0000000..71b500f
--- /dev/null
@@ -0,0 +1,9 @@
+
+SRCS= parser.c\
+       datastructures.c
+
+                         
+OBJS = $(SRCS:%.cpp=%.o)
+
+all: $(OBJS)
+       gcc -Wall -o filterReads $(OBJS) filterReads.c
index 618c2aa..b463512 100644 (file)
@@ -3,15 +3,6 @@
 
 class Gene:
    
 
 class Gene:
    
-   chromosome = ''
-   strand = ''
-   start = -1
-   stop = -1
-   exons = []
-   fiveUTR = []
-   threeUTR = []
-
-
    def __init__(self,chr,begin,end,strand):
       assert chr != ''
       assert begin >= 0 and begin <= end and end >= 0
    def __init__(self,chr,begin,end,strand):
       assert chr != ''
       assert begin >= 0 and begin <= end and end >= 0
@@ -21,7 +12,9 @@ class Gene:
       self.start = begin
       self.stop = end
       self.strand = strand
       self.start = begin
       self.stop = end
       self.strand = strand
-
+      self.exons = []
+      self.fiveUTR = []
+      self.threeUTR = []
 
    def addExon(self,start,stop):
       self.exons.append((start,stop))
 
    def addExon(self,start,stop):
       self.exons.append((start,stop))
index 03c9d06..67f6bf9 100644 (file)
@@ -12,7 +12,7 @@ info=\
 You have to supply two files. One containing the gff information and the other
 one containing the information of the Solexa(R) reads.
 
 You have to supply two files. One containing the gff information and the other
 one containing the information of the Solexa(R) reads.
 
-Usage: ./createTestSet.py gff.pickle reads.pickle
+Usage: ./createTestSet.py gff.pickle reads.txt
 """
 
 doc=\
 """
 
 doc=\
@@ -21,12 +21,51 @@ Make sure that you split the data into chromosome files beforehand because
 this method does not check the chromosome info
 """
 
 this method does not check the chromosome info
 """
 
-def check(annot,reads):
-   print len(annot)
-   print len(reads)
+def printRow(row):
+   print "".join([elem+' ' for elem in row]).strip()
 
 
-   
+def check(annotation,reads):
+   skippedReadsCounter = 0
 
 
+   reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
+   for pos,row in enumerate(reader):
+      if len(row) != 12:
+         skippedReadsCounter += 1
+         continue
+
+      if pos % 10000 == 0:
+         print >> sys.stderr, pos
+
+      chr = row[0]
+      pos = int(row[1])
+      readStart = pos
+      readStop = pos+35
+
+      for gene in annotation:
+         # read inside current gene positions
+
+         if gene.start <= readStart and pos+35 <= gene.stop:
+            overlapping = True
+            
+            for exonStart,exonStop in gene.exons:
+               if exonStart <= readStart and readStop <= exonStop:
+                  overlapping = False
+                  break
+         
+            if overlapping:
+               printRow(row)
+               continue
+
+         # read is overlapping with gene boundaries
+         if pos < gene.start and pos+35 > gene.start:
+            printRow(row)
+            continue
+
+         if pos < gene.stop and pos+35 > gene.stop:
+            printRow(row)
+            continue
+
+   print skippedReadsCounter     
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3, info
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3, info
@@ -34,10 +73,9 @@ if __name__ == '__main__':
    readsFile = sys.argv[2]
    assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
    assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
    readsFile = sys.argv[2]
    assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
    assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
-   print doc
+   print >> sys.stderr, doc
 
    annotation = cPickle.load(open(annotFile))
 
    annotation = cPickle.load(open(annotFile))
-   reads = cPickle.load(open(readsFile))
+   reads = open(readsFile)
 
    check(annotation,reads)
 
    check(annotation,reads)
-
diff --git a/tools/data_tools/datastructures.c b/tools/data_tools/datastructures.c
new file mode 100644 (file)
index 0000000..2f464c4
--- /dev/null
@@ -0,0 +1,33 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+struct gene* gene_alloc(void) {
+   struct gene* newGene = (struct gene*) malloc(sizeof(struct gene));
+   newGene->exon_starts = malloc(2*sizeof(int));
+   newGene->exon_stops = malloc(2*sizeof(int));
+   newGene->num_exons = 0;
+   newGene->max_exons = 2;
+   newGene->start = -1;
+
+   return newGene;
+}
+
+void free_gene(struct gene* oldGene) {
+   free(oldGene->exon_starts);
+   free(oldGene->exon_stops);
+}
+
+void add_exon(struct gene* currentGene,int start, int stop) {
+
+   if (currentGene->num_exons < currentGene->num_exons) {
+      int idx = currentGene->num_exons;
+      currentGene->exon_starts[idx] = start;
+      currentGene->exon_stops[idx] = stop;
+      currentGene->num_exons++;
+   } else {
+      currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
+      currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
+   }
+}
diff --git a/tools/data_tools/datastructures.h b/tools/data_tools/datastructures.h
new file mode 100644 (file)
index 0000000..64c48c1
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef __DATA_STRUCTURES_H__
+#define __DATA_STRUCTURES_H__
+
+struct gene {
+   int start;
+   int stop;
+   int* exon_starts;
+   int* exon_stops;
+   int num_exons;
+   int max_exons;
+   char strand;
+};
+
+struct read {
+   int start;
+   int stop;
+   char* seq;
+   char strand;
+
+   int* prb;
+   int* cprb;
+   int* chastity;
+};
+
+struct gene* gene_alloc(void);
+void free_gene(struct gene* oldGene);
+void add_exon(struct gene* currentGene,int start, int stop);
+
+#endif // __DATA_STRUCTURES_H__
diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c
new file mode 100644 (file)
index 0000000..4e7ec9f
--- /dev/null
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////
+// The purpose of this program is to read a gff and a    
+// solexa reads file and create a data set used by QPalma. 
+//
+//
+//
+//
+////////////////////////////////////////////////////////////
+
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+
+static char *info = "Usage is:\n./filterReads gff reads";
+
+int main(int argc, char* argv[]) {
+
+   if(argc != 3) {
+      printf("%s\n",info);
+      exit(EXIT_FAILURE);
+   }
+
+   //static size_t page_size;
+   // page_size = (size_t) sysconf (_SC_PAGESIZE);
+   int filenameSize = 256;
+   char *gff_filename = malloc(sizeof(char)*filenameSize);
+   char *reads_filename = malloc(sizeof(char)*filenameSize);
+
+   strncpy(gff_filename,argv[1],filenameSize);
+   strncpy(reads_filename,argv[2],filenameSize);
+
+   FILE *gff_fs = fopen(gff_filename,"r");
+   FILE *reads_fs = fopen(reads_filename,"r");
+
+   if(gff_fs == NULL) {
+      printf("Error: Could not open file: %s",gff_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   if(reads_fs == NULL) {
+      printf("Error: Could not open file: %s",reads_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   struct gene **allGenes;
+   parse_gff(gff_filename,gff_fs,allGenes);
+
+
+   int reads_fid = fileno(reads_fs);
+   size_t mmapAreaSize = 16;
+   void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+   if((long int) reads_area == -1)
+      printf("mmapping failed!\n");
+
+   char *test_string = malloc(sizeof(char)*10);
+   strncpy(test_string,(char*)reads_area,10);
+   //printf("Reads area at %s\n",test_string);
+   
+   int status = munmap(reads_area,mmapAreaSize);
+   if(status != 0)
+      printf("munmap failed!\n");
+
+   status = fclose(gff_fs);
+   status += fclose(reads_fs);
+   if(status != 0)
+      printf("closing of filestreams failed!\n");
+      
+   printf("sizeof(struct gene) %d\n",sizeof(struct gene));
+   printf("sizeof(struct read) %d\n",sizeof(struct read));
+
+
+   free(gff_filename);
+   free(reads_filename);
+   return 0;
+}
index 5329b8e..a721633 100644 (file)
@@ -6,6 +6,7 @@ import os.path
 import csv
 from PyGff import *
 import cPickle
 import csv
 from PyGff import *
 import cPickle
+import copy
 
 def parse(gff_fid,pickle_fid):
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
 
 def parse(gff_fid,pickle_fid):
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -27,7 +28,6 @@ def parse(gff_fid,pickle_fid):
       if id == 'gene':
          if currentGene != None:
             allGenes.append(currentGene)
       if id == 'gene':
          if currentGene != None:
             allGenes.append(currentGene)
-            currentGene = None
          
          currentGene = Gene(chr,start,stop,strand)
 
          
          currentGene = Gene(chr,start,stop,strand)
 
@@ -41,6 +41,7 @@ def parse(gff_fid,pickle_fid):
          pass
 
       elif id == 'exon':
          pass
 
       elif id == 'exon':
+         assert currentGene != None
          currentGene.addExon(start,stop)
 
       elif id == 'CDS':
          currentGene.addExon(start,stop)
 
       elif id == 'CDS':
@@ -55,13 +56,16 @@ def parse(gff_fid,pickle_fid):
       elif id == 'pseudogenic_transcript':
          pass
 
       elif id == 'pseudogenic_transcript':
          pass
 
-      elif id == 'snoRNA':
+      elif id == 'miRNA':
          pass
 
          pass
 
-      elif id == 'snRNA':
+      elif id == 'rRNA':
          pass
 
          pass
 
-      elif id == 'miRNA':
+      elif id == 'snoRNA':
+         pass
+
+      elif id == 'snRNA':
          pass
 
       elif id == 'tRNA':
          pass
 
       elif id == 'tRNA':
@@ -71,12 +75,13 @@ def parse(gff_fid,pickle_fid):
          if currentGene != None:
             allGenes.append(currentGene)
             currentGene = None
          if currentGene != None:
             allGenes.append(currentGene)
             currentGene = None
-
       else:
          assert False, 'Error: Unknown identifier \'%s\'' % id
 
       else:
          assert False, 'Error: Unknown identifier \'%s\'' % id
 
-   cPickle.dump(allGenes,pickle_fid)
+   if currentGene != None:
+      allGenes.append(currentGene)
 
 
+   cPickle.dump(allGenes,pickle_fid)
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3
index f562cbd..dd62b37 100644 (file)
@@ -51,4 +51,3 @@ if __name__ == '__main__':
    solexa_fid = open(solexaFile)
    pickle_fid = open(pickleFile,'w+')
    parse(solexa_fid,pickle_fid)
    solexa_fid = open(solexaFile)
    pickle_fid = open(pickleFile,'w+')
    parse(solexa_fid,pickle_fid)
-   
diff --git a/tools/data_tools/parser.c b/tools/data_tools/parser.c
new file mode 100644 (file)
index 0000000..e686cf3
--- /dev/null
@@ -0,0 +1,89 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
+
+   int buffer_size = 256;
+   char* chr   = malloc(sizeof(char)*buffer_size);
+   char* blah  = malloc(sizeof(char)*buffer_size);
+   char* id    = malloc(sizeof(char)*buffer_size);
+   char* desc    = malloc(sizeof(char)*buffer_size);
+   int start  = 0;
+   int stop   = 0;
+   char* xy    = malloc(sizeof(char)*4);
+   char* strand = malloc(sizeof(char)*4);
+   char* xz    = malloc(sizeof(char)*4);
+
+   // do one pass through the gff line to determine the number of
+   // genes
+   int numGenes = 0;
+
+   int status = 0;
+   while(1) {
+      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+      if(status == EOF)
+         break;
+
+      if ( status > 5 && strcmp(id,"gene")==0)
+         numGenes++;
+   }
+   freopen(filename,"r",fid);
+   printf("Found %d genes!\n",numGenes);
+
+   allGenes = malloc(sizeof(struct gene)*numGenes);
+   struct gene *currentGene = gene_alloc();
+
+   int skippedLinesCounter = 0;
+   int idx = 0;
+   while(1) {
+      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+      if(status == EOF)
+         break;
+
+      if (status < 7) {
+         skippedLinesCounter++;
+         continue;
+      }
+
+      if (strcmp(id,"gene")==0) {
+         if ( currentGene->start != -1)
+            allGenes[idx] = currentGene;
+            idx++;
+       
+         currentGene = gene_alloc();
+         currentGene->start = start;
+         currentGene->stop = stop;
+         currentGene->strand = (*strand);
+         //printf("gene start/stop: %d/%d\n",start,stop);
+         continue;
+      }
+
+      if (strcmp(id,"exon")==0) {
+         add_exon(currentGene,start,stop);
+         //printf("exon start/stop: %d/%d\n",start,stop);
+         continue;
+      }
+
+      if (strcmp(id,"pseudogene")==0) {
+         if ( currentGene->start != -1)
+            allGenes[idx] = currentGene;
+            idx++;
+      }
+   }
+
+   if ( currentGene->start != -1)
+      allGenes[idx] = currentGene;
+      idx++;
+
+   free(chr);
+   free(blah);
+   free(id);
+   free(desc);
+   free(xy);
+   free(strand);
+   free(xz);
+}