+ wrote faster parser / file processing tools in C
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 18 Dec 2007 14:44:40 +0000 (14:44 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 18 Dec 2007 14:44:40 +0000 (14:44 +0000)
TODO
+ finish solexa reads parser

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7164 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/Makefile [new file with mode: 0644]
tools/data_tools/PyGff.py
tools/data_tools/createTestSet.py
tools/data_tools/datastructures.c [new file with mode: 0644]
tools/data_tools/datastructures.h [new file with mode: 0644]
tools/data_tools/filterReads.c [new file with mode: 0644]
tools/data_tools/parseGff.py
tools/data_tools/parseSolexa.py
tools/data_tools/parser.c [new file with mode: 0644]

diff --git a/tools/data_tools/Makefile b/tools/data_tools/Makefile
new file mode 100644 (file)
index 0000000..71b500f
--- /dev/null
@@ -0,0 +1,9 @@
+
+SRCS= parser.c\
+       datastructures.c
+
+                         
+OBJS = $(SRCS:%.cpp=%.o)
+
+all: $(OBJS)
+       gcc -Wall -o filterReads $(OBJS) filterReads.c
index 618c2aa..b463512 100644 (file)
@@ -3,15 +3,6 @@
 
 class Gene:
    
-   chromosome = ''
-   strand = ''
-   start = -1
-   stop = -1
-   exons = []
-   fiveUTR = []
-   threeUTR = []
-
-
    def __init__(self,chr,begin,end,strand):
       assert chr != ''
       assert begin >= 0 and begin <= end and end >= 0
@@ -21,7 +12,9 @@ class Gene:
       self.start = begin
       self.stop = end
       self.strand = strand
-
+      self.exons = []
+      self.fiveUTR = []
+      self.threeUTR = []
 
    def addExon(self,start,stop):
       self.exons.append((start,stop))
index 03c9d06..67f6bf9 100644 (file)
@@ -12,7 +12,7 @@ info=\
 You have to supply two files. One containing the gff information and the other
 one containing the information of the Solexa(R) reads.
 
-Usage: ./createTestSet.py gff.pickle reads.pickle
+Usage: ./createTestSet.py gff.pickle reads.txt
 """
 
 doc=\
@@ -21,12 +21,51 @@ Make sure that you split the data into chromosome files beforehand because
 this method does not check the chromosome info
 """
 
-def check(annot,reads):
-   print len(annot)
-   print len(reads)
+def printRow(row):
+   print "".join([elem+' ' for elem in row]).strip()
 
-   
+def check(annotation,reads):
+   skippedReadsCounter = 0
 
+   reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
+   for pos,row in enumerate(reader):
+      if len(row) != 12:
+         skippedReadsCounter += 1
+         continue
+
+      if pos % 10000 == 0:
+         print >> sys.stderr, pos
+
+      chr = row[0]
+      pos = int(row[1])
+      readStart = pos
+      readStop = pos+35
+
+      for gene in annotation:
+         # read inside current gene positions
+
+         if gene.start <= readStart and pos+35 <= gene.stop:
+            overlapping = True
+            
+            for exonStart,exonStop in gene.exons:
+               if exonStart <= readStart and readStop <= exonStop:
+                  overlapping = False
+                  break
+         
+            if overlapping:
+               printRow(row)
+               continue
+
+         # read is overlapping with gene boundaries
+         if pos < gene.start and pos+35 > gene.start:
+            printRow(row)
+            continue
+
+         if pos < gene.stop and pos+35 > gene.stop:
+            printRow(row)
+            continue
+
+   print skippedReadsCounter     
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3, info
@@ -34,10 +73,9 @@ if __name__ == '__main__':
    readsFile = sys.argv[2]
    assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
    assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
-   print doc
+   print >> sys.stderr, doc
 
    annotation = cPickle.load(open(annotFile))
-   reads = cPickle.load(open(readsFile))
+   reads = open(readsFile)
 
    check(annotation,reads)
-
diff --git a/tools/data_tools/datastructures.c b/tools/data_tools/datastructures.c
new file mode 100644 (file)
index 0000000..2f464c4
--- /dev/null
@@ -0,0 +1,33 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+struct gene* gene_alloc(void) {
+   struct gene* newGene = (struct gene*) malloc(sizeof(struct gene));
+   newGene->exon_starts = malloc(2*sizeof(int));
+   newGene->exon_stops = malloc(2*sizeof(int));
+   newGene->num_exons = 0;
+   newGene->max_exons = 2;
+   newGene->start = -1;
+
+   return newGene;
+}
+
+void free_gene(struct gene* oldGene) {
+   free(oldGene->exon_starts);
+   free(oldGene->exon_stops);
+}
+
+void add_exon(struct gene* currentGene,int start, int stop) {
+
+   if (currentGene->num_exons < currentGene->num_exons) {
+      int idx = currentGene->num_exons;
+      currentGene->exon_starts[idx] = start;
+      currentGene->exon_stops[idx] = stop;
+      currentGene->num_exons++;
+   } else {
+      currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
+      currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
+   }
+}
diff --git a/tools/data_tools/datastructures.h b/tools/data_tools/datastructures.h
new file mode 100644 (file)
index 0000000..64c48c1
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef __DATA_STRUCTURES_H__
+#define __DATA_STRUCTURES_H__
+
+struct gene {
+   int start;
+   int stop;
+   int* exon_starts;
+   int* exon_stops;
+   int num_exons;
+   int max_exons;
+   char strand;
+};
+
+struct read {
+   int start;
+   int stop;
+   char* seq;
+   char strand;
+
+   int* prb;
+   int* cprb;
+   int* chastity;
+};
+
+struct gene* gene_alloc(void);
+void free_gene(struct gene* oldGene);
+void add_exon(struct gene* currentGene,int start, int stop);
+
+#endif // __DATA_STRUCTURES_H__
diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c
new file mode 100644 (file)
index 0000000..4e7ec9f
--- /dev/null
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////
+// The purpose of this program is to read a gff and a    
+// solexa reads file and create a data set used by QPalma. 
+//
+//
+//
+//
+////////////////////////////////////////////////////////////
+
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char* filename,FILE* fid,struct gene* allGenes);
+
+static char *info = "Usage is:\n./filterReads gff reads";
+
+int main(int argc, char* argv[]) {
+
+   if(argc != 3) {
+      printf("%s\n",info);
+      exit(EXIT_FAILURE);
+   }
+
+   //static size_t page_size;
+   // page_size = (size_t) sysconf (_SC_PAGESIZE);
+   int filenameSize = 256;
+   char *gff_filename = malloc(sizeof(char)*filenameSize);
+   char *reads_filename = malloc(sizeof(char)*filenameSize);
+
+   strncpy(gff_filename,argv[1],filenameSize);
+   strncpy(reads_filename,argv[2],filenameSize);
+
+   FILE *gff_fs = fopen(gff_filename,"r");
+   FILE *reads_fs = fopen(reads_filename,"r");
+
+   if(gff_fs == NULL) {
+      printf("Error: Could not open file: %s",gff_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   if(reads_fs == NULL) {
+      printf("Error: Could not open file: %s",reads_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   struct gene **allGenes;
+   parse_gff(gff_filename,gff_fs,allGenes);
+
+
+   int reads_fid = fileno(reads_fs);
+   size_t mmapAreaSize = 16;
+   void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
+   if((long int) reads_area == -1)
+      printf("mmapping failed!\n");
+
+   char *test_string = malloc(sizeof(char)*10);
+   strncpy(test_string,(char*)reads_area,10);
+   //printf("Reads area at %s\n",test_string);
+   
+   int status = munmap(reads_area,mmapAreaSize);
+   if(status != 0)
+      printf("munmap failed!\n");
+
+   status = fclose(gff_fs);
+   status += fclose(reads_fs);
+   if(status != 0)
+      printf("closing of filestreams failed!\n");
+      
+   printf("sizeof(struct gene) %d\n",sizeof(struct gene));
+   printf("sizeof(struct read) %d\n",sizeof(struct read));
+
+
+   free(gff_filename);
+   free(reads_filename);
+   return 0;
+}
index 5329b8e..a721633 100644 (file)
@@ -6,6 +6,7 @@ import os.path
 import csv
 from PyGff import *
 import cPickle
+import copy
 
 def parse(gff_fid,pickle_fid):
    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
@@ -27,7 +28,6 @@ def parse(gff_fid,pickle_fid):
       if id == 'gene':
          if currentGene != None:
             allGenes.append(currentGene)
-            currentGene = None
          
          currentGene = Gene(chr,start,stop,strand)
 
@@ -41,6 +41,7 @@ def parse(gff_fid,pickle_fid):
          pass
 
       elif id == 'exon':
+         assert currentGene != None
          currentGene.addExon(start,stop)
 
       elif id == 'CDS':
@@ -55,13 +56,16 @@ def parse(gff_fid,pickle_fid):
       elif id == 'pseudogenic_transcript':
          pass
 
-      elif id == 'snoRNA':
+      elif id == 'miRNA':
          pass
 
-      elif id == 'snRNA':
+      elif id == 'rRNA':
          pass
 
-      elif id == 'miRNA':
+      elif id == 'snoRNA':
+         pass
+
+      elif id == 'snRNA':
          pass
 
       elif id == 'tRNA':
@@ -71,12 +75,13 @@ def parse(gff_fid,pickle_fid):
          if currentGene != None:
             allGenes.append(currentGene)
             currentGene = None
-
       else:
          assert False, 'Error: Unknown identifier \'%s\'' % id
 
-   cPickle.dump(allGenes,pickle_fid)
+   if currentGene != None:
+      allGenes.append(currentGene)
 
+   cPickle.dump(allGenes,pickle_fid)
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3
index f562cbd..dd62b37 100644 (file)
@@ -51,4 +51,3 @@ if __name__ == '__main__':
    solexa_fid = open(solexaFile)
    pickle_fid = open(pickleFile,'w+')
    parse(solexa_fid,pickle_fid)
-   
diff --git a/tools/data_tools/parser.c b/tools/data_tools/parser.c
new file mode 100644 (file)
index 0000000..e686cf3
--- /dev/null
@@ -0,0 +1,89 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "datastructures.h"
+
+void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
+
+   int buffer_size = 256;
+   char* chr   = malloc(sizeof(char)*buffer_size);
+   char* blah  = malloc(sizeof(char)*buffer_size);
+   char* id    = malloc(sizeof(char)*buffer_size);
+   char* desc    = malloc(sizeof(char)*buffer_size);
+   int start  = 0;
+   int stop   = 0;
+   char* xy    = malloc(sizeof(char)*4);
+   char* strand = malloc(sizeof(char)*4);
+   char* xz    = malloc(sizeof(char)*4);
+
+   // do one pass through the gff line to determine the number of
+   // genes
+   int numGenes = 0;
+
+   int status = 0;
+   while(1) {
+      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+      if(status == EOF)
+         break;
+
+      if ( status > 5 && strcmp(id,"gene")==0)
+         numGenes++;
+   }
+   freopen(filename,"r",fid);
+   printf("Found %d genes!\n",numGenes);
+
+   allGenes = malloc(sizeof(struct gene)*numGenes);
+   struct gene *currentGene = gene_alloc();
+
+   int skippedLinesCounter = 0;
+   int idx = 0;
+   while(1) {
+      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
+      if(status == EOF)
+         break;
+
+      if (status < 7) {
+         skippedLinesCounter++;
+         continue;
+      }
+
+      if (strcmp(id,"gene")==0) {
+         if ( currentGene->start != -1)
+            allGenes[idx] = currentGene;
+            idx++;
+       
+         currentGene = gene_alloc();
+         currentGene->start = start;
+         currentGene->stop = stop;
+         currentGene->strand = (*strand);
+         //printf("gene start/stop: %d/%d\n",start,stop);
+         continue;
+      }
+
+      if (strcmp(id,"exon")==0) {
+         add_exon(currentGene,start,stop);
+         //printf("exon start/stop: %d/%d\n",start,stop);
+         continue;
+      }
+
+      if (strcmp(id,"pseudogene")==0) {
+         if ( currentGene->start != -1)
+            allGenes[idx] = currentGene;
+            idx++;
+      }
+   }
+
+   if ( currentGene->start != -1)
+      allGenes[idx] = currentGene;
+      idx++;
+
+   free(chr);
+   free(blah);
+   free(id);
+   free(desc);
+   free(xy);
+   free(strand);
+   free(xz);
+}