+ added generic parser code
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 2 Jun 2008 07:41:24 +0000 (07:41 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 2 Jun 2008 07:41:24 +0000 (07:41 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9327 e1793c9e-67f9-0310-80fc-b846ff1f7b36

ParaParser/.ParaParser.cpp.swp [new file with mode: 0644]
ParaParser/CParaParser.cpp [new file with mode: 0644]
ParaParser/CParaParser.h [new file with mode: 0644]
ParaParser/Makefile [new file with mode: 0644]
ParaParser/ParaParser.cpp [new file with mode: 0644]
ParaParser/ParaParser.h [new file with mode: 0644]
ParaParser/ParaParser.i [new file with mode: 0644]
ParaParser/doc/manual.tex [new file with mode: 0644]
ParaParser/simple_example.py [new file with mode: 0644]
ParaParser/test.data [new file with mode: 0644]

diff --git a/ParaParser/.ParaParser.cpp.swp b/ParaParser/.ParaParser.cpp.swp
new file mode 100644 (file)
index 0000000..35e9c73
Binary files /dev/null and b/ParaParser/.ParaParser.cpp.swp differ
diff --git a/ParaParser/CParaParser.cpp b/ParaParser/CParaParser.cpp
new file mode 100644 (file)
index 0000000..1f0992d
--- /dev/null
@@ -0,0 +1,145 @@
+#include "CParaParser.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+/*
+ * Constructs a parser from the sscanf-style format string `fmt`.
+ *
+ * fmt    - NUL-terminated format string; a private heap copy is stored in
+ *          format_string.
+ * fields - names of the respective fields for the dictionary.
+ *          NOTE(review): this revision accepts the names but does not store
+ *          them anywhere.
+ *
+ * The buffer is sized from the input, so there is no fixed length limit on
+ * `fmt`. The process is terminated if the allocation fails.
+ */
+
+ParaParser::ParaParser(const char* fmt, const char** fields) {
+   (void) fields;
+
+   // +1 for the terminating NUL, which strncpy copies as well because the
+   // bound covers it.
+   const size_t fmt_size = strlen(fmt) + 1;
+
+   format_string = (char*) malloc(sizeof(char)*fmt_size);
+   if (format_string == NULL) {
+      perror("malloc");
+      exit(EXIT_FAILURE);
+   }
+
+   strncpy(format_string,fmt,fmt_size);
+}
+
+/*
+ * 
+ */
+int create_read_from_line(Read* newRead, const char* current_line) {
+   //printf("current line is %s\n",current_line);
+
+   int entries_found = sscanf(current_line,line_format,&(newRead->id),
+   &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
+   newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
+   &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
+
+   if (entries_found != 15) {
+      return -1;
+   }
+
+   // make sequence lowercase but don't destroy brackets
+   Py_ssize_t idx;
+   for(idx=0;idx<strlen(newRead->seq);idx++) {
+      if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
+         newRead->seq[idx] = newRead->seq[idx]+32;
+   }
+
+   if ( newRead->strand == 'D' )
+      newRead->strand = '+';
+
+   if ( newRead->strand == 'P' )
+      newRead->strand = '-';
+
+   return 0;
+}
+
+
+/*
+ * Maps the file `reads_filename` into memory and hands it line by line to
+ * set_item_from_line().
+ *
+ * NOTE(review): in this revision the definition has no return type and no
+ * closing brace, and it refers to names (read_array, id_map, num_reads,
+ * map_idx, Read, set_item_from_line) that are not declared in the visible
+ * code -- this file looks like work in progress.
+ */
+ParaParser::parseFile(const char* reads_filename) {
+
+   // first define some constant strings
+
+   FILE *reads_fs = fopen(reads_filename,"r");
+
+   if(reads_fs == NULL) {
+      printf("Error: Could not open file: %s",reads_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   // fstat on the underlying descriptor yields the file size used for the mapping
+   int reads_fid = fileno(reads_fs);
+   struct stat reads_stat;
+   if ( fstat(reads_fid,&reads_stat) == -1) {
+      perror("fstat");
+      exit(EXIT_FAILURE);
+   }
+
+   off_t reads_filesize = reads_stat.st_size;
+   //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+
+   // ATTENTION this is an overestimator of the reads in the file
+   // it is NOT the exact number
+   // (the estimate is simply the file size divided by 200)
+   int num_init_reads = reads_filesize / 200.0;
+
+   read_array = malloc(sizeof(Read*)*num_init_reads);
+   id_map = malloc(sizeof(unsigned long)*num_init_reads);
+
+   //printf("Found %d reads.",numReads);
+
+   // try to acquire file using mmap
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
+   if (reads_area == MAP_FAILED) {
+      perror("mmap");
+      exit(EXIT_FAILURE);
+   }
+
+   // NOTE(review): only the descriptor is closed; reads_fs is never passed to
+   // fclose in the visible code.
+   close(reads_fid);
+   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+
+   char* lineBeginPtr = (char*) reads_area;
+   char* lineEndPtr = (char*) reads_area;
+   char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
+
+   // Advance to the first newline, then one past it so the line includes '\n'.
+   // NOTE(review): *lineEndPtr is read before the comparison against
+   // end_of_mapped_area, and the increment below can move past the end.
+   while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+   lineEndPtr++;
+
+   // Fixed 512-byte line buffer, zero-filled.
+   // NOTE(review): line_size is not checked against the buffer size.
+   char* current_line = malloc(sizeof(char)*512);
+   memset(current_line,0,512);
+
+   unsigned long line_size = lineEndPtr - lineBeginPtr;
+   strncpy(current_line,lineBeginPtr,line_size);
+   current_line[line_size] = '\0';
+
+   int readCtr = 0;
+   int status = 0;
+
+   num_reads = 0;
+   map_idx = 0;
+
+   // Process lines until the copied line is the empty string.
+   while(1) {
+      if (strcmp(current_line,"") == 0)
+         break;
+
+      status = set_item_from_line(current_line);
+      if (status != 0 )
+         printf("Error while parsing line (status=%d).",status);
+
+      // Move on to the next line: same scan as above, starting after the
+      // previous newline.
+      lineBeginPtr = lineEndPtr;
+      while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+      lineEndPtr++;
+
+      current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+      current_line[lineEndPtr-lineBeginPtr] = '\0';
+
+      readCtr += 1;
+   }
+
+   // clean up
+   status = munmap(reads_area,reads_filesize);
+   if(status != 0)
+      perror("munmap");
+
+   free(current_line);
+
+   return PyInt_FromLong(num_reads);
+// NOTE(review): the two lines below read like member declarations rather than
+// statements, and the function body above is never closed.
+fetchEntry(int id);
+
+~ParaParser();
+
+
+
diff --git a/ParaParser/CParaParser.h b/ParaParser/CParaParser.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/ParaParser/Makefile b/ParaParser/Makefile
new file mode 100644 (file)
index 0000000..b67d52a
--- /dev/null
@@ -0,0 +1,19 @@
+# Source files compiled into the extension module.
+SRCS= ParaParser.cpp
+
+# One object file per source file (.cpp -> .o).
+OBJS = $(SRCS:%.cpp=%.o)
+
+# Alternative flag sets kept for reference (plain optimised build / profiling).
+#CXXFLAGS=-O3 -fPIC
+#CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
+CXXFLAGS=-O3 -fPIC -I/usr/include/python2.5
+
+# Base name shared by the SWIG interface file, the generated wrapper and the module.
+PROJ=ParaParser
+
+# Default target: generate the SWIG wrapper from $(PROJ).i, compile it, link it
+# with the objects into _$(PROJ).so, then import the module from Python as a check.
+all: $(OBJS)
+       swig -c++ -python ${PROJ}.i
+       g++ $(CXXFLAGS) -I/usr/include/python2.5 -c ${PROJ}_wrap.cxx -o ${PROJ}_wrap.o
+       g++ $(CXXFLAGS) -shared -lpython2.5 $(OBJS) ${PROJ}_wrap.o -o _${PROJ}.so
+       python -c "import ${PROJ}"
+
+# Remove build products and the files generated by SWIG.
+clean:
+       rm *.o *.so *.cxx ${PROJ}.py ${PROJ}.pyc
+
diff --git a/ParaParser/ParaParser.cpp b/ParaParser/ParaParser.cpp
new file mode 100644 (file)
index 0000000..22dee7d
--- /dev/null
@@ -0,0 +1,235 @@
+#include "ParaParser.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+/*
+ * The constructor needs the format string to be used with sscanf and the names
+ * of the respective fields for the dictionary.
+ *
+ */
+
+ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
+   size_t buf_size = 512;
+   format_string = (char*) malloc(sizeof(char)*buf_size);
+   if (strlen(fmt) > buf_size)
+      perror("format string to long!");
+
+   strncpy(format_string,fmt,strlen(fmt));
+
+   // count how many entries are parsed in one line (number of %'s)
+   int format_num_entries = 0;
+   for(int fidx=0;fidx<strlen(fmt);fidx++)
+      if (format_string[fidx] == '%')
+         format_num_entries++;
+
+   if (format_num_entries != num_entries) 
+      printf("Error: For every entry in the format string you have to supply a name!");
+
+   field_names = (char**) malloc(sizeof(char*)*num_entries);
+
+   for(int idx=0;idx<num_entries;idx++) {
+      field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
+      strncpy(field_names[idx],_fields[idx],strlen(_fields[idx]));
+   }
+
+   // subtract number of tabs as we don't want them in the parsing
+   char* pruned_format_string = (char*)malloc(sizeof(char*)*strlen(format_string)-num_entries+2);
+   int pruned_idx = 0;
+
+   for(int idx=0;idx<strlen(format_string);idx++) {
+      if (format_string[idx] == '\t')
+         continue;
+
+      pruned_format_string[pruned_idx] = format_string[idx];
+      pruned_idx++;
+   }
+   pruned_format_string[strlen(format_string)-num_entries+1] = '%';
+
+   types_list = (char**) malloc(sizeof(char**)*num_entries);
+   printf("types list\n");
+   char *pruned_ptr = pruned_format_string;
+   for(int f_idx=0;f_idx<num_entries;f_idx++) {
+      char *part = strtok (pruned_ptr, "%");
+      pruned_ptr = NULL;
+
+      types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part));
+      strncpy(types_list[f_idx],part,strlen(part));
+      printf("%s(%d) ",part,strlen(part));
+   }
+   printf("\n");
+}
+
+/*
+ * 
+ */
+
+void ParaParser::create_entry_from_line(const char* current_line, char* format_string, ... ) {
+   printf("current line is %s\n",current_line);
+   
+   void*** current_entry = (void***) malloc(sizeof(void***));
+
+   char *line_ptr = (char*) malloc(sizeof(char)*strlen(current_line));
+   strncpy(line_ptr,current_line,strlen(current_line));
+
+   char* current_type = (char*) malloc(sizeof(char)*6);
+
+   for(int idx=0; idx<num_columns;idx++) {
+      char* col = strtok(line_ptr,"\t");
+      line_ptr = NULL;
+      strncpy(current_type,types_list[idx],strlen(types_list[idx]));
+
+      printf("%s ",col);
+      printf("%s ",current_type);
+
+      if ( strcmp(current_type,"d")==0 ) {
+         (*current_entry)[idx] = (int*) malloc(sizeof(int));
+         memcpy((*current_entry)[idx],col,sizeof(int*));
+      }
+      if ( strcmp(current_type,"f")==0 ) {
+         (*current_entry)[idx] = (double*) malloc(sizeof(double));
+         memcpy((*current_entry)[idx],col,sizeof(double*));
+      }
+      if ( strcmp(current_type,"s")==0 ) {
+         (*current_entry)[idx] = (char*) malloc(sizeof(char*));
+         memcpy((*current_entry)[idx],col,sizeof(char*));
+      }
+      if ( strcmp(current_type,"lu")==0 ) {
+         (*current_entry)[idx] = (unsigned long*) malloc(sizeof(unsigned long));
+         memcpy((*current_entry)[idx],col,sizeof(unsigned long*));
+      }
+   }
+  
+   printf("\n");
+
+   int* id = (int*)(*current_entry)[0];
+   printf("id is %d\n",id);
+   (*entries)[*id] = current_entry;
+}
+
+
+int ParaParser::parseFile(char* reads_filename) {
+   size_t buf_size = 512;
+   char* line = (char*) malloc(sizeof(char)*buf_size);
+
+   printf("open %s\n",reads_filename);
+   FILE *input_fs = fopen(reads_filename,"r");
+   if (input_fs == NULL)
+      perror("fopen");
+
+   int line_ctr = 0;
+
+   while (getline (&line, &buf_size, input_fs) >= 0)
+      line_ctr++;
+
+   free(line);
+
+   printf("file has %d lines\n",line_ctr);
+
+   if(input_fs == NULL) {
+      printf("Error: Could not open file: %s",reads_filename);
+      exit(EXIT_FAILURE);
+   }
+
+   int reads_fid = fileno(input_fs);
+   struct stat reads_stat;
+   if ( fstat(reads_fid,&reads_stat) == -1) {
+      perror("fstat");
+      exit(EXIT_FAILURE);
+   }
+
+   off_t reads_filesize = reads_stat.st_size;
+   //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+
+   entries = new map<unsigned long,void***,KeyCmp>();
+
+   // try to acquire file using mmap
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
+   if (reads_area == MAP_FAILED) {
+      perror("mmap");
+      exit(EXIT_FAILURE);
+   }
+
+   close(reads_fid);
+   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+                                      
+   char* lineBeginPtr = (char*) reads_area;
+   char* lineEndPtr = (char*) reads_area;     
+   char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
+
+   while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+   lineEndPtr++;
+         
+   char* current_line = (char*) malloc(sizeof(char)*512);
+   memset(current_line,0,512);
+
+   unsigned long line_size = lineEndPtr - lineBeginPtr;
+   strncpy(current_line,lineBeginPtr,line_size);                                                                                                                                                                               
+   current_line[line_size] = '\0';
+
+   int readCtr = 0;
+   int status = 0;
+
+   int num_reads = 0;
+
+   while(1) {
+      if (strcmp(current_line,"") == 0) 
+         break;
+
+      create_entry_from_line(current_line,format_string,current_line);
+      if (status != 0 )
+         printf("Error while parsing line (status=%d).",status);
+
+      lineBeginPtr = lineEndPtr;                                                                                                                                                                                         
+      while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
+      lineEndPtr++;
+
+      current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
+      current_line[lineEndPtr-lineBeginPtr] = '\0';
+
+      readCtr += 1;
+   }
+
+   // clean up
+   status = munmap(reads_area,reads_filesize);                                                                                                                                                                                 
+   if(status != 0)
+      perror("munmap");
+
+   free(current_line);
+
+   return 99;
+}
+
+/*
+PyObject* ParaParser::fetchEntry(int id) {
+
+   void*** current_entry = (*entries)[id];
+
+   PyObject* line_dict = PyDict_New();
+
+   int status;
+
+   for(int idx=0;idx<num_columns;idx++) {
+      char* current_type = types_list[idx];
+      if ( strcmp(current_type,"f")==0 )
+         status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
+      
+      if ( strcmp(current_type,"s")==0 )
+         status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
+
+      if ( strcmp(current_type,"lu")==0 || strcmp(current_type,"d")==0 )
+         status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyInt_FromLong(*(int*)(*current_entry)[idx]) );
+
+   }
+   
+   //size_t idx;
+   //for(idx=0;idx<current_read->size;idx++) {
+   //   status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
+   //   status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
+   //   status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
+   //}
+
+   return line_dict;
+}
+*/
diff --git a/ParaParser/ParaParser.h b/ParaParser/ParaParser.h
new file mode 100644 (file)
index 0000000..92c21a2
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __PARAPARSER_H__
+#define __PARAPARSER_H__
+
+#include <Python.h>
+#include <map>
+using namespace std;
+
+// Ordering used for the keys of the entry map.
+//
+// std::map requires its comparison object to be a strict weak ordering
+// ("less than"). An equality test does not qualify: it reports every key as
+// ordered before itself and leaves all distinct keys unordered, so lookups and
+// insertions would be undefined.
+struct KeyCmp {
+   bool operator()( unsigned long s1, unsigned long s2 ) const {
+          return ( s1 < s2 );
+   }
+};
+
+
+// Parser for text files with one record per line. The constructor takes a
+// format string and the column names, parseFile() reads a file, and
+// create_entry_from_line() turns one line into an entry stored in `entries`.
+class ParaParser{
+
+   private:
+      // heap buffer holding the constructor's copy of `fmt`
+      char* format_string;
+      // heap array with one copied name per column (from `_fields`)
+      char** field_names;
+      // number of columns; loop bound in create_entry_from_line
+      int num_columns;
+      // heap array with one type token per column, obtained by splitting the
+      // format string on '%'
+      char** types_list;
+
+      // entries keyed by id; the map is allocated with `new` in parseFile
+      map<unsigned long,void***,KeyCmp> *entries;
+
+   public:
+      // fmt: format string; _fields: column names; num_entries: number of names
+      ParaParser(const char* fmt, char** _fields, int num_entries);
+      // reads the file `reads_filename` and creates one entry per line
+      int parseFile(char* reads_filename);
+      // parses one line and stores the result in `entries`
+      void create_entry_from_line(const char* current_line, char* format_string, ...);
+      // lookup by id is disabled in this revision
+      //PyObject* fetchEntry(int id);
+
+      // NOTE(review): empty -- none of the heap memory referenced by the
+      // members above is released here.
+      ~ParaParser(){}
+};
+
+#endif // __PARAPARSER_H__
+
diff --git a/ParaParser/ParaParser.i b/ParaParser/ParaParser.i
new file mode 100644 (file)
index 0000000..0ff463f
--- /dev/null
@@ -0,0 +1,33 @@
+%module ParaParser
+%{
+
+#include "ParaParser.h"
+
+%}
+
+%include "typemaps.i"
+
+/*
+ * Converts a Python list of strings into a NULL-terminated char ** argument.
+ *
+ * The array is malloc'ed here and released by the freearg typemap below once
+ * the wrapped call has returned. Its elements point into the Python string
+ * objects and are therefore not freed. A TypeError is raised if the argument
+ * is not a list or if the list contains a non-string.
+ */
+%typemap(in) char ** {
+    /* Check if is a list */
+    if (PyList_Check($input)) {
+      int size = PyList_Size($input);
+      int i = 0;
+      $1 = (char **) malloc((size+1)*sizeof(char *));
+      for (i = 0; i < size; i++) {
+        PyObject *o = PyList_GetItem($input,i);
+        if (PyString_Check(o))
+          $1[i] = PyString_AsString(o);
+        else {
+          PyErr_SetString(PyExc_TypeError,"list must contain strings");
+          free($1);
+          return NULL;
+        }
+      }
+      $1[i] = 0;
+    } else {
+      PyErr_SetString(PyExc_TypeError,"not a list");
+      return NULL;
+    }
+}
+
+/* Releases the char ** array that the "in" typemap above allocated. */
+%typemap(freearg) char ** {
+    free((char *) $1);
+}
+
+%include "ParaParser.h"
diff --git a/ParaParser/doc/manual.tex b/ParaParser/doc/manual.tex
new file mode 100644 (file)
index 0000000..3ee632e
--- /dev/null
@@ -0,0 +1,5 @@
+\documentclass{article}
+\begin{document}
+
+\end{document}
+
diff --git a/ParaParser/simple_example.py b/ParaParser/simple_example.py
new file mode 100644 (file)
index 0000000..747e147
--- /dev/null
@@ -0,0 +1,16 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import sys
+from ParaParser import *
+
+def run(file):
+   """Parse `file` with a five-column ParaParser and print two entries.
+
+   The parser is set up for the column types int, string, string, int, int
+   with the names field0 .. field4.
+   """
+   parser = ParaParser("%d%s%s%d%d",["field0","field1","field2","field3","field4"],5)
+   parser.parseFile(file)
+   # NOTE(review): fetchEntry is commented out in ParaParser.h in this commit,
+   # so these calls presumably fail until it is re-enabled -- confirm.
+   # NOTE(review): test.data uses 11111111 and 22222222 in its first column;
+   # the ids 101 and 102 requested here do not appear in it.
+   entry1_dict = parser.fetchEntry(101)
+   print entry1_dict
+   entry2_dict = parser.fetchEntry(102)
+   print entry2_dict
+
+# Run the example on the bundled data file when executed as a script.
+if __name__ == '__main__':
+   run('test.data')
diff --git a/ParaParser/test.data b/ParaParser/test.data
new file mode 100644 (file)
index 0000000..88a2821
--- /dev/null
@@ -0,0 +1,2 @@
+11111111       alpha    beta    1       2
+22222222       gamma    delta  99      100