+ added python module for reads parsing as plain python code is too slow
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 15 May 2008 10:26:35 +0000 (10:26 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 15 May 2008 10:26:35 +0000 (10:26 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9019 e1793c9e-67f9-0310-80fc-b846ff1f7b36

cparser/ERR [new file with mode: 0644]
cparser/build/lib.linux-x86_64-2.5/qparser.so [new file with mode: 0755]
cparser/qparser.c [new file with mode: 0644]
cparser/qparser.so [new file with mode: 0755]
cparser/setup.py [new file with mode: 0644]
cparser/test.py [new file with mode: 0644]

diff --git a/cparser/ERR b/cparser/ERR
new file mode 100644 (file)
index 0000000..4f64ea6
--- /dev/null
@@ -0,0 +1,26 @@
+Error while parsing line.: No such file or directory
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Error while parsing line.: Invalid argument
+Traceback (most recent call last):
+  File "test.py", line 16, in <module>
+    test_module()
+  File "test.py", line 13, in test_module
+    pdb.set_trace()
+NameError: global name 'pdb' is not defined
diff --git a/cparser/build/lib.linux-x86_64-2.5/qparser.so b/cparser/build/lib.linux-x86_64-2.5/qparser.so
new file mode 100755 (executable)
index 0000000..e088c2c
Binary files /dev/null and b/cparser/build/lib.linux-x86_64-2.5/qparser.so differ
diff --git a/cparser/qparser.c b/cparser/qparser.c
new file mode 100644 (file)
index 0000000..32d4665
--- /dev/null
@@ -0,0 +1,189 @@
+#include <Python.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+
+// File-level state describing one line of the reads file.
+//
+// the line format is defined as follows
+// id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
+//
+// The format holds 15 tab-separated conversions and ends with a newline.
+// In the dictionary keys used by this file, q1/q2/q3 are prb/cal_prb/chastity
+// and p1..p4 are p_start/exon_stop/exon_start/p_stop.
+// NOTE: the %s conversions carry no field width, so sscanf itself places no
+// limit on how many bytes it writes into a string target.
+const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
+// Number of bytes initqparser() allocates for each string buffer.
+int buffer_size= 64;
+// Scalar columns of the line most recently parsed with sscanf.
+unsigned long id = 0;
+int chr        = 0;
+char strand    = ' ';
+int splitpos   = 0;
+int size       = 0;
+int p_start    = 0;
+int exon_stop  = 0;
+int exon_start = 0;
+int p_stop     = 0;
+int true_cut   = 0;
+
+// String columns. These are null pointers until storage is assigned; any of
+// them used as an sscanf %s target must point to writable memory first.
+char* seq      = 0;
+char* prb      = 0;
+char* cal_prb  = 0;
+char* chastity = 0;
+
+char* geneId   = 0;
+
+static int set_item_from_line(PyObject *result_dict, const char* current_line) {
+
+   int status;
+
+   status = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,p_start,exon_stop,exon_start,p_stop,&true_cut);
+
+   if (status != 15) {
+      return -1;
+   }
+
+   // create dictionary representing one line
+   PyObject* entry_dict = PyDict_New();
+   PyObject *id_py = PyInt_FromLong(id);
+   
+   // add entries of that line
+   status = PyDict_SetItem(result_dict, PyString_FromString("id"),         id_py );
+   status = PyDict_SetItem(result_dict, PyString_FromString("chr"),        PyInt_FromLong(chr) );
+
+   PyObject *strand_py = PyString_FromString("--");
+
+   if ( strand == 'D' )
+      strand_py = PyString_FromString("+");
+
+   if ( strand == 'P' )
+      strand_py = PyString_FromString("-");
+
+
+   status = status || PyDict_SetItem(result_dict, PyString_FromString("strand"),     strand_py );
+   status = PyDict_SetItem(result_dict, PyString_FromString("seq"),        PyString_FromString(seq) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("splitpos"),   PyInt_FromLong(splitpos) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("size"),       PyInt_FromLong(size) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("prb"),        PyString_FromString(prb) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("cal_prb"),    PyString_FromString(cal_prb) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("chastity"),   PyString_FromString(chastity) );
+   //status = PyDict_SetItem(result_dict, FromString("gene_id"), id_py );
+   status = PyDict_SetItem(result_dict, PyString_FromString("p_start"),    PyInt_FromLong(p_start) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("exon_stop"),  PyInt_FromLong(exon_stop) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
+   status = PyDict_SetItem(result_dict, PyString_FromString("p_stop"),     PyInt_FromLong(p_stop) );
+
+   // now save the dictionary representing one line in the dictionary
+   // representing the whole file
+   status = PyDict_SetItem(result_dict, id_py, entry_dict);
+   if (status != 0) {
+               PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
+   }
+
+   return status;
+}
+
+/*
+ * parse_reads(filename) -> dict
+ *
+ * Parses the reads file `filename` line by line and returns a dictionary
+ * holding the parsed lines, filled by set_item_from_line().
+ *
+ * The file is mapped read-only into memory and split at '\n'. A last line
+ * without a trailing newline is parsed as well. Lines that do not fit the
+ * 512-byte line buffer, or that set_item_from_line() rejects, are reported
+ * on stderr and skipped. An empty file yields an empty dictionary.
+ *
+ * Errors: returns NULL with IOError set when the file cannot be opened,
+ * inspected or mapped, and NULL with the pending exception when a Python API
+ * call fails while storing a line. The interpreter is never terminated.
+ */
+
+static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
+
+   enum { LINE_CAPACITY = 512 };
+
+   const char* reads_filename;
+
+   // PyArg_ParseTuple already sets a TypeError on failure
+   if (!PyArg_ParseTuple(args, "s", &reads_filename)) {
+      return NULL;
+   }
+
+   FILE *reads_fs = fopen(reads_filename,"r");
+   if (reads_fs == NULL) {
+      return PyErr_SetFromErrnoWithFilename(PyExc_IOError, (char*) reads_filename);
+   }
+
+   struct stat reads_stat;
+   if ( fstat(fileno(reads_fs),&reads_stat) == -1) {
+      // record errno before fclose() gets a chance to overwrite it
+      PyErr_SetFromErrnoWithFilename(PyExc_IOError, (char*) reads_filename);
+      fclose(reads_fs);
+      return NULL;
+   }
+
+   size_t reads_filesize = (size_t) reads_stat.st_size;
+   printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+
+   // The result dict stores all lines in the form of small dictionaries
+   // it is indexed by the unique id of the read.
+   PyObject* result_dict = PyDict_New();
+   if (result_dict == NULL) {
+      fclose(reads_fs);
+      return NULL;
+   }
+
+   // mmap() rejects zero-length mappings; an empty file has no reads
+   if (reads_filesize == 0) {
+      fclose(reads_fs);
+      return result_dict;
+   }
+
+   // try to acquire file using mmap
+   void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,fileno(reads_fs),0);
+   if (reads_area == MAP_FAILED) {
+      PyErr_SetFromErrnoWithFilename(PyExc_IOError, (char*) reads_filename);
+      fclose(reads_fs);
+      Py_DECREF(result_dict);
+      return NULL;
+   }
+
+   // the mapping stays valid after the stream (and its descriptor) is closed
+   fclose(reads_fs);
+   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+
+   const char* lineBeginPtr = (const char*) reads_area;
+   const char* end_of_mapped_area = lineBeginPtr + reads_filesize;
+
+   char current_line[LINE_CAPACITY];
+   unsigned long lineCtr = 0;
+
+   while (lineBeginPtr < end_of_mapped_area) {
+      // search is bounded by the mapping, so no byte outside it is touched
+      const char* newline = memchr(lineBeginPtr,'\n',(size_t)(end_of_mapped_area-lineBeginPtr));
+      const char* lineEndPtr = (newline != NULL) ? newline + 1 : end_of_mapped_area;
+      size_t line_size = (size_t)(lineEndPtr - lineBeginPtr);
+      int status = -1;
+
+      lineCtr += 1;
+
+      // only lines that fit the buffer (including the NUL) are parsed
+      if (line_size < LINE_CAPACITY) {
+         memcpy(current_line,lineBeginPtr,line_size);
+         current_line[line_size] = '\0';
+         status = set_item_from_line(result_dict,current_line);
+      }
+
+      if (status != 0) {
+         // a pending Python exception is a real failure: stop and propagate
+         if (PyErr_Occurred()) {
+            Py_DECREF(result_dict);
+            result_dict = NULL;
+            break;
+         }
+         fprintf(stderr,"Error while parsing line %lu.\n",lineCtr);
+      }
+
+      lineBeginPtr = lineEndPtr;
+   }
+
+   munmap(reads_area,reads_filesize);
+
+   return result_dict;
+}
+
+/* Method table of the qparser module: exposes Py_parse_reads as parse_reads. */
+static PyMethodDef qparserMethods[] = {
+       {"parse_reads",  Py_parse_reads, METH_VARARGS,
+        "parse_reads(filename) -> dict\n\nParse a reads file and return its lines in a dictionary."},
+       {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+
+/*
+ * Module initialisation for the Python 2 extension module "qparser".
+ *
+ * Registers the method table and makes sure each file-level string buffer
+ * (seq, prb, cal_prb, chastity, geneId) points to buffer_size bytes of
+ * zero-initialised storage. Buffers that are already allocated are kept, so
+ * calling this function more than once does not leak. If an allocation
+ * fails, MemoryError is set and initialisation stops.
+ */
+PyMODINIT_FUNC initqparser(void) {
+   char **buffers[] = { &seq, &prb, &cal_prb, &chastity, &geneId };
+   size_t i;
+
+   if (Py_InitModule("qparser", qparserMethods) == NULL)
+      return;
+
+   for (i = 0; i < sizeof buffers / sizeof buffers[0]; i++) {
+      if (*buffers[i] != NULL)
+         continue;
+
+      *buffers[i] = calloc((size_t) buffer_size, sizeof(char));
+      if (*buffers[i] == NULL) {
+         PyErr_NoMemory();
+         return;
+      }
+   }
+}
+
+/*
+ * Stand-alone entry point: starts an embedded interpreter, initialises the
+ * qparser module once and shuts the interpreter down again.
+ */
+int main(int argc, char *argv[]) {
+   (void) argc;   /* only argv[0] is needed */
+
+   Py_SetProgramName(argv[0]);
+   Py_Initialize();
+   initqparser();
+   Py_Finalize();   /* release the interpreter started above */
+
+   return 0;
+}
diff --git a/cparser/qparser.so b/cparser/qparser.so
new file mode 100755 (executable)
index 0000000..e088c2c
Binary files /dev/null and b/cparser/qparser.so differ
diff --git a/cparser/setup.py b/cparser/setup.py
new file mode 100644 (file)
index 0000000..90841d4
--- /dev/null
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from distutils.core import setup, Extension
+
+module1 = Extension('qparser',sources = ['qparser.c'])
+setup (name = 'Fabio',version = '1.0',description = 'This is an experimental implementation of a basic qparser algorithm',ext_modules = [module1])
diff --git a/cparser/test.py b/cparser/test.py
new file mode 100644 (file)
index 0000000..cd8f430
--- /dev/null
@@ -0,0 +1,16 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import qparser
+import sys
+
+
+
+def test_module():
+   filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full_20'
+   result = qparser.parse_reads(filename)
+
+   pdb.set_trace()
+
+# Run the manual test only when this file is executed as a script.
+if __name__ == '__main__':
+   test_module()