--- /dev/null
+#include "CParaParser.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+/*
+ * Construct a parser from the format string that is later used with sscanf.
+ *
+ * fmt    - sscanf-style format string; a private NUL-terminated copy is
+ *          stored in format_string.
+ * fields - names of the respective fields for the dictionary. This version
+ *          of the constructor does not store them.
+ *
+ * Terminates the process if fmt is NULL or the copy cannot be allocated.
+ */
+ParaParser::ParaParser(const char* fmt, const char** fields) {
+    (void) fields;
+
+    if (fmt == NULL) {
+        fprintf(stderr,"Error: no format string supplied\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Keep the historical minimum capacity of 512 bytes, but grow the buffer
+    // for longer format strings instead of writing past its end.
+    const size_t min_buf_size = 512;
+    const size_t fmt_len = strlen(fmt);
+    const size_t buf_size = (fmt_len + 1 > min_buf_size) ? fmt_len + 1 : min_buf_size;
+
+    format_string = (char*) malloc(sizeof(char)*buf_size);
+    if (format_string == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    // strncpy() with n == strlen(fmt) never writes the terminator itself.
+    strncpy(format_string,fmt,fmt_len);
+    format_string[fmt_len] = '\0';
+}
+
+/*
+ * Fill newRead from one line of the reads file.
+ *
+ * The line is scanned with sscanf() using line_format, which has to yield
+ * exactly 15 fields. Afterwards the sequence is converted to lower case and
+ * the strand codes 'D' / 'P' are translated to '+' / '-'.
+ *
+ * Returns 0 on success and -1 if an argument is NULL or the line does not
+ * contain all 15 fields.
+ *
+ * NOTE(review): the string conversions in line_format carry no field width
+ * as far as this function can tell, so the Read buffers must be large enough
+ * for any input line - confirm against the definition of line_format.
+ */
+int create_read_from_line(Read* newRead, const char* current_line) {
+    const int expected_entries = 15;
+
+    if (newRead == NULL || current_line == NULL)
+        return -1;
+
+    int entries_found = sscanf(current_line,line_format,&(newRead->id),
+        &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
+        newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
+        &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
+
+    if (entries_found != expected_entries)
+        return -1;
+
+    // Make the sequence lowercase but don't destroy brackets: only 'A'..'Z'
+    // are shifted. The former upper bound of 85 ('U') left U-Z untouched.
+    // Walking to the terminator avoids calling strlen() on every iteration.
+    for (char* base = newRead->seq; *base != '\0'; base++) {
+        if ('A' <= *base && *base <= 'Z')
+            *base = *base + ('a' - 'A');
+    }
+
+    if ( newRead->strand == 'D' )
+        newRead->strand = '+';
+    else if ( newRead->strand == 'P' )
+        newRead->strand = '-';
+
+    return 0;
+}
+
+
+/*
+ * Read every line of reads_filename and hand it to set_item_from_line().
+ *
+ * The file is mapped read-only into memory and walked line by line. Each
+ * line (including its trailing newline, if present) is copied into a
+ * NUL-terminated scratch buffer before it is parsed. read_array and id_map
+ * are allocated here; num_reads and map_idx are reset to 0.
+ *
+ * Returns num_reads as a Python integer. Terminates the process if the file
+ * cannot be opened, inspected or mapped, or if memory runs out.
+ *
+ * NOTE(review): the original definition had no return type; PyObject* is
+ * what PyInt_FromLong() yields - confirm against the class declaration.
+ */
+PyObject* ParaParser::parseFile(const char* reads_filename) {
+    FILE *reads_fs = fopen(reads_filename,"r");
+
+    if(reads_fs == NULL) {
+        printf("Error: Could not open file: %s",reads_filename);
+        exit(EXIT_FAILURE);
+    }
+
+    struct stat reads_stat;
+    if ( fstat(fileno(reads_fs),&reads_stat) == -1) {
+        perror("fstat");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t reads_filesize = (size_t) reads_stat.st_size;
+
+    num_reads = 0;
+    map_idx = 0;
+
+    // mmap() rejects zero-length mappings, and an empty file has no reads.
+    if (reads_filesize == 0) {
+        fclose(reads_fs);
+        read_array = NULL;
+        id_map = NULL;
+        return PyInt_FromLong(num_reads);
+    }
+
+    // try to acquire file using mmap
+    void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,fileno(reads_fs),0);
+    if (reads_area == MAP_FAILED) {
+        perror("mmap");
+        exit(EXIT_FAILURE);
+    }
+
+    // The mapping stays valid after the stream is closed. fclose() also
+    // releases the descriptor, so the FILE object is no longer leaked.
+    fclose(reads_fs);
+    printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+
+    const char* const area_begin = (const char*) reads_area;
+    const char* const area_end = area_begin + reads_filesize;
+
+    // Count the lines so the arrays can hold one entry per line. The former
+    // estimate of filesize/200 is kept as a lower bound, but on its own it
+    // is too small whenever lines are shorter than 200 bytes.
+    // NOTE(review): assumes set_item_from_line() stores at most one read per
+    // line - confirm against its definition.
+    size_t num_lines = 0;
+    for (const char* scan = area_begin; scan != area_end; scan++) {
+        if (*scan == '\n')
+            num_lines++;
+    }
+    if (area_end[-1] != '\n')
+        num_lines++;
+
+    size_t num_init_reads = reads_filesize / 200;
+    if (num_init_reads < num_lines)
+        num_init_reads = num_lines;
+
+    read_array = (Read**) malloc(sizeof(Read*)*num_init_reads);
+    id_map = (unsigned long*) malloc(sizeof(unsigned long)*num_init_reads);
+    if (read_array == NULL || id_map == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    size_t line_capacity = 512;
+    char* current_line = (char*) malloc(sizeof(char)*line_capacity);
+    if (current_line == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    const char* line_begin = area_begin;
+    while (line_begin != area_end) {
+        // Check the bound before dereferencing so the scan never reads past
+        // the end of the mapping.
+        const char* line_end = line_begin;
+        while (line_end != area_end && *line_end != '\n') line_end++;
+        if (line_end != area_end)
+            line_end++; // keep the newline as part of the line
+
+        const size_t line_size = line_end - line_begin;
+
+        // Grow the scratch buffer for lines longer than the initial size.
+        if (line_size + 1 > line_capacity) {
+            char* grown_line = (char*) realloc(current_line,sizeof(char)*(line_size+1));
+            if (grown_line == NULL) {
+                perror("realloc");
+                exit(EXIT_FAILURE);
+            }
+            current_line = grown_line;
+            line_capacity = line_size + 1;
+        }
+
+        strncpy(current_line,line_begin,line_size);
+        current_line[line_size] = '\0';
+
+        // As before, a line that starts with a NUL byte ends the parsing.
+        if (current_line[0] == '\0')
+            break;
+
+        int status = set_item_from_line(current_line);
+        if (status != 0 )
+            printf("Error while parsing line (status=%d).",status);
+
+        line_begin = line_end;
+    }
+
+    // clean up
+    if (munmap(reads_area,reads_filesize) != 0)
+        perror("munmap");
+
+    free(current_line);
+
+    return PyInt_FromLong(num_reads);
+}
+
+// TODO: fetchEntry(int id) and ~ParaParser() were only present here as bare
+// declarations without a body; they still have to be implemented.
+
+
+
--- /dev/null
+#include "ParaParser.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+/*
+ * Allocate a NUL-terminated copy of the first len characters of src. The
+ * returned buffer holds at least min_capacity bytes. Terminates the process
+ * if the allocation fails.
+ */
+static char* copy_string(const char* src, size_t len, size_t min_capacity) {
+    size_t capacity = len + 1;
+    if (capacity < min_capacity)
+        capacity = min_capacity;
+
+    char* copy = (char*) malloc(sizeof(char)*capacity);
+    if (copy == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    // strncpy() with n == len does not terminate the copy itself.
+    strncpy(copy,src,len);
+    copy[len] = '\0';
+    return copy;
+}
+
+/*
+ * The constructor needs the format string to be used with sscanf and the
+ * names of the respective fields for the dictionary.
+ *
+ * fmt         - tab-separated conversions such as "%d\t%s\t%lu"; copied into
+ *               format_string.
+ * _fields     - num_entries field names; copied into field_names.
+ * num_entries - number of fields; expected to match the number of '%' in fmt.
+ *
+ * types_list receives one string per field holding the conversion without
+ * its leading '%' (e.g. "d", "s", "lu"). If fmt yields fewer conversions
+ * than num_entries, the remaining types are empty strings.
+ *
+ * Terminates the process on invalid arguments or allocation failure.
+ */
+ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
+    const size_t buf_size = 512;
+
+    if (fmt == NULL || _fields == NULL || num_entries < 0) {
+        fprintf(stderr,"Error: invalid arguments for ParaParser\n");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t fmt_len = strlen(fmt);
+    format_string = copy_string(fmt,fmt_len,buf_size);
+
+    // count how many entries are parsed in one line (number of %'s)
+    int format_num_entries = 0;
+    for(size_t fidx=0;fidx<fmt_len;fidx++)
+        if (format_string[fidx] == '%')
+            format_num_entries++;
+
+    if (format_num_entries != num_entries)
+        printf("Error: For every entry in the format string you have to supply a name!");
+
+    // NOTE(review): num_columns is read by create_entry_from_line() but was
+    // never assigned in this file - confirm this is the intended place.
+    num_columns = num_entries;
+
+    field_names = (char**) malloc(sizeof(char*)*num_entries);
+    types_list = (char**) malloc(sizeof(char*)*num_entries);
+    if (num_entries > 0 && (field_names == NULL || types_list == NULL)) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    for(int idx=0;idx<num_entries;idx++)
+        field_names[idx] = copy_string(_fields[idx],strlen(_fields[idx]),buf_size);
+
+    // Drop the tabs as we don't want them in the parsing. The result can
+    // never be longer than the format string itself.
+    char* pruned_format_string = (char*) malloc(sizeof(char)*(fmt_len+1));
+    if (pruned_format_string == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    size_t pruned_len = 0;
+    for(size_t idx=0;idx<fmt_len;idx++) {
+        if (format_string[idx] == '\t')
+            continue;
+
+        pruned_format_string[pruned_len] = format_string[idx];
+        pruned_len++;
+    }
+    // strtok() needs a terminated string; the previous code wrote a '%' at a
+    // guessed index instead and left the buffer unterminated.
+    pruned_format_string[pruned_len] = '\0';
+
+    printf("types list\n");
+    char *pruned_ptr = pruned_format_string;
+    for(int f_idx=0;f_idx<num_entries;f_idx++) {
+        const char *part = strtok (pruned_ptr, "%");
+        pruned_ptr = NULL;
+
+        // fewer conversions than names: store an empty type, don't crash
+        if (part == NULL)
+            part = "";
+
+        const size_t part_len = strlen(part);
+        types_list[f_idx] = copy_string(part,part_len,0);
+        printf("%s(%d) ",part,(int) part_len);
+    }
+    printf("\n");
+
+    // types_list owns copies, so the scratch buffer can go.
+    free(pruned_format_string);
+}
+
+/*
+ * malloc() wrapper that terminates the process when no memory is available.
+ */
+static void* checked_malloc(size_t num_bytes) {
+    void* memory = malloc(num_bytes);
+    if (memory == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+    return memory;
+}
+
+/*
+ * Free an entry built by create_entry_from_line(): every column value, the
+ * column array and the entry itself.
+ */
+static void release_entry(void*** entry, int num_slots) {
+    for(int idx=0;idx<num_slots;idx++)
+        free((*entry)[idx]);
+    free(*entry);
+    free(entry);
+}
+
+/*
+ * Split one tab-separated line into num_columns values and store them in
+ * entries under the id found in the first column.
+ *
+ * Each column is converted according to types_list: "d" to int, "f" to
+ * double, "lu" to unsigned long and "s" to a copy of the text. Columns with
+ * any other type stay NULL. An entry is laid out as
+ * (*entry)[idx] -> heap value of column idx.
+ *
+ * current_line  - the line to parse; it is not modified.
+ * format_string - unused; kept for interface compatibility.
+ *
+ * Lines with too few columns, or whose first column is neither "d" nor
+ * "lu", are reported and skipped. Because strtok() is used, empty columns
+ * (two adjacent tabs) are skipped as well. A trailing newline is not made
+ * part of the last column.
+ *
+ * NOTE(review): an entry already stored under the same id is overwritten
+ * without being freed, as before - confirm who owns stored entries.
+ */
+void ParaParser::create_entry_from_line(const char* current_line, char* format_string, ... ) {
+    (void) format_string;
+
+    printf("current line is %s\n",current_line);
+
+    if (num_columns <= 0)
+        return;
+
+    // strtok() writes into its input, so work on a terminated private copy.
+    const size_t line_len = strlen(current_line);
+    char* line_copy = (char*) checked_malloc(sizeof(char)*(line_len+1));
+    strncpy(line_copy,current_line,line_len);
+    line_copy[line_len] = '\0';
+
+    // The entry points to an array with one slot per column. Previously the
+    // array itself was never allocated.
+    void** columns = (void**) checked_malloc(sizeof(void*)*num_columns);
+    for(int idx=0;idx<num_columns;idx++)
+        columns[idx] = NULL;
+
+    void*** current_entry = (void***) checked_malloc(sizeof(void**));
+    *current_entry = columns;
+
+    bool complete = true;
+    char* token_source = line_copy;
+
+    for(int idx=0; idx<num_columns;idx++) {
+        char* col = strtok(token_source,"\t\n");
+        token_source = NULL;
+
+        if (col == NULL) {
+            complete = false;
+            break;
+        }
+
+        const char* current_type = types_list[idx];
+
+        printf("%s ",col);
+        printf("%s ",current_type);
+
+        // Convert the column text; copying its raw bytes would not yield
+        // the number it spells out.
+        if ( strcmp(current_type,"d")==0 ) {
+            int* value = (int*) checked_malloc(sizeof(int));
+            *value = (int) strtol(col,NULL,10);
+            columns[idx] = value;
+        } else if ( strcmp(current_type,"f")==0 ) {
+            double* value = (double*) checked_malloc(sizeof(double));
+            *value = strtod(col,NULL);
+            columns[idx] = value;
+        } else if ( strcmp(current_type,"s")==0 ) {
+            const size_t col_len = strlen(col);
+            char* value = (char*) checked_malloc(sizeof(char)*(col_len+1));
+            strncpy(value,col,col_len);
+            value[col_len] = '\0';
+            columns[idx] = value;
+        } else if ( strcmp(current_type,"lu")==0 ) {
+            unsigned long* value = (unsigned long*) checked_malloc(sizeof(unsigned long));
+            *value = strtoul(col,NULL,10);
+            columns[idx] = value;
+        }
+    }
+
+    printf("\n");
+    free(line_copy);
+
+    // The map key comes from the first column, which has to be an integer.
+    unsigned long entry_id = 0;
+    bool have_id = false;
+
+    if (complete && columns[0] != NULL) {
+        if ( strcmp(types_list[0],"d")==0 ) {
+            entry_id = (unsigned long) *(int*) columns[0];
+            have_id = true;
+        } else if ( strcmp(types_list[0],"lu")==0 ) {
+            entry_id = *(unsigned long*) columns[0];
+            have_id = true;
+        }
+    }
+
+    if (!have_id) {
+        printf("Error: could not build an entry from this line.\n");
+        release_entry(current_entry,num_columns);
+        return;
+    }
+
+    printf("id is %lu\n",entry_id);
+    (*entries)[entry_id] = current_entry;
+}
+
+
+/*
+ * Parse every line of reads_filename into the entries map.
+ *
+ * A fresh map is created, the file is mapped read-only into memory, and each
+ * line (including its trailing newline, if present) is copied into a
+ * NUL-terminated scratch buffer and passed to create_entry_from_line().
+ *
+ * Returns 99 in every case.
+ * NOTE(review): 99 looks like a placeholder - confirm what callers expect
+ * before changing it.
+ *
+ * Terminates the process if the file cannot be opened, inspected or mapped,
+ * or if memory runs out.
+ */
+int ParaParser::parseFile(char* reads_filename) {
+    printf("open %s\n",reads_filename);
+    FILE *input_fs = fopen(reads_filename,"r");
+
+    // Bail out before the stream is used; it used to be read while NULL.
+    if(input_fs == NULL) {
+        perror("fopen");
+        printf("Error: Could not open file: %s",reads_filename);
+        exit(EXIT_FAILURE);
+    }
+
+    struct stat reads_stat;
+    if ( fstat(fileno(input_fs),&reads_stat) == -1) {
+        perror("fstat");
+        exit(EXIT_FAILURE);
+    }
+
+    const size_t reads_filesize = (size_t) reads_stat.st_size;
+
+    // NOTE(review): a map left over from an earlier call is not released.
+    entries = new map<unsigned long,void***,KeyCmp>();
+
+    // mmap() rejects zero-length mappings, and an empty file has no lines.
+    if (reads_filesize == 0) {
+        fclose(input_fs);
+        printf("file has %d lines\n",0);
+        return 99;
+    }
+
+    // try to acquire file using mmap
+    void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,fileno(input_fs),0);
+    if (reads_area == MAP_FAILED) {
+        perror("mmap");
+        exit(EXIT_FAILURE);
+    }
+
+    // The mapping stays valid after the stream is closed. fclose() also
+    // releases the descriptor, so the FILE object is no longer leaked.
+    fclose(input_fs);
+
+    const char* const area_begin = (const char*) reads_area;
+    const char* const area_end = area_begin + reads_filesize;
+
+    // Count the lines on the mapping instead of reading the file a second
+    // time with getline(); an unterminated last line counts as well.
+    int line_ctr = 0;
+    for (const char* scan = area_begin; scan != area_end; scan++) {
+        if (*scan == '\n')
+            line_ctr++;
+    }
+    if (area_end[-1] != '\n')
+        line_ctr++;
+
+    printf("file has %d lines\n",line_ctr);
+    printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
+
+    size_t line_capacity = 512;
+    char* current_line = (char*) malloc(sizeof(char)*line_capacity);
+    if (current_line == NULL) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    const char* line_begin = area_begin;
+    while (line_begin != area_end) {
+        // Check the bound before dereferencing so the scan never reads past
+        // the end of the mapping.
+        const char* line_end = line_begin;
+        while (line_end != area_end && *line_end != '\n') line_end++;
+        if (line_end != area_end)
+            line_end++; // keep the newline as part of the line
+
+        const size_t line_size = line_end - line_begin;
+
+        // Grow the scratch buffer for lines longer than the initial size.
+        if (line_size + 1 > line_capacity) {
+            char* grown_line = (char*) realloc(current_line,sizeof(char)*(line_size+1));
+            if (grown_line == NULL) {
+                perror("realloc");
+                exit(EXIT_FAILURE);
+            }
+            current_line = grown_line;
+            line_capacity = line_size + 1;
+        }
+
+        strncpy(current_line,line_begin,line_size);
+        current_line[line_size] = '\0';
+
+        // As before, a line that starts with a NUL byte ends the parsing.
+        if (current_line[0] == '\0')
+            break;
+
+        create_entry_from_line(current_line,format_string,current_line);
+
+        line_begin = line_end;
+    }
+
+    // clean up
+    if (munmap(reads_area,reads_filesize) != 0)
+        perror("munmap");
+
+    free(current_line);
+
+    return 99;
+}
+
+/*
+PyObject* ParaParser::fetchEntry(int id) {
+
+ void*** current_entry = (*entries)[id];
+
+ PyObject* line_dict = PyDict_New();
+
+ int status;
+
+ for(int idx=0;idx<num_columns;idx++) {
+ char* current_type = types_list[idx];
+ if ( strcmp(current_type,"f")==0 )
+ status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
+
+ if ( strcmp(current_type,"s")==0 )
+ status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
+
+ if ( strcmp(current_type,"lu")==0 || strcmp(current_type,"d")==0 )
+ status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyInt_FromLong(*(int*)(*current_entry)[idx]) );
+
+ }
+
+ //size_t idx;
+ //for(idx=0;idx<current_read->size;idx++) {
+ // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
+ // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
+ // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
+ //}
+
+ return line_dict;
+}
+*/