+ added generic parser code
[qpalma.git] / ParaParser / ParaParser.cpp
#include "ParaParser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
6
7 /*
8 * The constructor needs the format string to be used with sscanf and the names
9 * of the respective fields for the dictionary.
10 *
11 */
12
13 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
14 size_t buf_size = 512;
15 format_string = (char*) malloc(sizeof(char)*buf_size);
16 if (strlen(fmt) > buf_size)
17 perror("format string to long!");
18
19 strncpy(format_string,fmt,strlen(fmt));
20
21 // count how many entries are parsed in one line (number of %'s)
22 int format_num_entries = 0;
23 for(int fidx=0;fidx<strlen(fmt);fidx++)
24 if (format_string[fidx] == '%')
25 format_num_entries++;
26
27 if (format_num_entries != num_entries)
28 printf("Error: For every entry in the format string you have to supply a name!");
29
30 field_names = (char**) malloc(sizeof(char*)*num_entries);
31
32 for(int idx=0;idx<num_entries;idx++) {
33 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
34 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx]));
35 }
36
37 // subtract number of tabs as we don't want them in the parsing
38 char* pruned_format_string = (char*)malloc(sizeof(char*)*strlen(format_string)-num_entries+2);
39 int pruned_idx = 0;
40
41 for(int idx=0;idx<strlen(format_string);idx++) {
42 if (format_string[idx] == '\t')
43 continue;
44
45 pruned_format_string[pruned_idx] = format_string[idx];
46 pruned_idx++;
47 }
48 pruned_format_string[strlen(format_string)-num_entries+1] = '%';
49
50 types_list = (char**) malloc(sizeof(char**)*num_entries);
51 printf("types list\n");
52 char *pruned_ptr = pruned_format_string;
53 for(int f_idx=0;f_idx<num_entries;f_idx++) {
54 char *part = strtok (pruned_ptr, "%");
55 pruned_ptr = NULL;
56
57 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part));
58 strncpy(types_list[f_idx],part,strlen(part));
59 printf("%s(%d) ",part,strlen(part));
60 }
61 printf("\n");
62 }
63
64 /*
65 *
66 */
67
68 void ParaParser::create_entry_from_line(const char* current_line, char* format_string, ... ) {
69 printf("current line is %s\n",current_line);
70
71 void*** current_entry = (void***) malloc(sizeof(void***));
72
73 char *line_ptr = (char*) malloc(sizeof(char)*strlen(current_line));
74 strncpy(line_ptr,current_line,strlen(current_line));
75
76 char* current_type = (char*) malloc(sizeof(char)*6);
77
78 for(int idx=0; idx<num_columns;idx++) {
79 char* col = strtok(line_ptr,"\t");
80 line_ptr = NULL;
81 strncpy(current_type,types_list[idx],strlen(types_list[idx]));
82
83 printf("%s ",col);
84 printf("%s ",current_type);
85
86 if ( strcmp(current_type,"d")==0 ) {
87 (*current_entry)[idx] = (int*) malloc(sizeof(int));
88 memcpy((*current_entry)[idx],col,sizeof(int*));
89 }
90 if ( strcmp(current_type,"f")==0 ) {
91 (*current_entry)[idx] = (double*) malloc(sizeof(double));
92 memcpy((*current_entry)[idx],col,sizeof(double*));
93 }
94 if ( strcmp(current_type,"s")==0 ) {
95 (*current_entry)[idx] = (char*) malloc(sizeof(char*));
96 memcpy((*current_entry)[idx],col,sizeof(char*));
97 }
98 if ( strcmp(current_type,"lu")==0 ) {
99 (*current_entry)[idx] = (unsigned long*) malloc(sizeof(unsigned long));
100 memcpy((*current_entry)[idx],col,sizeof(unsigned long*));
101 }
102 }
103
104 printf("\n");
105
106 int* id = (int*)(*current_entry)[0];
107 printf("id is %d\n",id);
108 (*entries)[*id] = current_entry;
109 }
110
111
112 int ParaParser::parseFile(char* reads_filename) {
113 size_t buf_size = 512;
114 char* line = (char*) malloc(sizeof(char)*buf_size);
115
116 printf("open %s\n",reads_filename);
117 FILE *input_fs = fopen(reads_filename,"r");
118 if (input_fs == NULL)
119 perror("fopen");
120
121 int line_ctr = 0;
122
123 while (getline (&line, &buf_size, input_fs) >= 0)
124 line_ctr++;
125
126 free(line);
127
128 printf("file has %d lines\n",line_ctr);
129
130 if(input_fs == NULL) {
131 printf("Error: Could not open file: %s",reads_filename);
132 exit(EXIT_FAILURE);
133 }
134
135 int reads_fid = fileno(input_fs);
136 struct stat reads_stat;
137 if ( fstat(reads_fid,&reads_stat) == -1) {
138 perror("fstat");
139 exit(EXIT_FAILURE);
140 }
141
142 off_t reads_filesize = reads_stat.st_size;
143 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
144
145 entries = new map<unsigned long,void***,KeyCmp>();
146
147 // try to acquire file using mmap
148 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
149 if (reads_area == MAP_FAILED) {
150 perror("mmap");
151 exit(EXIT_FAILURE);
152 }
153
154 close(reads_fid);
155 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
156
157 char* lineBeginPtr = (char*) reads_area;
158 char* lineEndPtr = (char*) reads_area;
159 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
160
161 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
162 lineEndPtr++;
163
164 char* current_line = (char*) malloc(sizeof(char)*512);
165 memset(current_line,0,512);
166
167 unsigned long line_size = lineEndPtr - lineBeginPtr;
168 strncpy(current_line,lineBeginPtr,line_size);
169 current_line[line_size] = '\0';
170
171 int readCtr = 0;
172 int status = 0;
173
174 int num_reads = 0;
175
176 while(1) {
177 if (strcmp(current_line,"") == 0)
178 break;
179
180 create_entry_from_line(current_line,format_string,current_line);
181 if (status != 0 )
182 printf("Error while parsing line (status=%d).",status);
183
184 lineBeginPtr = lineEndPtr;
185 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
186 lineEndPtr++;
187
188 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
189 current_line[lineEndPtr-lineBeginPtr] = '\0';
190
191 readCtr += 1;
192 }
193
194 // clean up
195 status = munmap(reads_area,reads_filesize);
196 if(status != 0)
197 perror("munmap");
198
199 free(current_line);
200
201 return 99;
202 }
203
204 /*
205 PyObject* ParaParser::fetchEntry(int id) {
206
207 void*** current_entry = (*entries)[id];
208
209 PyObject* line_dict = PyDict_New();
210
211 int status;
212
213 for(int idx=0;idx<num_columns;idx++) {
214 char* current_type = types_list[idx];
215 if ( strcmp(current_type,"f")==0 )
216 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
217
218 if ( strcmp(current_type,"s")==0 )
219 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyString_FromString((char*)(*current_entry)[idx]) );
220
221 if ( strcmp(current_type,"lu")==0 || strcmp(current_type,"d")==0 )
222 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), PyInt_FromLong(*(int*)(*current_entry)[idx]) );
223
224 }
225
226 //size_t idx;
227 //for(idx=0;idx<current_read->size;idx++) {
228 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
229 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
230 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
231 //}
232
233 return line_dict;
234 }
235 */