f0289c44bae0ebb4a4eda86a353579e3e16a69d7
[qpalma.git] / ParaParser / ParaParser.cpp
1 #include "ParaParser.h"
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6 #include <string>
7 using namespace std;
8
9 /*
10 *
11 *
12 */
13
14 /*
15 bool check_for_well_formed_format_string(const char* fmt) {
16 types_list = (char**) malloc(sizeof(char**)*num_columns);
17 char *pruned_ptr = pruned_format_string;
18 //printf("types list\n");
19 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
20 char *part = strtok (pruned_ptr, "%");
21 pruned_ptr = NULL;
22
23 }
24 */
25
/**
 * Split a string in place and return pointers to its parts.
 *
 * \param args     NUL-terminated string to be split (it is modified).
 * \param argv_ptr Receives a pointer to the list of substrings.
 * \param delim    Set of delimiter characters.
 *
 * Leading and trailing delimiters are skipped and a run of consecutive
 * delimiters counts as a single separator, so empty fields are never
 * reported. The first delimiter after every substring in \a args is
 * overwritten with zero. A \p NULL-terminated array of char* is allocated
 * with malloc() and its elements point at the substrings inside \a args.
 * The array (but not the substrings) must be released with free().
 *
 * \return The number of substrings found in \a args. Returns 0 and stores
 *         \p NULL in \a *argv_ptr if \a args or \a delim is \p NULL or if
 *         the array cannot be allocated; returns 0 without touching
 *         anything if \a argv_ptr itself is \p NULL.
 */

unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
   if (argv_ptr == NULL)
      return 0;

   *argv_ptr = NULL;
   if (args == NULL || delim == NULL)
      return 0;

   /* First pass: count the substrings so the array can be sized exactly. */
   size_t n = 0;
   const char *scan = args + strspn(args, delim);
   while (*scan) {
      scan += strcspn(scan, delim);
      n++;
      scan += strspn(scan, delim);
   }

   char **argv = (char**) malloc((n + 1) * sizeof(char *));
   if (argv == NULL)
      return 0;

   /* Second pass: record the start of each substring and terminate it. */
   char *p = args + strspn(args, delim);
   for (size_t i = 0; i < n; i++) {
      argv[i] = p;
      p += strcspn(p, delim);
      if (*p) {
         *p++ = '\0';
         p += strspn(p, delim);
      }
   }
   argv[n] = NULL;

   *argv_ptr = argv;
   return (unsigned) n;
}
76
77
78 /*
79 * The constructor needs the format string to be used with sscanf and the names
80 * of the respective fields for the dictionary.
81 *
82 */
83
84 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries, storage_mode mode) {
85 // check that we have more than zero entries and that the format string
86 // contains exactly num_entries format elements.
87 if ( num_entries < 1 )
88 printf("Error: You need at least one field !\n");
89
90 num_columns = (size_t) num_entries;
91
92 // count how many entries are parsed in one line (number of %'s)
93 size_t format_num_entries = 0;
94 for(size_t fidx=0;fidx<strlen(fmt);fidx++)
95 if (fmt[fidx] == '%')
96 format_num_entries++;
97
98 if (format_num_entries != num_columns) {
99 printf("Error: For every entry in the format string you have to supply a name!\n");
100 exit(EXIT_FAILURE);
101 }
102
103 // copy the field names to the member variable
104 size_t buf_size = 512;
105 field_names = (char**) malloc(sizeof(char*)*num_columns);
106 for(size_t idx=0;idx<num_columns;idx++) {
107 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
108 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx])+1);
109 field_names[idx][strlen(_fields[idx])] = '\0';
110 //printf("%s\n",field_names[idx]);
111 }
112
113 char* pruned_format_string = (char*) malloc(sizeof(char)*buf_size);
114 size_t pruned_size = 0;
115 for(size_t idx=0;idx<strlen(fmt);idx++) {
116 pruned_format_string[pruned_size] = fmt[idx];
117 pruned_size++;
118 }
119
120 pruned_format_string[pruned_size] = '%';
121 pruned_format_string[++pruned_size] = '\0';
122 //printf("%s\n",pruned_format_string);
123
124 types_list = (char**) malloc(sizeof(char**)*num_columns);
125 char *pruned_ptr = pruned_format_string;
126 //printf("types list\n");
127 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
128 char *part = strtok (pruned_ptr, "%");
129 pruned_ptr = NULL;
130
131 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
132 types_list[f_idx][strlen(part)] = '\0';
133 strncpy(types_list[f_idx],part,strlen(part));
134 //printf("%s(%d) ",part,strlen(part));
135 }
136 //printf("\n");
137 //
138 current_mode = mode;
139 if(current_mode != IN_VECTOR && current_mode != IN_MAP) {
140 printf("Error: Wrong save mode!");
141 exit(EXIT_FAILURE);
142 }
143 }
144
145
146 /*
147 *
148 *
149 *
150 */
151
152 void ParaParser::create_entry_from_line(const char* current_line, char* format_string, char* lineBeginPtr, char* lineEndPtr) {
153 //printf("current line is %s",current_line);
154
155 // create an array of void ptrs
156 char **current_entries = (char**) malloc(sizeof(char*)*num_columns);
157
158 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
159 strncpy(mutable_line,current_line,strlen(current_line));
160 mutable_line[strlen(current_line)-1] = '\t';
161
162 char** line_parts;
163 size_t num_parts = split_args(mutable_line,&line_parts,"\t");
164
165 for(size_t idx=0; idx<num_columns;idx++) {
166 char* col = line_parts[idx];
167
168 //printf("elem:%s\n",col);
169 current_entries[idx] = (char*) malloc(sizeof(char)*strlen(col)+1);
170 //current_entries2[idx] = string(col);
171 strncpy(current_entries[idx],col,strlen(col));
172 current_entries[idx][strlen(col)] = '\0';
173 }
174 free(mutable_line);
175
176 map_key_t id = strtoul(current_entries[0],NULL,10);
177
178 if ( current_mode == IN_VECTOR ) {
179 //printf("size is %zd\n",v_entries->size());
180 v_entries->push_back(current_entries);
181 v_ptr_entries->push_back(make_pair(lineBeginPtr,lineEndPtr));
182 //printf("size is %zd\n",v_entries->size());
183 }
184
185 if ( current_mode == IN_MAP ) {
186 //printf("size is %zd\n",entries->size());
187 (*entries)[id] = current_entries;
188 //printf("size is %zd\n",entries->size());
189 }
190 }
191
192
193 /*
194 *
195 *
196 *
197 */
198
199 int ParaParser::parseFile(char* reads_filename) {
200 size_t buf_size = 512;
201 char* line = (char*) malloc(sizeof(char)*buf_size);
202
203 //printf("open %s\n",reads_filename);
204 FILE *input_fs = fopen(reads_filename,"r");
205 if (input_fs == NULL)
206 perror("fopen");
207
208 int line_ctr = 0;
209
210 while (getline (&line, &buf_size, input_fs) >= 0)
211 line_ctr++;
212
213 free(line);
214
215 //printf("file has %d lines\n",line_ctr);
216
217 if(input_fs == NULL) {
218 printf("Error: Could not open file: %s",reads_filename);
219 exit(EXIT_FAILURE);
220 }
221
222 int reads_fid = fileno(input_fs);
223 struct stat reads_stat;
224 if ( fstat(reads_fid,&reads_stat) == -1) {
225 perror("fstat");
226 exit(EXIT_FAILURE);
227 }
228
229 off_t reads_filesize = reads_stat.st_size;
230 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
231
232 if ( current_mode == IN_VECTOR )
233 v_entries = new VECTOR();
234
235 if ( current_mode == IN_MAP )
236 entries = new MAP();
237
238 // try to acquire file using mmap
239 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
240 if (reads_area == MAP_FAILED) {
241 perror("mmap");
242 exit(EXIT_FAILURE);
243 }
244
245 close(reads_fid);
246 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
247
248 char* lineBeginPtr = (char*) reads_area;
249 char* lineEndPtr = (char*) reads_area;
250 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
251
252 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
253 lineEndPtr++;
254
255 char* current_line = (char*) malloc(sizeof(char)*512);
256 memset(current_line,0,512);
257
258 unsigned long line_size = lineEndPtr - lineBeginPtr;
259 strncpy(current_line,lineBeginPtr,line_size);
260 current_line[line_size] = '\0';
261
262 int readCtr = 0;
263 int status = 0;
264
265 printf("Starting to parse file...\n");
266
267 while(1) {
268 if (strcmp(current_line,"") == 0)
269 break;
270
271 create_entry_from_line(current_line,format_string,lineBeginPtr,lineEndPtr);
272
273 if (status != 0 )
274 printf("Error while parsing line (status=%d).",status);
275
276 lineBeginPtr = lineEndPtr;
277 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
278 lineEndPtr++;
279
280 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
281 current_line[lineEndPtr-lineBeginPtr] = '\0';
282
283 readCtr += 1;
284 }
285
286 printf("Successfully parsed file !\n");
287
288 // unmap parsed file
289 status = munmap(reads_area,reads_filesize);
290 if(status != 0)
291 perror("munmap");
292
293 // free unneeded variables
294 free(current_line);
295
296 return readCtr;
297 }
298
299
300 /*
301 *
302 *
303 *
304 */
305
306 PyObject* ParaParser::fetchEntry(map_key_t id) {
307 PyObject* line_dict = PyDict_New();
308 //printf("begin of fetchEntry\n");
309 //printf("size of map %d\n",entries->size());
310 //printf("keys:\n");
311 //MAP::iterator iter;
312 //for(iter = entries->begin(); iter != entries->end(); iter++)
313 // printf("%d\n", iter->first);
314 //printf("query key is %lu\n",id);
315
316 char** current_entry;
317
318 char* lineBeginPtr = 0;
319 char* lineEndPtr = 0;
320
321 if ( current_mode == IN_VECTOR ) {
322 //printf("IN_VECTOR mode\n");
323 if (id >= v_entries->size())
324 return line_dict;
325
326 //printf("size %d\n",v_entries->size());
327 current_entry = (*v_entries)[id];
328 pair<char*,char*> ptr_pair = (*v_ptr_entries)[id];
329 lineBeginPtr = ptr_pair.first;
330 lineEndPtr = ptr_pair.second;
331 }
332
333 if ( current_mode == IN_MAP ) {
334 //printf("IN_MAP mode\n");
335 MAP::iterator find_it = entries->find(id);
336 if( find_it == entries->end() )
337 return line_dict;
338
339 current_entry = (*entries)[id];
340 }
341
342
343 int status;
344
345 for(size_t idx=0;idx<num_columns;idx++) {
346 char* current_type = types_list[idx];
347 char* current = current_entry[idx];
348
349 // init elem to make compiler happy
350 PyObject* elem = 0;
351
352 if ( strcmp(current_type,"d")==0 )
353 elem = PyInt_FromString(current,NULL,10);
354
355 if ( strcmp(current_type,"f")==0 )
356 elem = PyFloat_FromString(PyString_FromString(current),NULL);
357
358 if ( strcmp(current_type,"s")==0 )
359 elem = PyString_FromString(current);
360
361 if ( strcmp(current_type,"lu")==0 )
362 elem = PyString_FromString(current);
363 //elem = PyInt_FromString(current,NULL,10);
364 //elem = PyLong_FromString(current,NULL,10);
365
366 if (elem == 0)
367 printf("Error: type %s/ elem %s\n",current_type,current);
368
369 free(current);
370
371 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
372 }
373
374 /*
375 //for(size_t idx=0;idx<current_read->size;idx++) {
376 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
377 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
378 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
379 //}
380 */
381
382 //printf("end of fetchEntry\n");
383 PyObject *return_value = PyTuple_New(2);
384 PyTuple_SetItem(return_value,0,line_dict);
385
386 char* current_line = (char*) malloc(sizeof(char)*512);
387 memset(current_line,0,512);
388 unsigned long line_size = lineEndPtr - lineBeginPtr;
389 strncpy(current_line,lineBeginPtr,line_size);
390
391 PyObject *original_line = PyString_FromString(current_line);
392 PyTuple_SetItem(return_value,1,original_line);
393
394 return line_dict;
395 }