+ changed PipelinHeuristic to support new data access functions
[qpalma.git] / ParaParser / ParaParser.cpp
1 #include "ParaParser.h"
2 #include <string>
3 using namespace std;
4
5 /*
6 *
7 *
8 */
9
10 /*
11 bool check_for_well_formed_format_string(const char* fmt) {
12 types_list = (char**) malloc(sizeof(char**)*num_columns);
13 char *pruned_ptr = pruned_format_string;
14 //printf("types list\n");
15 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
16 char *part = strtok (pruned_ptr, "%");
17 pruned_ptr = NULL;
18
19 }
20 */
21
22 /**
23 * Split string and return pointers to its parts.
24 *
25 * \param args The string to be split.
26 * \param argv_ptr Pointer to the list of substrings.
27 * \param delim Delimiter.
28 *
29 * This function modifies \a args by replacing each occurrence of \a delim by
30 * zero. A \p NULL-terminated array of pointers to char* is allocated dynamically
31 * and these pointers are initialized to point to the broken-up substrings
32 * within \a args. A pointer to this array is returned via \a argv_ptr.
33 *
34 * \return The number of substrings found in \a args.
35 */
36
// Tokenises `args` in place on any character contained in `delim`.
// Runs of delimiters count as a single separator, and leading or trailing
// delimiters never produce empty tokens.  The caller owns the returned array
// and releases it with free(); the tokens themselves live inside `args`.
// If `args`, `delim` or the allocation is unavailable, *argv_ptr is set to
// NULL and 0 is returned.
unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
    if (argv_ptr == NULL)
        return 0;
    *argv_ptr = NULL;
    if (args == NULL || delim == NULL)
        return 0;

    // Pass 1: count the tokens so the pointer array can be sized exactly.
    size_t n = 0;
    const char *scan = args + strspn(args, delim);
    while (*scan != '\0') {
        scan += strcspn(scan, delim);   // walk over the token
        n++;
        scan += strspn(scan, delim);    // walk over the separators behind it
    }

    char **argv = static_cast<char **>(malloc((n + 1) * sizeof(char *)));
    if (argv == NULL)
        return 0;                       // *argv_ptr stays NULL for the caller

    // Pass 2: record where each token starts and cut the string behind it.
    size_t i = 0;
    char *p = args + strspn(args, delim);
    while (*p != '\0') {
        argv[i++] = p;
        p += strcspn(p, delim);
        if (*p != '\0') {
            *p++ = '\0';
            p += strspn(p, delim);
        }
    }
    argv[i] = NULL;                     // i == n here; the array is NULL-terminated

    *argv_ptr = argv;
    return static_cast<unsigned>(n);
}
72
73
74 /*
75 * The constructor needs the format string to be used with sscanf and the names
76 * of the respective fields for the dictionary.
77 *
78 */
79
80 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries, storage_mode mode) {
81 // check that we have more than zero entries and that the format string
82 // contains exactly num_entries format elements.
83 if ( num_entries < 1 )
84 printf("Error: You need at least one field !\n");
85
86 num_columns = (size_t) num_entries;
87
88 // count how many entries are parsed in one line (number of %'s)
89 size_t format_num_entries = 0;
90 for(size_t fidx=0;fidx<strlen(fmt);fidx++)
91 if (fmt[fidx] == '%')
92 format_num_entries++;
93
94 if (format_num_entries != num_columns) {
95 printf("Error: For every entry in the format string you have to supply a name!\n");
96 exit(EXIT_FAILURE);
97 }
98
99 // copy the field names to the member variable
100 size_t buf_size = 512;
101 field_names = (char**) malloc(sizeof(char*)*num_columns);
102 for(size_t idx=0;idx<num_columns;idx++) {
103 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
104 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx])+1);
105 field_names[idx][strlen(_fields[idx])] = '\0';
106 //printf("%s\n",field_names[idx]);
107 }
108
109 char* pruned_format_string = (char*) malloc(sizeof(char)*buf_size);
110 size_t pruned_size = 0;
111 for(size_t idx=0;idx<strlen(fmt);idx++) {
112 pruned_format_string[pruned_size] = fmt[idx];
113 pruned_size++;
114 }
115
116 pruned_format_string[pruned_size] = '%';
117 pruned_format_string[++pruned_size] = '\0';
118 //printf("%s\n",pruned_format_string);
119
120 types_list = (char**) malloc(sizeof(char**)*num_columns);
121 char *pruned_ptr = pruned_format_string;
122 //printf("types list\n");
123 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
124 char *part = strtok (pruned_ptr, "%");
125 pruned_ptr = NULL;
126
127 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
128 types_list[f_idx][strlen(part)] = '\0';
129 strncpy(types_list[f_idx],part,strlen(part));
130 //printf("%s(%d) ",part,strlen(part));
131 }
132 //printf("\n");
133 //
134 current_mode = mode;
135 if(current_mode != IN_VECTOR && current_mode != IN_MAP) {
136 printf("Error: Wrong save mode!");
137 exit(EXIT_FAILURE);
138 }
139 }
140
141
142
143 ParaParser::~ParaParser() {
144 // unmap parsed file
145 int status = munmap(reads_area,reads_filesize);
146 if(status != 0)
147 perror("munmap");
148 }
149
150
151 /*
152 *
153 *
154 *
155 */
156
157 void ParaParser::create_entry_from_line(const char* current_line, char* format_string, char* lineBeginPtr, char* lineEndPtr) {
158 //printf("current line is %s",current_line);
159
160 // create an array of void ptrs
161 char **current_entries = (char**) malloc(sizeof(char*)*num_columns);
162
163 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
164 strncpy(mutable_line,current_line,strlen(current_line));
165 mutable_line[strlen(current_line)-1] = '\t';
166
167 char** line_parts;
168 size_t num_parts = split_args(mutable_line,&line_parts,"\t");
169
170 for(size_t idx=0; idx<num_columns;idx++) {
171 char* col = line_parts[idx];
172
173 //printf("elem:%s\n",col);
174 current_entries[idx] = (char*) malloc(sizeof(char)*strlen(col)+1);
175 //current_entries2[idx] = string(col);
176 strncpy(current_entries[idx],col,strlen(col));
177 current_entries[idx][strlen(col)] = '\0';
178 }
179 free(mutable_line);
180
181 map_key_t id = strtoul(current_entries[0],NULL,10);
182
183 if ( current_mode == IN_VECTOR ) {
184 //printf("size is %zd\n",v_entries->size());
185 v_entries->push_back(current_entries);
186 pair<char*,char*> p;
187 p.first = lineBeginPtr;
188 p.second = lineEndPtr;
189 v_ptr_entries->push_back(p);
190 //printf("size is %zd\n",v_entries->size());
191 }
192
193 if ( current_mode == IN_MAP ) {
194 //printf("size is %zd\n",entries->size());
195 (*entries)[id] = current_entries;
196 //printf("size is %zd\n",entries->size());
197 }
198 }
199
200
201 /*
202 *
203 *
204 *
205 */
206
207 int ParaParser::parseFile(char* reads_filename) {
208 size_t buf_size = 512;
209 char* line = (char*) malloc(sizeof(char)*buf_size);
210
211 //printf("open %s\n",reads_filename);
212 FILE *input_fs = fopen(reads_filename,"r");
213 if (input_fs == NULL)
214 perror("fopen");
215
216 int line_ctr = 0;
217
218 while (getline (&line, &buf_size, input_fs) >= 0)
219 line_ctr++;
220
221 free(line);
222
223 //printf("file has %d lines\n",line_ctr);
224
225 if(input_fs == NULL) {
226 printf("Error: Could not open file: %s",reads_filename);
227 exit(EXIT_FAILURE);
228 }
229
230 int reads_fid = fileno(input_fs);
231 struct stat reads_stat;
232 if ( fstat(reads_fid,&reads_stat) == -1) {
233 perror("fstat");
234 exit(EXIT_FAILURE);
235 }
236
237 reads_filesize = reads_stat.st_size;
238 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
239
240 if ( current_mode == IN_VECTOR ) {
241 v_entries = new VECTOR();
242 v_ptr_entries = new PTR_VECTOR();
243 }
244
245 if ( current_mode == IN_MAP )
246 entries = new MAP();
247
248 // try to acquire file using mmap
249 reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
250 if (reads_area == MAP_FAILED) {
251 perror("mmap");
252 exit(EXIT_FAILURE);
253 }
254
255 close(reads_fid);
256 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
257
258 char* lineBeginPtr = (char*) reads_area;
259 char* lineEndPtr = (char*) reads_area;
260 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
261
262 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
263 lineEndPtr++;
264
265 char* current_line = (char*) malloc(sizeof(char)*512);
266 memset(current_line,0,512);
267
268 unsigned long line_size = lineEndPtr - lineBeginPtr;
269 strncpy(current_line,lineBeginPtr,line_size);
270 current_line[line_size] = '\0';
271
272 int readCtr = 0;
273 int status = 0;
274
275 printf("Starting to parse file...\n");
276
277 while(1) {
278 if (strcmp(current_line,"") == 0)
279 break;
280
281 create_entry_from_line(current_line,format_string,lineBeginPtr,lineEndPtr);
282
283 if (status != 0 )
284 printf("Error while parsing line (status=%d).",status);
285
286 lineBeginPtr = lineEndPtr;
287 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
288 lineEndPtr++;
289
290 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
291 current_line[lineEndPtr-lineBeginPtr] = '\0';
292
293 readCtr += 1;
294 }
295
296 printf("Successfully parsed file !\n");
297
298
299 // free unneeded variables
300 free(current_line);
301
302 return readCtr;
303 }
304
305
306 /*
307 *
308 *
309 *
310 */
311
312 PyObject* ParaParser::fetchEntry(map_key_t id) {
313 PyObject* line_dict = PyDict_New();
314
315 //printf("begin of fetchEntry\n");
316 //printf("size of map %d\n",entries->size());
317 //printf("keys:\n");
318 //MAP::iterator iter;
319 //for(iter = entries->begin(); iter != entries->end(); iter++)
320 // printf("%d\n", iter->first);
321 //printf("query key is %lu\n",id);
322
323 char** current_entry;
324
325 char* lineBeginPtr = 0;
326 char* lineEndPtr = 0;
327
328 if ( current_mode == IN_VECTOR ) {
329 //printf("IN_VECTOR mode\n");
330 if (id >= v_entries->size())
331 return line_dict;
332
333 //printf("size %d\n",v_entries->size());
334 current_entry = (*v_entries)[id];
335 pair<char*,char*> ptr_pair = (*v_ptr_entries)[id];
336 lineBeginPtr = ptr_pair.first;
337 lineEndPtr = ptr_pair.second;
338 }
339
340 assert (lineBeginPtr != 0);
341 assert (lineEndPtr != 0);
342
343 if ( current_mode == IN_MAP ) {
344 //printf("IN_MAP mode\n");
345 MAP::iterator find_it = entries->find(id);
346 if( find_it == entries->end() )
347 return line_dict;
348
349 current_entry = (*entries)[id];
350 }
351
352
353 int status;
354
355 for(size_t idx=0;idx<num_columns;idx++) {
356 char* current_type = types_list[idx];
357 char* current = current_entry[idx];
358
359 // init elem to make compiler happy
360 PyObject* elem = 0;
361
362 if ( strcmp(current_type,"d")==0 )
363 elem = PyInt_FromString(current,NULL,10);
364
365 if ( strcmp(current_type,"f")==0 )
366 elem = PyFloat_FromString(PyString_FromString(current),NULL);
367
368 if ( strcmp(current_type,"s")==0 )
369 elem = PyString_FromString(current);
370
371 if ( strcmp(current_type,"lu")==0 )
372 elem = PyString_FromString(current);
373 //elem = PyInt_FromString(current,NULL,10);
374 //elem = PyLong_FromString(current,NULL,10);
375
376 if (elem == 0)
377 printf("Error: type %s/ elem %s\n",current_type,current);
378
379 free(current);
380
381 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
382 }
383
384 /*
385 //for(size_t idx=0;idx<current_read->size;idx++) {
386 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
387 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
388 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
389 //}
390 */
391
392 PyObject *return_value = PyTuple_New(2);
393 PyTuple_SetItem(return_value,0,line_dict);
394
395 char* current_line = (char*) malloc(sizeof(char)*512);
396 memset(current_line,0,512);
397 unsigned long line_size = lineEndPtr - lineBeginPtr;
398 strncpy(current_line,lineBeginPtr,line_size-1);
399
400
401 PyObject *original_line = PyString_FromString(current_line);
402 PyTuple_SetItem(return_value,1,original_line);
403
404 return return_value;
405 }