+ removed stupid temp file
[qpalma.git] / ParaParser / ParaParser.cpp
1 #include "ParaParser.h"
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6 #include <string>
7 using namespace std;
8
9 /*
10 *
11 *
12 */
13
14 /*
15 bool check_for_well_formed_format_string(const char* fmt) {
16 types_list = (char**) malloc(sizeof(char**)*num_columns);
17 char *pruned_ptr = pruned_format_string;
18 //printf("types list\n");
19 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
20 char *part = strtok (pruned_ptr, "%");
21 pruned_ptr = NULL;
22
23 }
24 */
25
/**
 * Split string and return pointers to its parts.
 *
 * \param args      The string to be split; it is modified in place.
 * \param argv_ptr  Receives the dynamically allocated list of substrings.
 * \param delim     Set of delimiter characters.
 *
 * Every character in \a delim acts as a separator. Leading separators are
 * skipped and a run of consecutive separators counts as a single one. The
 * first separator following each substring is overwritten with '\0', so the
 * substrings live inside \a args itself.
 *
 * A \p NULL-terminated array of char* is allocated with malloc() and returned
 * via \a argv_ptr; the caller has to free() the array (but not the individual
 * substrings, which point into \a args).
 *
 * If the array cannot be allocated, \a *argv_ptr is set to \p NULL and 0 is
 * returned.
 *
 * \return The number of substrings found in \a args.
 */

unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
   // First pass: count the substrings so the pointer array can be sized.
   size_t n = 0;
   const char *scan = args + strspn(args, delim);
   while (*scan) {
      scan += strcspn(scan, delim);   // skip over the substring itself
      scan += strspn(scan, delim);    // skip the separators that follow it
      n++;
   }

   char **argv = (char**) malloc((n + 1) * sizeof(char *));
   *argv_ptr = argv;
   if (argv == NULL)
      return 0;

   // Second pass: record the start of each substring and terminate it.
   char *p = args + strspn(args, delim);
   for (size_t i = 0; i < n; i++) {
      argv[i] = p;
      p += strcspn(p, delim);
      if (*p) {
         *p++ = '\0';
         p += strspn(p, delim);
      }
   }

   argv[n] = NULL;
   return (unsigned) n;
}
76
77
78 /*
79 * The constructor needs the format string to be used with sscanf and the names
80 * of the respective fields for the dictionary.
81 *
82 */
83
84 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries, storage_mode mode) {
85 // check that we have more than zero entries and that the format string
86 // contains exactly num_entries format elements.
87 if ( num_entries < 1 )
88 printf("Error: You need at least one field !\n");
89
90 num_columns = (size_t) num_entries;
91
92 // count how many entries are parsed in one line (number of %'s)
93 size_t format_num_entries = 0;
94 for(size_t fidx=0;fidx<strlen(fmt);fidx++)
95 if (fmt[fidx] == '%')
96 format_num_entries++;
97
98 if (format_num_entries != num_columns) {
99 printf("Error: For every entry in the format string you have to supply a name!\n");
100 exit(EXIT_FAILURE);
101 }
102
103 // copy the field names to the member variable
104 size_t buf_size = 512;
105 field_names = (char**) malloc(sizeof(char*)*num_columns);
106 for(size_t idx=0;idx<num_columns;idx++) {
107 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
108 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx])+1);
109 field_names[idx][strlen(_fields[idx])] = '\0';
110 //printf("%s\n",field_names[idx]);
111 }
112
113 char* pruned_format_string = (char*) malloc(sizeof(char)*buf_size);
114 size_t pruned_size = 0;
115 for(size_t idx=0;idx<strlen(fmt);idx++) {
116 pruned_format_string[pruned_size] = fmt[idx];
117 pruned_size++;
118 }
119
120 pruned_format_string[pruned_size] = '%';
121 pruned_format_string[++pruned_size] = '\0';
122 //printf("%s\n",pruned_format_string);
123
124 types_list = (char**) malloc(sizeof(char**)*num_columns);
125 char *pruned_ptr = pruned_format_string;
126 //printf("types list\n");
127 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
128 char *part = strtok (pruned_ptr, "%");
129 pruned_ptr = NULL;
130
131 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
132 types_list[f_idx][strlen(part)] = '\0';
133 strncpy(types_list[f_idx],part,strlen(part));
134 //printf("%s(%d) ",part,strlen(part));
135 }
136 //printf("\n");
137 //
138 current_mode = mode;
139 if(current_mode != IN_VECTOR && current_mode != IN_MAP) {
140 printf("Error: Wrong save mode!");
141 exit(EXIT_FAILURE);
142 }
143 }
144
145
146 /*
147 *
148 *
149 *
150 */
151
152 void ParaParser::create_entry_from_line(const char* current_line, char* format_string) {
153 //printf("current line is %s",current_line);
154
155 // create an array of void ptrs
156 char **current_entries = (char**) malloc(sizeof(char*)*num_columns);
157
158 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
159 strncpy(mutable_line,current_line,strlen(current_line));
160 mutable_line[strlen(current_line)-1] = '\t';
161
162 char** line_parts;
163 size_t num_parts = split_args(mutable_line,&line_parts,"\t");
164
165 for(size_t idx=0; idx<num_columns;idx++) {
166 char* col = line_parts[idx];
167
168 //printf("elem:%s\n",col);
169 current_entries[idx] = (char*) malloc(sizeof(char)*strlen(col)+1);
170 //current_entries2[idx] = string(col);
171 strncpy(current_entries[idx],col,strlen(col));
172 current_entries[idx][strlen(col)] = '\0';
173 }
174 free(mutable_line);
175
176 map_key_t id = strtoul(current_entries[0],NULL,10);
177
178 if ( current_mode == IN_VECTOR ) {
179 //printf("size is %zd\n",v_entries->size());
180 v_entries->push_back(current_entries);
181 //printf("size is %zd\n",v_entries->size());
182 }
183
184 if ( current_mode == IN_MAP ) {
185 //printf("size is %zd\n",entries->size());
186 (*entries)[id] = current_entries;
187 //printf("size is %zd\n",entries->size());
188 }
189 }
190
191
192 /*
193 *
194 *
195 *
196 */
197
198 int ParaParser::parseFile(char* reads_filename) {
199 size_t buf_size = 512;
200 char* line = (char*) malloc(sizeof(char)*buf_size);
201
202 //printf("open %s\n",reads_filename);
203 FILE *input_fs = fopen(reads_filename,"r");
204 if (input_fs == NULL)
205 perror("fopen");
206
207 int line_ctr = 0;
208
209 while (getline (&line, &buf_size, input_fs) >= 0)
210 line_ctr++;
211
212 free(line);
213
214 //printf("file has %d lines\n",line_ctr);
215
216 if(input_fs == NULL) {
217 printf("Error: Could not open file: %s",reads_filename);
218 exit(EXIT_FAILURE);
219 }
220
221 int reads_fid = fileno(input_fs);
222 struct stat reads_stat;
223 if ( fstat(reads_fid,&reads_stat) == -1) {
224 perror("fstat");
225 exit(EXIT_FAILURE);
226 }
227
228 off_t reads_filesize = reads_stat.st_size;
229 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
230
231 if ( current_mode == IN_VECTOR )
232 v_entries = new VECTOR();
233
234 if ( current_mode == IN_MAP )
235 entries = new MAP();
236
237 // try to acquire file using mmap
238 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
239 if (reads_area == MAP_FAILED) {
240 perror("mmap");
241 exit(EXIT_FAILURE);
242 }
243
244 close(reads_fid);
245 //printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
246
247 char* lineBeginPtr = (char*) reads_area;
248 char* lineEndPtr = (char*) reads_area;
249 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
250
251 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
252 lineEndPtr++;
253
254 char* current_line = (char*) malloc(sizeof(char)*512);
255 memset(current_line,0,512);
256
257 unsigned long line_size = lineEndPtr - lineBeginPtr;
258 strncpy(current_line,lineBeginPtr,line_size);
259 current_line[line_size] = '\0';
260
261 int readCtr = 0;
262 int status = 0;
263
264 while(1) {
265 if (strcmp(current_line,"") == 0)
266 break;
267
268 create_entry_from_line(current_line,format_string);
269
270 if (status != 0 )
271 printf("Error while parsing line (status=%d).",status);
272
273 lineBeginPtr = lineEndPtr;
274 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
275 lineEndPtr++;
276
277 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
278 current_line[lineEndPtr-lineBeginPtr] = '\0';
279
280 readCtr += 1;
281 }
282
283 // unmap parsed file
284 status = munmap(reads_area,reads_filesize);
285 if(status != 0)
286 perror("munmap");
287
288 // free unneeded variables
289 free(current_line);
290
291 return readCtr;
292 }
293
294
295 /*
296 *
297 *
298 *
299 */
300
301 PyObject* ParaParser::fetchEntry(map_key_t id) {
302 PyObject* line_dict = PyDict_New();
303 //printf("begin of fetchEntry\n");
304 //printf("size of map %d\n",entries->size());
305 //printf("keys:\n");
306 //MAP::iterator iter;
307 //for(iter = entries->begin(); iter != entries->end(); iter++)
308 // printf("%d\n", iter->first);
309 //printf("query key is %lu\n",id);
310
311 char** current_entry;
312
313 if ( current_mode == IN_VECTOR ) {
314 //printf("IN_VECTOR mode\n");
315 if (id >= v_entries->size())
316 return line_dict;
317
318 //printf("size %d\n",v_entries->size());
319 current_entry = (*v_entries)[id];
320 }
321
322 if ( current_mode == IN_MAP ) {
323 //printf("IN_MAP mode\n");
324 MAP::iterator find_it = entries->find(id);
325 if( find_it == entries->end() )
326 return line_dict;
327
328 current_entry = (*entries)[id];
329 }
330
331
332 int status;
333
334 for(size_t idx=0;idx<num_columns;idx++) {
335 char* current_type = types_list[idx];
336 char* current = current_entry[idx];
337
338 // init elem to make compiler happy
339 PyObject* elem = 0;
340
341 if ( strcmp(current_type,"d")==0 )
342 elem = PyInt_FromString(current,NULL,10);
343
344 if ( strcmp(current_type,"f")==0 )
345 elem = PyFloat_FromString(PyString_FromString(current),NULL);
346
347 if ( strcmp(current_type,"s")==0 )
348 elem = PyString_FromString(current);
349
350 if ( strcmp(current_type,"lu")==0 )
351 elem = PyString_FromString(current);
352 //elem = PyInt_FromString(current,NULL,10);
353 //elem = PyLong_FromString(current,NULL,10);
354
355 if (elem == 0)
356 printf("Error: type %s/ elem %s\n",current_type,current);
357
358 free(current);
359
360 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
361 }
362
363 /*
364 //for(size_t idx=0;idx<current_read->size;idx++) {
365 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
366 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
367 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
368 //}
369 */
370
371 //printf("end of fetchEntry\n");
372 return line_dict;
373 }