+ rewrote some data storage code
[qpalma.git] / ParaParser / ParaParser.cpp
#include "ParaParser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
6
7
/**
 * Split a string in place and return pointers to its parts.
 *
 * \param args     The string to be split. It is modified: the first delimiter
 *                 character following each substring is overwritten with '\0'.
 *                 A \p NULL pointer is treated like an empty string.
 * \param argv_ptr Receives a pointer to a dynamically allocated,
 *                 \p NULL-terminated array of pointers to the substrings
 *                 within \a args. The caller releases the array with free();
 *                 the substrings themselves live inside \a args.
 * \param delim    Set of delimiter characters.
 *
 * Runs of consecutive delimiters count as a single separator and leading or
 * trailing delimiters are ignored, so no empty substring is ever reported.
 *
 * \return The number of substrings found in \a args. If the pointer array
 *         cannot be allocated, \a *argv_ptr is set to \p NULL and 0 is
 *         returned.
 */
unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
    size_t n = 0;

    /* First pass: count the substrings without touching the input. */
    if (args != NULL) {
        for (const char *scan = args + strspn(args, delim); *scan != '\0'; ) {
            scan += strcspn(scan, delim);
            n++;
            scan += strspn(scan, delim);
        }
    }

    char **argv = (char **) malloc((n + 1) * sizeof(char *));
    *argv_ptr = argv;
    if (argv == NULL)
        return 0;

    /* Second pass: record where each substring starts and terminate it. */
    char *p = (n > 0) ? args + strspn(args, delim) : args;
    for (size_t i = 0; i < n; i++) {
        argv[i] = p;
        p += strcspn(p, delim);
        if (*p != '\0') {
            *p++ = '\0';
            p += strspn(p, delim);
        }
    }
    argv[n] = NULL;
    return (unsigned) n;
}
58
59
60 /*
61 * The constructor needs the format string to be used with sscanf and the names
62 * of the respective fields for the dictionary.
63 *
64 */
65
66 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
67 num_columns = num_entries;
68 size_t buf_size = 512;
69 format_string = (char*) malloc(sizeof(char)*buf_size);
70 if (strlen(fmt) > buf_size)
71 perror("format string to long!");
72
73 strncpy(format_string,fmt,strlen(fmt));
74
75 // count how many entries are parsed in one line (number of %'s)
76 int format_num_entries = 0;
77 for(int fidx=0;fidx<strlen(fmt);fidx++)
78 if (format_string[fidx] == '%')
79 format_num_entries++;
80
81 if (format_num_entries != num_columns)
82 printf("Error: For every entry in the format string you have to supply a name!");
83
84 field_names = (char**) malloc(sizeof(char*)*num_columns);
85
86 for(int idx=0;idx<num_columns;idx++) {
87 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
88 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx]));
89 }
90
91 // subtract number of tabs as we don't want them in the parsing
92 char* pruned_format_string = (char*)malloc(sizeof(char*)*strlen(format_string)-num_columns+2);
93 int pruned_idx = 0;
94
95 for(int idx=0;idx<strlen(format_string);idx++) {
96 if (format_string[idx] == '\t')
97 continue;
98
99 pruned_format_string[pruned_idx] = format_string[idx];
100 pruned_idx++;
101 }
102 pruned_format_string[strlen(format_string)-num_columns+1] = '%';
103
104 types_list = (char**) malloc(sizeof(char**)*num_columns);
105 printf("types list\n");
106 char *pruned_ptr = pruned_format_string;
107 for(int f_idx=0;f_idx<num_columns;f_idx++) {
108 char *part = strtok (pruned_ptr, "%");
109 pruned_ptr = NULL;
110
111 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
112 types_list[f_idx][strlen(part)] = '\0';
113 strncpy(types_list[f_idx],part,strlen(part));
114 printf("%s(%d) ",part,strlen(part));
115 }
116 printf("\n");
117 }
118
119 /*
120 *
121 */
122
123 void ParaParser::create_entry_from_line(const char* current_line, char* format_string) {
124 printf("current line is %s",current_line);
125
126 // create an array of void ptrs
127 void **vptr_array = (void**) malloc(sizeof(void*)*num_columns);
128
129 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
130 strncpy(mutable_line,current_line,strlen(current_line));
131
132 char** line_parts;
133 int num_parts = split_args(mutable_line,&line_parts,"\t");
134
135 assert(num_parts == num_columns);
136
137 printf("size of map %d\n",entries->size());
138
139 for(int idx=0; idx<num_columns;idx++) {
140 char* col = line_parts[idx];
141 char* current_type = types_list[idx];
142
143 printf("elem:%s\n",col);
144
145 if ( strcmp(current_type,"d")==0 ) {
146 printf("found int\n");
147 vptr_array[idx] = (int*) malloc(sizeof(int));
148 int elem = atoi(col);
149 memcpy(vptr_array[idx],&elem,sizeof(int));
150 }
151
152 if ( strcmp(current_type,"f")==0 ) {
153 printf("found double\n");
154 vptr_array[idx] = (double*) malloc(sizeof(double));
155 double elem = atof(col);
156 memcpy(vptr_array[idx],&elem,sizeof(double));
157 }
158
159 if ( strcmp(current_type,"s")==0 ) {
160 printf("found string\n");
161 vptr_array[idx] = (char*) malloc(sizeof(char)*strlen(col));
162 memcpy(vptr_array[idx],col,strlen(col));
163 }
164
165 if ( strcmp(current_type,"lu")==0 ) {
166 printf("found unsigned long\n");
167 vptr_array[idx] = (unsigned long*) malloc(sizeof(unsigned long));
168 unsigned long elem = strtoul(col,NULL,10);
169 memcpy(vptr_array[idx],&elem,sizeof(unsigned long));
170 }
171 }
172
173 int *id = (int*) vptr_array[0];
174 (*entries)[*id] = vptr_array;
175 printf("size of map %d\n",entries->size());
176
177 free(mutable_line);
178 }
179
180
181
182 int ParaParser::parseFile(char* reads_filename) {
183 size_t buf_size = 512;
184 char* line = (char*) malloc(sizeof(char)*buf_size);
185
186 printf("open %s\n",reads_filename);
187 FILE *input_fs = fopen(reads_filename,"r");
188 if (input_fs == NULL)
189 perror("fopen");
190
191 int line_ctr = 0;
192
193 while (getline (&line, &buf_size, input_fs) >= 0)
194 line_ctr++;
195
196 free(line);
197
198 printf("file has %d lines\n",line_ctr);
199
200 if(input_fs == NULL) {
201 printf("Error: Could not open file: %s",reads_filename);
202 exit(EXIT_FAILURE);
203 }
204
205 int reads_fid = fileno(input_fs);
206 struct stat reads_stat;
207 if ( fstat(reads_fid,&reads_stat) == -1) {
208 perror("fstat");
209 exit(EXIT_FAILURE);
210 }
211
212 off_t reads_filesize = reads_stat.st_size;
213 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
214
215 entries = new map<unsigned long,void**,KeyCmp>();
216
217 // try to acquire file using mmap
218 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
219 if (reads_area == MAP_FAILED) {
220 perror("mmap");
221 exit(EXIT_FAILURE);
222 }
223
224 close(reads_fid);
225 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
226
227 char* lineBeginPtr = (char*) reads_area;
228 char* lineEndPtr = (char*) reads_area;
229 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
230
231 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
232 lineEndPtr++;
233
234 char* current_line = (char*) malloc(sizeof(char)*512);
235 memset(current_line,0,512);
236
237 unsigned long line_size = lineEndPtr - lineBeginPtr;
238 strncpy(current_line,lineBeginPtr,line_size);
239 current_line[line_size] = '\0';
240
241 int readCtr = 0;
242 int status = 0;
243
244 int num_reads = 0;
245
246 while(1) {
247 if (strcmp(current_line,"") == 0)
248 break;
249
250 create_entry_from_line(current_line,format_string);
251 if (status != 0 )
252 printf("Error while parsing line (status=%d).",status);
253
254 lineBeginPtr = lineEndPtr;
255 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
256 lineEndPtr++;
257
258 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
259 current_line[lineEndPtr-lineBeginPtr] = '\0';
260
261 readCtr += 1;
262 }
263
264 printf("After parsing lines\n");
265
266 // unmap parsed file
267 status = munmap(reads_area,reads_filesize);
268 if(status != 0)
269 perror("munmap");
270
271 printf("After parsing lines\n");
272 // free unneeded variables
273 free(current_line);
274
275 printf("After parsing lines\n");
276 return readCtr;
277 }
278
279
280 /*
281 *
282 *
283 *
284 */
285
286 PyObject* ParaParser::fetchEntry(int id) {
287 printf("begin of fetchEntry\n");
288 PyObject* line_dict = PyDict_New();
289
290
291 printf("size of map %d\n",entries->size());
292
293 map<unsigned long,void**,KeyCmp>::iterator find_it = entries->find((unsigned long)id);
294 if( find_it == entries->end() )
295 return line_dict;
296
297 void** current_entry = (*entries)[id];
298
299 int* _id = (int*) current_entry[0];
300 printf("id is %d\n",*_id);
301
302
303 int status;
304
305 PyObject* elem;
306
307 for(int idx=0;idx<num_columns;idx++) {
308 char* current_type = types_list[idx];
309 if ( strcmp(current_type,"d")==0 )
310 elem = PyInt_FromLong(*(int*)current_entry[idx]);
311
312 if ( strcmp(current_type,"f")==0 )
313 elem = PyFloat_FromDouble(*(double*)current_entry[idx]);
314
315 if ( strcmp(current_type,"s")==0 )
316 elem = PyString_FromString((char*)current_entry[idx]);
317
318 if ( strcmp(current_type,"lu")==0 || strcmp(current_type,"d")==0 )
319 elem = PyLong_FromUnsignedLong(*(int*)current_entry[idx]);
320
321 printf("add item\n");
322 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
323 }
324
325 /*
326 *
327 //size_t idx;
328 //for(idx=0;idx<current_read->size;idx++) {
329 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
330 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
331 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
332 //}
333
334 */
335
336 printf("end of fetchEntry\n");
337 return line_dict;
338 }
339