+ added some assertions
[qpalma.git] / ParaParser / ParaParser.cpp
1 #include "ParaParser.h"
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6 #include <string>
7 using namespace std;
8
9
/**
 * Split string and return pointers to its parts.
 *
 * \param args     The string to be split; it is modified in place.
 * \param argv_ptr Receives a pointer to the list of substrings (must not be
 *                 \p NULL).
 * \param delim    Set of delimiter characters.
 *
 * Leading delimiters and runs of consecutive delimiters are skipped, so no
 * empty substrings are produced. The first delimiter following each substring
 * is overwritten with zero. A \p NULL-terminated array of char* is allocated
 * with malloc() and its elements point to the broken-up substrings within
 * \a args. A pointer to this array is returned via \a argv_ptr; the caller
 * releases the array with free(). The substrings themselves are not copied.
 *
 * If \a args or \a delim is \p NULL the resulting list is empty. If the array
 * cannot be allocated, \a *argv_ptr is set to \p NULL and 0 is returned.
 *
 * \return The number of substrings found in \a args.
 */

unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
    size_t n = 0;
    const bool has_input = (args != NULL && delim != NULL);

    // First pass: count the substrings so the pointer array can be sized.
    if (has_input) {
        const char *scan = args + strspn(args, delim);
        while (*scan) {
            scan += strcspn(scan, delim);
            n++;
            scan += strspn(scan, delim);
        }
    }

    char **argv = (char **) malloc((n + 1) * sizeof(char *));
    *argv_ptr = argv;
    if (argv == NULL)
        return 0;

    // Second pass: record the start of each substring and terminate it.
    if (n > 0) {
        char *p = args + strspn(args, delim);
        for (size_t i = 0; i < n; i++) {
            argv[i] = p;
            p += strcspn(p, delim);
            if (*p) {
                *p++ = '\0';
                p += strspn(p, delim);
            }
        }
    }

    argv[n] = NULL;
    return (unsigned) n;
}
60
61
62 /*
63 * The constructor needs the format string to be used with sscanf and the names
64 * of the respective fields for the dictionary.
65 *
66 */
67
68 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
69 // check that we have more than zero entries and that the format string
70 // contains exactly num_entries format elements.
71 assert(num_entries>0);
72
73 num_columns = (size_t) num_entries;
74
75 // count how many entries are parsed in one line (number of %'s)
76 size_t format_num_entries = 0;
77 for(size_t fidx=0;fidx<strlen(fmt);fidx++)
78 if (fmt[fidx] == '%')
79 format_num_entries++;
80
81 if (format_num_entries != num_columns) {
82 printf("Error: For every entry in the format string you have to supply a name!\n");
83 exit(EXIT_FAILURE);
84 }
85
86 // copy the field names to the member variable
87 size_t buf_size = 512;
88 field_names = (char**) malloc(sizeof(char*)*num_columns);
89 for(size_t idx=0;idx<num_columns;idx++) {
90 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
91 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx]));
92 }
93
94 char* pruned_format_string = (char*) malloc(sizeof(char)*buf_size);
95 size_t pruned_size = 0;
96 for(size_t idx=0;idx<strlen(fmt);idx++) {
97 pruned_format_string[pruned_size] = fmt[idx];
98 pruned_size++;
99 }
100
101 pruned_format_string[pruned_size] = '%';
102 pruned_format_string[++pruned_size] = '\0';
103 //printf("%s\n",pruned_format_string);
104
105 types_list = (char**) malloc(sizeof(char**)*num_columns);
106 char *pruned_ptr = pruned_format_string;
107 //printf("types list\n");
108 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
109 char *part = strtok (pruned_ptr, "%");
110 pruned_ptr = NULL;
111
112 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
113 types_list[f_idx][strlen(part)] = '\0';
114 strncpy(types_list[f_idx],part,strlen(part));
115 //printf("%s(%d) ",part,strlen(part));
116 }
117 //printf("\n");
118 }
119
120
121 /*
122 *
123 *
124 *
125 */
126
127 void ParaParser::create_entry_from_line(const char* current_line, char* format_string) {
128 //printf("current line is %s",current_line);
129
130 // create an array of void ptrs
131 char **current_entries = (char**) malloc(sizeof(char*)*num_columns);
132
133 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
134 strncpy(mutable_line,current_line,strlen(current_line));
135 mutable_line[strlen(current_line)-1] = '\t';
136
137 char** line_parts;
138 size_t num_parts = split_args(mutable_line,&line_parts,"\t");
139
140 for(size_t idx=0; idx<num_columns;idx++) {
141 char* col = line_parts[idx];
142
143 //printf("elem:%s\n",col);
144 current_entries[idx] = (char*) malloc(sizeof(char)*strlen(col)+1);
145 //current_entries2[idx] = string(col);
146 strncpy(current_entries[idx],col,strlen(col));
147 current_entries[idx][strlen(col)] = '\0';
148 }
149 free(mutable_line);
150
151 int id = atoi(current_entries[0]);
152 //printf("id is %d\n",id);
153 //printf("size of map %d\n",entries->size());
154 (*entries)[id] = current_entries;
155 //printf("size of map %d\n",entries->size());
156
157 }
158
159
160 /*
161 *
162 *
163 *
164 */
165
166 int ParaParser::parseFile(char* reads_filename) {
167 size_t buf_size = 512;
168 char* line = (char*) malloc(sizeof(char)*buf_size);
169
170 //printf("open %s\n",reads_filename);
171 FILE *input_fs = fopen(reads_filename,"r");
172 if (input_fs == NULL)
173 perror("fopen");
174
175 int line_ctr = 0;
176
177 while (getline (&line, &buf_size, input_fs) >= 0)
178 line_ctr++;
179
180 free(line);
181
182 //printf("file has %d lines\n",line_ctr);
183
184 if(input_fs == NULL) {
185 printf("Error: Could not open file: %s",reads_filename);
186 exit(EXIT_FAILURE);
187 }
188
189 int reads_fid = fileno(input_fs);
190 struct stat reads_stat;
191 if ( fstat(reads_fid,&reads_stat) == -1) {
192 perror("fstat");
193 exit(EXIT_FAILURE);
194 }
195
196 off_t reads_filesize = reads_stat.st_size;
197 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
198
199 entries = new MAP();
200
201 // try to acquire file using mmap
202 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
203 if (reads_area == MAP_FAILED) {
204 perror("mmap");
205 exit(EXIT_FAILURE);
206 }
207
208 close(reads_fid);
209 //printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
210
211 char* lineBeginPtr = (char*) reads_area;
212 char* lineEndPtr = (char*) reads_area;
213 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
214
215 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
216 lineEndPtr++;
217
218 char* current_line = (char*) malloc(sizeof(char)*512);
219 memset(current_line,0,512);
220
221 unsigned long line_size = lineEndPtr - lineBeginPtr;
222 strncpy(current_line,lineBeginPtr,line_size);
223 current_line[line_size] = '\0';
224
225 int readCtr = 0;
226 int status = 0;
227
228 while(1) {
229 if (strcmp(current_line,"") == 0)
230 break;
231
232 create_entry_from_line(current_line,format_string);
233
234 if (status != 0 )
235 printf("Error while parsing line (status=%d).",status);
236
237 lineBeginPtr = lineEndPtr;
238 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
239 lineEndPtr++;
240
241 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
242 current_line[lineEndPtr-lineBeginPtr] = '\0';
243
244 readCtr += 1;
245 }
246
247 // unmap parsed file
248 status = munmap(reads_area,reads_filesize);
249 if(status != 0)
250 perror("munmap");
251
252 // free unneeded variables
253 free(current_line);
254
255 return readCtr;
256 }
257
258
259 /*
260 *
261 *
262 *
263 */
264
265 PyObject* ParaParser::fetchEntry(int id) {
266 PyObject* line_dict = PyDict_New();
267 //printf("begin of fetchEntry\n");
268 //printf("size of map %d\n",entries->size());
269 //printf("keys:\n");
270 //MAP::iterator iter;
271 //for(iter = entries->begin(); iter != entries->end(); iter++)
272 // printf("%d\n", iter->first);
273 //printf("query key is %d\n",id);
274
275 MAP::iterator find_it = entries->find(id);
276 if( find_it == entries->end() )
277 return line_dict;
278
279 char** current_entry = (*entries)[id];
280
281 int status;
282
283 for(size_t idx=0;idx<num_columns;idx++) {
284 char* current_type = types_list[idx];
285 char* current = current_entry[idx];
286
287 // init elem to make compiler happy
288 PyObject* elem = 0;
289
290 if ( strcmp(current_type,"d")==0 )
291 elem = PyInt_FromString(current,NULL,10);
292
293 if ( strcmp(current_type,"f")==0 )
294 elem = PyFloat_FromString(PyString_FromString(current),NULL);
295
296 if ( strcmp(current_type,"s")==0 )
297 elem = PyString_FromString(current);
298
299 if ( strcmp(current_type,"lu")==0 )
300 elem = PyLong_FromString(current,NULL,10);
301
302 if (elem == 0)
303 printf("Error: type %s/ elem %s\n",current_type,current);
304
305 //printf("add item\n");
306 free(current);
307
308 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
309 }
310
311 /*
312 //for(size_t idx=0;idx<current_read->size;idx++) {
313 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
314 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
315 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
316 //}
317 */
318
319 //printf("end of fetchEntry\n");
320 return line_dict;
321 }