6d019a5430df3d1a9f4ee8cfc82f93cef99c27e3
[qpalma.git] / ParaParser / ParaParser.cpp
#include "ParaParser.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
6
7
/**
 * Split a string in place and return pointers to its parts.
 *
 * \param args     The string to be split; it is modified by this call.
 * \param argv_ptr Receives a pointer to the newly allocated list of substrings.
 * \param delim    Set of delimiter characters (as understood by strspn()).
 *
 * Leading delimiters are skipped and every run of consecutive delimiter
 * characters separates two substrings, so empty substrings are never
 * produced. The first delimiter character following each substring is
 * overwritten with a zero byte. A \p NULL-terminated array of char pointers
 * is allocated with malloc() and its elements point at the substrings inside
 * \a args; the caller releases the array with free(). The array therefore
 * stays valid only as long as \a args does.
 *
 * If \a args or \a delim is \p NULL, or the allocation fails, \a *argv_ptr
 * is set to \p NULL and 0 is returned.
 *
 * \return The number of substrings found in \a args.
 */

unsigned split_args(char *args, char *** const argv_ptr, const char *delim)
{
    if (argv_ptr == NULL)
        return 0;
    *argv_ptr = NULL;
    if (args == NULL || delim == NULL)
        return 0;

    /* First pass: count the substrings so the array can be sized exactly. */
    size_t n = 0;
    const char *scan = args + strspn(args, delim);
    while (*scan != '\0') {
        scan += strcspn(scan, delim);   /* skip over the substring itself */
        scan += strspn(scan, delim);    /* skip the delimiter run after it */
        n++;
    }

    char **argv = (char **) malloc((n + 1) * sizeof(char *));
    if (argv == NULL)
        return 0;

    /* Second pass: record each substring start and terminate it in place. */
    char *p = args + strspn(args, delim);
    for (size_t i = 0; i < n; i++) {
        argv[i] = p;
        p += strcspn(p, delim);
        if (*p != '\0') {
            *p = '\0';
            p++;
            p += strspn(p, delim);
        }
    }
    argv[n] = NULL;

    *argv_ptr = argv;
    return (unsigned) n;
}
58
59
60 /*
61 * The constructor needs the format string to be used with sscanf and the names
62 * of the respective fields for the dictionary.
63 *
64 */
65
66 ParaParser::ParaParser(const char* fmt, char** _fields, int num_entries) {
67 assert(num_entries>0);
68 num_columns = (size_t) num_entries;
69 size_t buf_size = 512;
70 format_string = (char*) malloc(sizeof(char)*buf_size);
71 if (strlen(fmt) > buf_size)
72 perror("format string to long!");
73
74 strncpy(format_string,fmt,strlen(fmt));
75
76 // count how many entries are parsed in one line (number of %'s)
77 size_t format_num_entries = 0;
78 for(size_t fidx=0;fidx<strlen(fmt);fidx++)
79 if (format_string[fidx] == '%')
80 format_num_entries++;
81
82 if (format_num_entries != num_columns)
83 printf("Error: For every entry in the format string you have to supply a name!");
84
85 field_names = (char**) malloc(sizeof(char*)*num_columns);
86
87 for(size_t idx=0;idx<num_columns;idx++) {
88 field_names[idx] = (char*) malloc(sizeof(char)*buf_size);
89 strncpy(field_names[idx],_fields[idx],strlen(_fields[idx]));
90 }
91
92 // subtract number of tabs as we don't want them in the parsing
93 char* pruned_format_string = (char*)malloc(sizeof(char*)*strlen(format_string)-num_columns+2);
94 int pruned_idx = 0;
95
96 for(size_t idx=0;idx<strlen(format_string);idx++) {
97 if (format_string[idx] == '\t')
98 continue;
99
100 pruned_format_string[pruned_idx] = format_string[idx];
101 pruned_idx++;
102 }
103 pruned_format_string[strlen(format_string)-num_columns+1] = '%';
104
105 types_list = (char**) malloc(sizeof(char**)*num_columns);
106 printf("types list\n");
107 char *pruned_ptr = pruned_format_string;
108 for(size_t f_idx=0;f_idx<num_columns;f_idx++) {
109 char *part = strtok (pruned_ptr, "%");
110 pruned_ptr = NULL;
111
112 types_list[f_idx] = (char*) malloc(sizeof(char*)*strlen(part)+1);
113 types_list[f_idx][strlen(part)] = '\0';
114 strncpy(types_list[f_idx],part,strlen(part));
115 printf("%s(%d) ",part,strlen(part));
116 }
117 printf("\n");
118 }
119
120 /*
121 *
122 */
123
124 void ParaParser::create_entry_from_line(const char* current_line, char* format_string) {
125 printf("current line is %s",current_line);
126
127 // create an array of void ptrs
128 char **current_entries = (char**) malloc(sizeof(char*)*num_columns);
129
130 char* mutable_line = (char*) malloc(sizeof(char)*strlen(current_line));
131 strncpy(mutable_line,current_line,strlen(current_line));
132 mutable_line[strlen(current_line)-1] = '\t';
133
134 char** line_parts;
135 size_t num_parts = split_args(mutable_line,&line_parts,"\t");
136
137 printf("size of map %d\n",entries->size());
138
139 for(size_t idx=0; idx<num_columns;idx++) {
140 char* col = line_parts[idx];
141 char* current_type = types_list[idx];
142
143 printf("elem:%s\n",col);
144 current_entries[idx] = (char*) malloc(sizeof(char)*strlen(col)+1);
145 strncpy(current_entries[idx],col,strlen(col));
146 current_entries[idx][strlen(col)] = '\0';
147 }
148
149 int id = atoi(current_entries[0]);
150 printf("id is %d\n",id);
151 (*entries)[id] = (void**) current_entries;
152 printf("size of map %d\n",entries->size());
153
154 free(mutable_line);
155 }
156
157
158
159 int ParaParser::parseFile(char* reads_filename) {
160 size_t buf_size = 512;
161 char* line = (char*) malloc(sizeof(char)*buf_size);
162
163 printf("open %s\n",reads_filename);
164 FILE *input_fs = fopen(reads_filename,"r");
165 if (input_fs == NULL)
166 perror("fopen");
167
168 int line_ctr = 0;
169
170 while (getline (&line, &buf_size, input_fs) >= 0)
171 line_ctr++;
172
173 free(line);
174
175 printf("file has %d lines\n",line_ctr);
176
177 if(input_fs == NULL) {
178 printf("Error: Could not open file: %s",reads_filename);
179 exit(EXIT_FAILURE);
180 }
181
182 int reads_fid = fileno(input_fs);
183 struct stat reads_stat;
184 if ( fstat(reads_fid,&reads_stat) == -1) {
185 perror("fstat");
186 exit(EXIT_FAILURE);
187 }
188
189 off_t reads_filesize = reads_stat.st_size;
190 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
191
192 entries = new map<unsigned long,void**,KeyCmp>();
193
194 // try to acquire file using mmap
195 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
196 if (reads_area == MAP_FAILED) {
197 perror("mmap");
198 exit(EXIT_FAILURE);
199 }
200
201 close(reads_fid);
202 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
203
204 char* lineBeginPtr = (char*) reads_area;
205 char* lineEndPtr = (char*) reads_area;
206 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
207
208 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
209 lineEndPtr++;
210
211 char* current_line = (char*) malloc(sizeof(char)*512);
212 memset(current_line,0,512);
213
214 unsigned long line_size = lineEndPtr - lineBeginPtr;
215 strncpy(current_line,lineBeginPtr,line_size);
216 current_line[line_size] = '\0';
217
218 int readCtr = 0;
219 int status = 0;
220
221 while(1) {
222 if (strcmp(current_line,"") == 0)
223 break;
224
225 create_entry_from_line(current_line,format_string);
226 if (status != 0 )
227 printf("Error while parsing line (status=%d).",status);
228
229 lineBeginPtr = lineEndPtr;
230 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
231 lineEndPtr++;
232
233 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
234 current_line[lineEndPtr-lineBeginPtr] = '\0';
235
236 readCtr += 1;
237 }
238
239 printf("After parsing lines\n");
240
241 // unmap parsed file
242 status = munmap(reads_area,reads_filesize);
243 if(status != 0)
244 perror("munmap");
245
246 printf("After parsing lines\n");
247 // free unneeded variables
248 free(current_line);
249
250 printf("After parsing lines\n");
251 return readCtr;
252 }
253
254
255 /*
256 *
257 *
258 *
259 */
260
261 PyObject* ParaParser::fetchEntry(int id) {
262 printf("begin of fetchEntry\n");
263 PyObject* line_dict = PyDict_New();
264
265 printf("size of map %d\n",entries->size());
266
267 map<unsigned long,void**,KeyCmp>::iterator find_it = entries->find((unsigned long)id);
268 if( find_it == entries->end() )
269 return line_dict;
270
271 char** current_entry = (char**) (*entries)[id];
272
273 int _id = atoi(current_entry[0]);
274 printf("id is %d\n",_id);
275
276 int status;
277
278 for(size_t idx=0;idx<num_columns;idx++) {
279 char* current_type = types_list[idx];
280 char* current = current_entry[idx];
281
282 // init elem to make compiler happy
283 PyObject* elem = 0;
284
285 if ( strcmp(current_type,"d")==0 )
286 elem = PyInt_FromString(current,NULL,10);
287
288 if ( strcmp(current_type,"f")==0 )
289 elem = PyFloat_FromString(PyString_FromString(current),NULL);
290
291 if ( strcmp(current_type,"s")==0 )
292 elem = PyString_FromString(current);
293
294 if ( strcmp(current_type,"lu")==0 )
295 elem = PyLong_FromString(current,NULL,10);
296
297 if (elem == 0)
298 printf("Error: type %s/ elem %s\n",current_type,current);
299
300 printf("add item\n");
301
302 status = PyDict_SetItem(line_dict, PyString_FromString(field_names[idx]), elem);
303 }
304
305 /*
306 //;
307 //for(size_t idx=0;idx<current_read->size;idx++) {
308 // status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
309 // status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
310 // status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
311 //}
312
313 */
314
315 printf("end of fetchEntry\n");
316 return line_dict;
317 }