+ added missing free for line array
[qpalma.git] / cparser / qparser.c
1 #include <Python.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6
7
8 #include "read.h"
9
10 #define WITH_QUALITIES 0
11
12 // these two array store the reads respective their ids
13 unsigned long *id_map;
14
15 int map_idx;
16
17 const int offset = 1000000000000;
18
19 Read **read_array;
20
21 int num_reads;
22
23 const char* pos_strand = "+";
24 const char* neg_strand = "-";
25
26
27 // the line format is defined as follows
28 // id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
29 const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
30
31 int create_read_from_line(Read* newRead, const char* current_line) {
32 //printf("current line is %s\n",current_line);
33
34 int entries_found = sscanf(current_line,line_format,&(newRead->id),
35 &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
36 newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
37 &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
38
39 if (entries_found != 15) {
40 return -1;
41 }
42
43 // make sequence lowercase but don't destroy brackets
44 Py_ssize_t idx;
45 for(idx=0;idx<strlen(newRead->seq);idx++) {
46 if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
47 newRead->seq[idx] = newRead->seq[idx]+32;
48 }
49
50 if ( newRead->strand == 'D' )
51 newRead->strand = '+';
52
53 if ( newRead->strand == 'P' )
54 newRead->strand = '-';
55
56 return 0;
57 }
58
59
60 static void Py_free_everything() {
61
62 size_t idx;
63 for(idx=0;idx<num_reads;idx++)
64 free_read(read_array[idx]);
65
66 free(read_array);
67 free(id_map);
68 }
69
70
71 static int set_item_from_line(const char* current_line) {
72 Read* current_read = read_alloc();
73 int status = create_read_from_line(current_read,current_line);
74 if(status != 0)
75 return status;
76
77 //print_read(current_read);
78
79 //int read_idx = current_read->id - offset;
80 //if (read_idx > num_reads)
81 // printf("offset error\n");
82 //read_array[read_idx] = current_read;
83
84 read_array[num_reads] = current_read;
85 id_map[num_reads] = current_read->id;
86 num_reads++;
87
88 return 0;
89 }
90
91
92
93 static void assign_read(PyObject *read_dict, int index) {
94
95 int status;
96
97 Read* current_read = read_array[index];
98
99 status = PyDict_SetItem(read_dict, PyString_FromString("id"), PyInt_FromLong(current_read->id) );
100 status = PyDict_SetItem(read_dict, PyString_FromString("chr"), PyInt_FromLong(current_read->chr) );
101 status = PyDict_SetItem(read_dict, PyString_FromString("seq"), PyString_FromString(current_read->seq) );
102
103 if ( current_read->strand == '+' )
104 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(pos_strand) );
105
106 if ( current_read->strand == '-' )
107 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(neg_strand) );
108
109 status = PyDict_SetItem(read_dict, PyString_FromString("splitpos"), PyInt_FromLong(current_read->splitpos) );
110 status = PyDict_SetItem(read_dict, PyString_FromString("read_size"), PyInt_FromLong(current_read->size) );
111 status = PyDict_SetItem(read_dict, PyString_FromString("true_cut"), PyInt_FromLong(current_read->true_cut) );
112
113 PyObject* prb_list = PyList_New(current_read->size);
114 PyObject* cal_prb_list = PyList_New(current_read->size);
115 PyObject* chastity_list = PyList_New(current_read->size);
116
117 size_t idx;
118 for(idx=0;idx<current_read->size;idx++) {
119 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
120 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
121 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
122 }
123
124 status = PyDict_SetItem(read_dict, PyString_FromString("prb"), prb_list );
125 status = PyDict_SetItem(read_dict, PyString_FromString("cal_prb"), cal_prb_list );
126 status = PyDict_SetItem(read_dict, PyString_FromString("chastity"), chastity_list );
127 status = PyDict_SetItem(read_dict, PyString_FromString("gene_id"), PyString_FromString(current_read->gene_id) );
128 status = PyDict_SetItem(read_dict, PyString_FromString("p_start"), PyInt_FromLong(current_read->p_start) );
129 status = PyDict_SetItem(read_dict, PyString_FromString("exon_stop"), PyInt_FromLong(current_read->exon_stop) );
130 status = PyDict_SetItem(read_dict, PyString_FromString("exon_start"), PyInt_FromLong(current_read->exon_start) );
131 status = PyDict_SetItem(read_dict, PyString_FromString("p_stop"), PyInt_FromLong(current_read->p_stop) );
132
133 }
134
135
136 static PyObject * Py_fetch_read(PyObject *obj, PyObject *args) {
137 // first define some constant strings
138 unsigned long read_id;
139 unsigned long current_read_id;
140
141 if (!PyArg_ParseTuple(args, "l", &read_id)) {
142 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
143 return NULL;
144 }
145
146 //printf("got index %lu\n",read_id);
147
148 PyObject* read_dict = PyDict_New();
149
150 //size_t idx = read_id - offset;
151 //assign_read(read_dict,idx);
152
153 size_t idx;
154 for(idx=0;idx<num_reads;idx++) {
155 current_read_id = id_map[idx];
156 if (current_read_id == read_id) {
157 assign_read(read_dict,idx);
158 }
159 }
160
161 return read_dict;
162 }
163
164
165 /*
166 * This function parses the original reads file and stores the lines in a
167 * dictionary indexed by the key.
168 *
169 */
170
171 static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
172
173 // first define some constant strings
174 const char* reads_filename;
175
176 if (!PyArg_ParseTuple(args, "s", &reads_filename)) {
177 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
178 return NULL;
179 }
180
181 FILE *reads_fs = fopen(reads_filename,"r");
182
183 if(reads_fs == NULL) {
184 printf("Error: Could not open file: %s",reads_filename);
185 exit(EXIT_FAILURE);
186 }
187
188 int reads_fid = fileno(reads_fs);
189 struct stat reads_stat;
190 if ( fstat(reads_fid,&reads_stat) == -1) {
191 perror("fstat");
192 exit(EXIT_FAILURE);
193 }
194
195 off_t reads_filesize = reads_stat.st_size;
196 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
197
198 // ATTENTION this is an overestimator of the reads in the file
199 // it is NOT the exact number
200 int num_init_reads = reads_filesize / 200.0;
201
202 read_array = malloc(sizeof(Read*)*num_init_reads);
203 id_map = malloc(sizeof(unsigned long)*num_init_reads);
204
205 //printf("Found %d reads.",numReads);
206
207 // try to acquire file using mmap
208 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
209 if (reads_area == MAP_FAILED) {
210 perror("mmap");
211 exit(EXIT_FAILURE);
212 }
213
214 close(reads_fid);
215 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
216
217 char* lineBeginPtr = (char*) reads_area;
218 char* lineEndPtr = (char*) reads_area;
219 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
220
221 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
222 lineEndPtr++;
223
224 char* current_line = malloc(sizeof(char)*512);
225 memset(current_line,0,512);
226
227 unsigned long line_size = lineEndPtr - lineBeginPtr;
228 strncpy(current_line,lineBeginPtr,line_size);
229 current_line[line_size] = '\0';
230
231 int readCtr = 0;
232 int status = 0;
233
234 num_reads = 0;
235 map_idx = 0;
236
237 while(1) {
238 if (strcmp(current_line,"") == 0)
239 break;
240
241 status = set_item_from_line(current_line);
242 if (status != 0 )
243 printf("Error while parsing line (status=%d).",status);
244
245 lineBeginPtr = lineEndPtr;
246 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
247 lineEndPtr++;
248
249 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
250 current_line[lineEndPtr-lineBeginPtr] = '\0';
251
252 readCtr += 1;
253 }
254
255 // clean up
256 status = munmap(reads_area,reads_filesize);
257 if(status != 0)
258 perror("munmap");
259
260 free(current_line);
261
262 return PyInt_FromLong(num_reads);
263 }
264
265
266 static PyMethodDef qparserMethods[] = {
267 {"parse_reads", Py_parse_reads, METH_VARARGS,"Test UInt8 behaviour."},
268 {"fetch_read", Py_fetch_read, METH_VARARGS,"Test UInt8 behaviour."},
269 //{"free_everything", Py_free_everything, 0,"Test UInt8 behaviour."},
270 {NULL, NULL, 0, NULL} /* Sentinel */
271 };
272
273
274 PyMODINIT_FUNC initqparser(void) {
275 (void) Py_InitModule("qparser", qparserMethods);
276 }
277
278
279 int main(int argc, char *argv[]) {
280 Py_SetProgramName(argv[0]);
281 Py_Initialize();
282 initqparser();
283 }
284
285
286 /*
287 static int set_item_from_line(PyObject *result_dict, const char* current_line) {
288
289 // increment the reference count for the result_dict object because we want
290 // ot modify it
291 Py_INCREF( result_dict );
292
293 //printf("current line is %s\n",current_line);
294 int entries_found = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,&p_start,&exon_stop,&exon_start,&p_stop,&true_cut);
295
296 if (entries_found != 15) {
297 return entries_found;
298 }
299
300 //printf("after sscanf\n");
301 int status;
302
303 // create dictionary representing one line
304 //PyObject* entry_dict = PyDict_New();
305
306 // alternative way using a list instead of a dictionary
307 PyObject* entry_list = PyList_New(15);
308 Py_INCREF( entry_list );
309
310 PyObject *id_py = PyInt_FromLong(id);
311 PyObject *strand_py = PyString_FromString("--");
312
313 if ( strand == 'D' )
314 strand_py = PyString_FromString("+");
315
316 if ( strand == 'P' )
317 strand_py = PyString_FromString("-");
318
319 //printf("before : %s\n",seq);
320 Py_ssize_t idx;
321 for(idx=0;idx<strlen(seq);idx++) {
322 if ( 65 <= seq[idx] && seq[idx] < 85)
323 seq[idx] = seq[idx]+32;
324 }
325 //printf("after : %s\n",seq);
326
327 // add entries of that line
328
329 //status = PyDict_SetItem(entry_dict, PyString_FromString("id"), id_py );
330 //status = PyDict_SetItem(entry_dict, PyString_FromString("chr"), PyInt_FromLong(chr) );
331 //status = PyDict_SetItem(entry_dict, PyString_FromString("seq"), PyString_FromString(seq) );
332 //status = PyDict_SetItem(entry_dict, PyString_FromString("strand"), strand_py );
333 //status = PyDict_SetItem(entry_dict, PyString_FromString("splitpos"), PyInt_FromLong(splitpos) );
334 //status = PyDict_SetItem(entry_dict, PyString_FromString("read_size"), PyInt_FromLong(size) );
335 //status = PyDict_SetItem(entry_dict, PyString_FromString("true_cut"), PyInt_FromLong(true_cut) );
336
337 status = PyList_SetItem(entry_list, 0, id_py );
338 status = PyList_SetItem(entry_list, 1, PyInt_FromLong(chr) );
339 status = PyList_SetItem(entry_list, 2, PyString_FromString(seq) );
340 status = PyList_SetItem(entry_list, 3, strand_py );
341 status = PyList_SetItem(entry_list, 4, PyInt_FromLong(splitpos) );
342 status = PyList_SetItem(entry_list, 5, PyInt_FromLong(size) );
343 status = PyList_SetItem(entry_list, 6, PyInt_FromLong(true_cut) );
344
345 PyObject* prb_list = PyList_New(size);
346 PyObject* cal_prb_list = PyList_New(size);
347 PyObject* chastity_list = PyList_New(size);
348
349 #if WITH_QUALITIES
350 for(idx=0;idx<size;idx++) {
351 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(prb[idx]-50) );
352 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(cal_prb[idx]-64) );
353 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(chastity[idx]+10) );
354 }
355
356 status = PyList_SetItem(entry_list, 7, prb_list );
357 status = PyList_SetItem(entry_list, 8, cal_prb_list );
358 status = PyList_SetItem(entry_list, 9, chastity_list );
359 #else
360 status = PyList_SetItem(entry_list, 7, PyString_FromString("") );
361 status = PyList_SetItem(entry_list, 8, PyString_FromString("") );
362 status = PyList_SetItem(entry_list, 9, PyString_FromString("") );
363 #endif
364
365
366 status = PyList_SetItem(entry_list, 10, PyString_FromString(geneId) );
367 status = PyList_SetItem(entry_list, 11, PyInt_FromLong(p_start) );
368 status = PyList_SetItem(entry_list, 12, PyInt_FromLong(exon_stop) );
369 status = PyList_SetItem(entry_list, 13, PyInt_FromLong(exon_start) );
370 status = PyList_SetItem(entry_list, 14, PyInt_FromLong(p_stop) );
371
372 //status = PyDict_SetItem(entry_dict, PyString_FromString("prb"), prb_list );
373 //status = PyDict_SetItem(entry_dict, PyString_FromString("cal_prb"), cal_prb_list );
374 //status = PyDict_SetItem(entry_dict, PyString_FromString("chastity"), chastity_list );
375 //status = PyDict_SetItem(entry_dict, PyString_FromString("gene_id"), PyString_FromString(geneId) );
376 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_start"), PyInt_FromLong(p_start) );
377 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_stop"), PyInt_FromLong(exon_stop) );
378 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
379 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_stop"), PyInt_FromLong(p_stop) );
380
381 // now save the dictionary representing one line in the dictionary
382 // representing the whole file
383 //status = PyDict_SetItem(result_dict, id_py, entry_dict);
384
385 status = PyDict_SetItem(result_dict, id_py, entry_list);
386 if (status != 0) {
387 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
388 }
389
390 Py_DECREF( entry_list );
391 // decrement the reference count as we are finished with the local
392 // modification of the object
393 Py_DECREF( result_dict );
394
395 return status;
396 }
397 */