+ minor modifications
[qpalma.git] / cparser / qparser.c
1 #include <Python.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6
7
8 #include "read.h"
9
10 #define WITH_QUALITIES 0
11
12 // these two array store the reads respective their ids
13 unsigned long *id_map;
14
15 int map_idx;
16
17 Read **read_array;
18
19 int num_reads;
20
21
22 // the line format is defined as follows
23 // id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
24 const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
25
26 int create_read_from_line(Read* newRead, const char* current_line) {
27 int buffer_size= 64;
28 printf("current line is %s\n",current_line);
29
30 int entries_found = sscanf(current_line,line_format,&(newRead->id),
31 &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
32 newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
33 &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
34
35 if (entries_found != 15) {
36 return -1;
37 }
38
39 Py_ssize_t idx;
40 for(idx=0;idx<strlen(newRead->seq);idx++) {
41 if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
42 newRead->seq[idx] = newRead->seq[idx]+32;
43 }
44
45 for(idx=0;idx<newRead->size;idx++) {
46 newRead->prb[idx] -= 50;
47 newRead->cal_prb[idx] -= 64;
48 newRead->chastity[idx] += 10;
49 }
50
51 if ( newRead->strand == 'D' )
52 newRead->strand = '+';
53
54 if ( newRead->strand == 'P' )
55 newRead->strand = '-';
56
57 return 0;
58 }
59
60
61 static void Py_free_everything() {
62
63 size_t idx;
64 for(idx=0;idx<num_reads;idx++)
65 free_read(read_array[idx]);
66
67 free(read_array);
68 free(id_map);
69 }
70
71
72 static int set_item_from_line(const char* current_line) {
73 Read* current_read = read_alloc();
74 int status = create_read_from_line(current_read,current_line);
75 if(status != 0)
76 return status;
77
78 read_array[num_reads] = current_read;
79 id_map[num_reads] = current_read->id;
80 num_reads++;
81
82 return 0;
83 }
84
85
86
87 static void assign_read(PyObject *read_dict, int index) {
88
89 int status;
90
91 Read* current_read = read_array[index];
92
93 status = PyDict_SetItem(read_dict, PyString_FromString("id"), PyInt_FromLong(current_read->id) );
94 status = PyDict_SetItem(read_dict, PyString_FromString("chr"), PyInt_FromLong(current_read->chr) );
95 status = PyDict_SetItem(read_dict, PyString_FromString("seq"), PyString_FromString(current_read->seq) );
96 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(current_read->strand) );
97 status = PyDict_SetItem(read_dict, PyString_FromString("splitpos"), PyInt_FromLong(current_read->splitpos) );
98 status = PyDict_SetItem(read_dict, PyString_FromString("read_size"), PyInt_FromLong(current_read->size) );
99 status = PyDict_SetItem(read_dict, PyString_FromString("true_cut"), PyInt_FromLong(current_read->true_cut) );
100
101 PyObject* prb_list = PyList_New(current_read->size);
102 PyObject* cal_prb_list = PyList_New(current_read->size);
103 PyObject* chastity_list = PyList_New(current_read->size);
104
105 size_t idx;
106 for(idx=0;idx<current_read->size;idx++) {
107 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]) );
108 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]) );
109 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]) );
110 }
111
112 status = PyDict_SetItem(read_dict, PyString_FromString("prb"), prb_list );
113 status = PyDict_SetItem(read_dict, PyString_FromString("cal_prb"), cal_prb_list );
114 status = PyDict_SetItem(read_dict, PyString_FromString("chastity"), chastity_list );
115 status = PyDict_SetItem(read_dict, PyString_FromString("gene_id"), PyString_FromString(current_read->gene_id) );
116 status = PyDict_SetItem(read_dict, PyString_FromString("p_start"), PyInt_FromLong(current_read->p_start) );
117 status = PyDict_SetItem(read_dict, PyString_FromString("exon_stop"), PyInt_FromLong(current_read->exon_stop) );
118 status = PyDict_SetItem(read_dict, PyString_FromString("exon_start"), PyInt_FromLong(current_read->exon_start) );
119 status = PyDict_SetItem(read_dict, PyString_FromString("p_stop"), PyInt_FromLong(current_read->p_stop) );
120
121 }
122
123
124 static PyObject * Py_fetch_read(PyObject *obj, PyObject *args) {
125 // first define some constant strings
126 unsigned long read_id;
127 unsigned long current_read_id;
128
129 if (!PyArg_ParseTuple(args, "l", &read_id)) {
130 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
131 return NULL;
132 }
133
134 //printf("got index %lu\n",read_id);
135
136 PyObject* read_dict = PyDict_New();
137
138 size_t idx;
139 for(idx=0;idx<num_reads;idx++) {
140 current_read_id = id_map[idx];
141 if (current_read_id == read_id) {
142 assign_read(read_dict,idx);
143 }
144 }
145
146 return read_dict;
147 }
148
149
150 /*
151 * This function parses the original reads file and stores the lines in a
152 * dictionary indexed by the key.
153 *
154 */
155
156 static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
157
158 // first define some constant strings
159 const char* reads_filename;
160
161 if (!PyArg_ParseTuple(args, "s", &reads_filename)) {
162 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
163 return NULL;
164 }
165
166 FILE *reads_fs = fopen(reads_filename,"r");
167
168 if(reads_fs == NULL) {
169 printf("Error: Could not open file: %s",reads_filename);
170 exit(EXIT_FAILURE);
171 }
172
173 int reads_fid = fileno(reads_fs);
174 struct stat reads_stat;
175 if ( fstat(reads_fid,&reads_stat) == -1) {
176 perror("fstat");
177 exit(EXIT_FAILURE);
178 }
179
180 off_t reads_filesize = reads_stat.st_size;
181 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
182
183 // ATTENTION this is an overestimator of the reads in the file
184 // it is NOT the exact number
185 int numReads = reads_filesize / 200.0;
186
187 read_array = malloc(sizeof(Read*)*numReads);
188 id_map = malloc(sizeof(unsigned long)*numReads);
189
190 //printf("Found %d reads.",numReads);
191
192 // try to acquire file using mmap
193 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
194 if (reads_area == MAP_FAILED) {
195 perror("mmap");
196 exit(EXIT_FAILURE);
197 }
198
199 close(reads_fid);
200 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
201
202 char* lineBeginPtr = (char*) reads_area;
203 char* lineEndPtr = (char*) reads_area;
204 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
205
206 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
207 lineEndPtr++;
208
209 char* current_line = malloc(sizeof(char)*512);
210 memset(current_line,0,512);
211
212 unsigned long line_size = lineEndPtr - lineBeginPtr;
213 strncpy(current_line,lineBeginPtr,line_size);
214 current_line[line_size] = '\0';
215
216 int readCtr = 0;
217 int status = 0;
218
219 num_reads = 0;
220 map_idx = 0;
221
222 while(1) {
223 if (strcmp(current_line,"") == 0)
224 break;
225
226 status = set_item_from_line(current_line);
227 if (status != 0 )
228 printf("Error while parsing line (status=%d).",status);
229
230 lineBeginPtr = lineEndPtr;
231 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
232 lineEndPtr++;
233
234 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
235 current_line[lineEndPtr-lineBeginPtr] = '\0';
236
237 readCtr += 1;
238 }
239
240 // clean up
241 status = munmap(reads_area,reads_filesize);
242 if(status != 0)
243 perror("munmap");
244
245 return PyInt_FromLong(0);
246 }
247
248
249 static PyMethodDef qparserMethods[] = {
250 {"parse_reads", Py_parse_reads, METH_VARARGS,"Test UInt8 behaviour."},
251 {"fetch_read", Py_fetch_read, METH_VARARGS,"Test UInt8 behaviour."},
252 //{"free_everything", Py_free_everything, 0,"Test UInt8 behaviour."},
253 {NULL, NULL, 0, NULL} /* Sentinel */
254 };
255
256
257 PyMODINIT_FUNC initqparser(void) {
258 (void) Py_InitModule("qparser", qparserMethods);
259 }
260
261
262 int main(int argc, char *argv[]) {
263 Py_SetProgramName(argv[0]);
264 Py_Initialize();
265 initqparser();
266 }
267
268
269 /*
270 static int set_item_from_line(PyObject *result_dict, const char* current_line) {
271
272 // increment the reference count for the result_dict object because we want
273 // ot modify it
274 Py_INCREF( result_dict );
275
276 //printf("current line is %s\n",current_line);
277 int entries_found = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,&p_start,&exon_stop,&exon_start,&p_stop,&true_cut);
278
279 if (entries_found != 15) {
280 return entries_found;
281 }
282
283 //printf("after sscanf\n");
284 int status;
285
286 // create dictionary representing one line
287 //PyObject* entry_dict = PyDict_New();
288
289 // alternative way using a list instead of a dictionary
290 PyObject* entry_list = PyList_New(15);
291 Py_INCREF( entry_list );
292
293 PyObject *id_py = PyInt_FromLong(id);
294 PyObject *strand_py = PyString_FromString("--");
295
296 if ( strand == 'D' )
297 strand_py = PyString_FromString("+");
298
299 if ( strand == 'P' )
300 strand_py = PyString_FromString("-");
301
302 //printf("before : %s\n",seq);
303 Py_ssize_t idx;
304 for(idx=0;idx<strlen(seq);idx++) {
305 if ( 65 <= seq[idx] && seq[idx] < 85)
306 seq[idx] = seq[idx]+32;
307 }
308 //printf("after : %s\n",seq);
309
310 // add entries of that line
311
312 //status = PyDict_SetItem(entry_dict, PyString_FromString("id"), id_py );
313 //status = PyDict_SetItem(entry_dict, PyString_FromString("chr"), PyInt_FromLong(chr) );
314 //status = PyDict_SetItem(entry_dict, PyString_FromString("seq"), PyString_FromString(seq) );
315 //status = PyDict_SetItem(entry_dict, PyString_FromString("strand"), strand_py );
316 //status = PyDict_SetItem(entry_dict, PyString_FromString("splitpos"), PyInt_FromLong(splitpos) );
317 //status = PyDict_SetItem(entry_dict, PyString_FromString("read_size"), PyInt_FromLong(size) );
318 //status = PyDict_SetItem(entry_dict, PyString_FromString("true_cut"), PyInt_FromLong(true_cut) );
319
320 status = PyList_SetItem(entry_list, 0, id_py );
321 status = PyList_SetItem(entry_list, 1, PyInt_FromLong(chr) );
322 status = PyList_SetItem(entry_list, 2, PyString_FromString(seq) );
323 status = PyList_SetItem(entry_list, 3, strand_py );
324 status = PyList_SetItem(entry_list, 4, PyInt_FromLong(splitpos) );
325 status = PyList_SetItem(entry_list, 5, PyInt_FromLong(size) );
326 status = PyList_SetItem(entry_list, 6, PyInt_FromLong(true_cut) );
327
328 PyObject* prb_list = PyList_New(size);
329 PyObject* cal_prb_list = PyList_New(size);
330 PyObject* chastity_list = PyList_New(size);
331
332 #if WITH_QUALITIES
333 for(idx=0;idx<size;idx++) {
334 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(prb[idx]-50) );
335 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(cal_prb[idx]-64) );
336 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(chastity[idx]+10) );
337 }
338
339 status = PyList_SetItem(entry_list, 7, prb_list );
340 status = PyList_SetItem(entry_list, 8, cal_prb_list );
341 status = PyList_SetItem(entry_list, 9, chastity_list );
342 #else
343 status = PyList_SetItem(entry_list, 7, PyString_FromString("") );
344 status = PyList_SetItem(entry_list, 8, PyString_FromString("") );
345 status = PyList_SetItem(entry_list, 9, PyString_FromString("") );
346 #endif
347
348
349 status = PyList_SetItem(entry_list, 10, PyString_FromString(geneId) );
350 status = PyList_SetItem(entry_list, 11, PyInt_FromLong(p_start) );
351 status = PyList_SetItem(entry_list, 12, PyInt_FromLong(exon_stop) );
352 status = PyList_SetItem(entry_list, 13, PyInt_FromLong(exon_start) );
353 status = PyList_SetItem(entry_list, 14, PyInt_FromLong(p_stop) );
354
355 //status = PyDict_SetItem(entry_dict, PyString_FromString("prb"), prb_list );
356 //status = PyDict_SetItem(entry_dict, PyString_FromString("cal_prb"), cal_prb_list );
357 //status = PyDict_SetItem(entry_dict, PyString_FromString("chastity"), chastity_list );
358 //status = PyDict_SetItem(entry_dict, PyString_FromString("gene_id"), PyString_FromString(geneId) );
359 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_start"), PyInt_FromLong(p_start) );
360 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_stop"), PyInt_FromLong(exon_stop) );
361 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
362 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_stop"), PyInt_FromLong(p_stop) );
363
364 // now save the dictionary representing one line in the dictionary
365 // representing the whole file
366 //status = PyDict_SetItem(result_dict, id_py, entry_dict);
367
368 status = PyDict_SetItem(result_dict, id_py, entry_list);
369 if (status != 0) {
370 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
371 }
372
373 Py_DECREF( entry_list );
374 // decrement the reference count as we are finished with the local
375 // modification of the object
376 Py_DECREF( result_dict );
377
378 return status;
379 }
380 */