3e52d1628cd571f46905700c2109ae4f4f06d707
[qpalma.git] / cparser / qparser.c
1 #include <Python.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6
7
8 #include "read.h"
9
10 #define WITH_QUALITIES 0
11
12 // these two array store the reads respective their ids
13 unsigned long *id_map;
14
15 int map_idx;
16
17 Read **read_array;
18
19 int num_reads;
20
21
22 // the line format is defined as follows
23 // id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
24 const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
25
26 int create_read_from_line(Read* newRead, const char* current_line) {
27
28 int size = 72;
29 newRead->seq = malloc(sizeof(char)*(size));
30
31 size = 36;
32 newRead->prb = malloc(sizeof(char)*(size));
33 newRead->cal_prb = malloc(sizeof(char)*(size));
34 newRead->chastity = malloc(sizeof(char)*(size));
35 newRead->gene_id = malloc(sizeof(char)*(size));
36
37 int entries_found = sscanf(current_line,line_format,&(newRead->id),
38 &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
39 newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
40 &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
41
42 Py_ssize_t idx;
43 for(idx=0;idx<strlen(newRead->seq);idx++) {
44 if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
45 newRead->seq[idx] = newRead->seq[idx]+32;
46 }
47
48 for(idx=0;idx<size;idx++) {
49 newRead->prb[idx] -= 50;
50 newRead->cal_prb[idx] -= 64;
51 newRead->chastity[idx] += 10;
52 }
53
54 if ( newRead->strand == 'D' )
55 newRead->strand = '+';
56
57 if ( newRead->strand == 'P' )
58 newRead->strand = '-';
59
60 return entries_found;
61 }
62
63
64 static void Py_free_everything(){
65
66 size_t idx;
67 for(idx=0;idx<num_reads;idx++)
68 free_read(read_array[idx]);
69
70 free(read_array);
71 free(id_map);
72 }
73
74
75 static int set_item_from_line(const char* current_line) {
76 printf("current line is %s\n",current_line);
77
78 Read* current_read = read_alloc();
79 int entries_found = create_read_from_line(current_read,current_line);
80
81 if (entries_found != 15) {
82 return entries_found;
83 }
84
85 read_array[num_reads] = current_read;
86 id_map[num_reads] = current_read->id;
87 num_reads++;
88 }
89
90
91
92 static void assign_read(PyObject *read_dict, int index) {
93
94 int status;
95
96 Read* current_read = read_array[index];
97
98 status = PyDict_SetItem(read_dict, PyString_FromString("id"), PyInt_FromLong(current_read->id) );
99 status = PyDict_SetItem(read_dict, PyString_FromString("chr"), PyInt_FromLong(current_read->chr) );
100 status = PyDict_SetItem(read_dict, PyString_FromString("seq"), PyString_FromString(current_read->seq) );
101 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(current_read->strand) );
102 status = PyDict_SetItem(read_dict, PyString_FromString("splitpos"), PyInt_FromLong(current_read->splitpos) );
103 status = PyDict_SetItem(read_dict, PyString_FromString("read_size"), PyInt_FromLong(current_read->size) );
104 status = PyDict_SetItem(read_dict, PyString_FromString("true_cut"), PyInt_FromLong(current_read->true_cut) );
105
106 PyObject* prb_list = PyList_New(current_read->size);
107 PyObject* cal_prb_list = PyList_New(current_read->size);
108 PyObject* chastity_list = PyList_New(current_read->size);
109
110 size_t idx;
111 for(idx=0;idx<current_read->size;idx++) {
112 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]) );
113 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]) );
114 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]) );
115 }
116
117 status = PyDict_SetItem(read_dict, PyString_FromString("prb"), prb_list );
118 status = PyDict_SetItem(read_dict, PyString_FromString("cal_prb"), cal_prb_list );
119 status = PyDict_SetItem(read_dict, PyString_FromString("chastity"), chastity_list );
120 status = PyDict_SetItem(read_dict, PyString_FromString("gene_id"), PyString_FromString(current_read->gene_id) );
121 status = PyDict_SetItem(read_dict, PyString_FromString("p_start"), PyInt_FromLong(current_read->p_start) );
122 status = PyDict_SetItem(read_dict, PyString_FromString("exon_stop"), PyInt_FromLong(current_read->exon_stop) );
123 status = PyDict_SetItem(read_dict, PyString_FromString("exon_start"), PyInt_FromLong(current_read->exon_start) );
124 status = PyDict_SetItem(read_dict, PyString_FromString("p_stop"), PyInt_FromLong(current_read->p_stop) );
125
126 }
127
128
129 static PyObject * Py_fetch_read(PyObject *obj, PyObject *args) {
130 // first define some constant strings
131 unsigned long read_id;
132 unsigned long current_read_id;
133
134 if (!PyArg_ParseTuple(args, "i", &read_id)) {
135 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
136 return NULL;
137 }
138
139 PyObject* read_dict = PyDict_New();
140
141 size_t idx;
142 for(idx=0;idx<num_reads;idx++) {
143 current_read_id = id_map[idx];
144 if (current_read_id == read_id) {
145 assign_read(read_dict,idx);
146 }
147 }
148
149 return read_dict;
150 }
151
152
153 /*
154 * This function parses the original reads file and stores the lines in a
155 * dictionary indexed by the key.
156 *
157 */
158
159 static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
160
161 // first define some constant strings
162 const char* reads_filename;
163
164 if (!PyArg_ParseTuple(args, "s", &reads_filename)) {
165 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
166 return NULL;
167 }
168
169 FILE *reads_fs = fopen(reads_filename,"r");
170
171 if(reads_fs == NULL) {
172 printf("Error: Could not open file: %s",reads_filename);
173 exit(EXIT_FAILURE);
174 }
175
176 int reads_fid = fileno(reads_fs);
177 struct stat reads_stat;
178 if ( fstat(reads_fid,&reads_stat) == -1) {
179 perror("fstat");
180 exit(EXIT_FAILURE);
181 }
182
183 off_t reads_filesize = reads_stat.st_size;
184 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
185
186 // ATTENTION this is an overestimator of the reads in the file
187 // it is NOT the exact number
188 int numReads = reads_filesize / 200.0;
189
190 read_array = malloc(sizeof(Read*)*numReads);
191 id_map = malloc(sizeof(unsigned long)*numReads);
192
193 //printf("Found %d reads.",numReads);
194
195 // try to acquire file using mmap
196 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
197 if (reads_area == MAP_FAILED) {
198 perror("mmap");
199 exit(EXIT_FAILURE);
200 }
201
202 close(reads_fid);
203 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
204
205 char* lineBeginPtr = (char*) reads_area;
206 char* lineEndPtr = (char*) reads_area;
207 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
208
209 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
210 lineEndPtr++;
211
212 char* current_line = malloc(sizeof(char)*512);
213 memset(current_line,0,512);
214
215 unsigned long line_size = lineEndPtr - lineBeginPtr;
216 strncpy(current_line,lineBeginPtr,line_size);
217 current_line[line_size] = '\0';
218
219 int readCtr = 0;
220 int status = 0;
221
222 num_reads = 0;
223 map_idx = 0;
224
225 while(1) {
226 if (strcmp(current_line,"") == 0)
227 break;
228
229 status = set_item_from_line(current_line);
230 if (status != 0 )
231 printf("Error while parsing line (status=%d).",status);
232
233 lineBeginPtr = lineEndPtr;
234 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
235 lineEndPtr++;
236
237 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
238 current_line[lineEndPtr-lineBeginPtr] = '\0';
239
240 readCtr += 1;
241 }
242
243 // clean up
244 status = munmap(reads_area,reads_filesize);
245 if(status != 0)
246 perror("munmap");
247
248 return PyInt_FromLong(0);
249 }
250
251
252 static PyMethodDef qparserMethods[] = {
253 {"parse_reads", Py_parse_reads, METH_VARARGS,"Test UInt8 behaviour."},
254 {"free_everything", Py_free_everything, METH_VARARGS,"Test UInt8 behaviour."},
255 {NULL, NULL, 0, NULL} /* Sentinel */
256 };
257
258
259 PyMODINIT_FUNC initqparser(void) {
260 (void) Py_InitModule("qparser", qparserMethods);
261 }
262
263
264 int main(int argc, char *argv[]) {
265 Py_SetProgramName(argv[0]);
266 Py_Initialize();
267 initqparser();
268 }
269
270
271 /*
272 static int set_item_from_line(PyObject *result_dict, const char* current_line) {
273
274 // increment the reference count for the result_dict object because we want
275 // ot modify it
276 Py_INCREF( result_dict );
277
278 //printf("current line is %s\n",current_line);
279 int entries_found = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,&p_start,&exon_stop,&exon_start,&p_stop,&true_cut);
280
281 if (entries_found != 15) {
282 return entries_found;
283 }
284
285 //printf("after sscanf\n");
286 int status;
287
288 // create dictionary representing one line
289 //PyObject* entry_dict = PyDict_New();
290
291 // alternative way using a list instead of a dictionary
292 PyObject* entry_list = PyList_New(15);
293 Py_INCREF( entry_list );
294
295 PyObject *id_py = PyInt_FromLong(id);
296 PyObject *strand_py = PyString_FromString("--");
297
298 if ( strand == 'D' )
299 strand_py = PyString_FromString("+");
300
301 if ( strand == 'P' )
302 strand_py = PyString_FromString("-");
303
304 //printf("before : %s\n",seq);
305 Py_ssize_t idx;
306 for(idx=0;idx<strlen(seq);idx++) {
307 if ( 65 <= seq[idx] && seq[idx] < 85)
308 seq[idx] = seq[idx]+32;
309 }
310 //printf("after : %s\n",seq);
311
312 // add entries of that line
313
314 //status = PyDict_SetItem(entry_dict, PyString_FromString("id"), id_py );
315 //status = PyDict_SetItem(entry_dict, PyString_FromString("chr"), PyInt_FromLong(chr) );
316 //status = PyDict_SetItem(entry_dict, PyString_FromString("seq"), PyString_FromString(seq) );
317 //status = PyDict_SetItem(entry_dict, PyString_FromString("strand"), strand_py );
318 //status = PyDict_SetItem(entry_dict, PyString_FromString("splitpos"), PyInt_FromLong(splitpos) );
319 //status = PyDict_SetItem(entry_dict, PyString_FromString("read_size"), PyInt_FromLong(size) );
320 //status = PyDict_SetItem(entry_dict, PyString_FromString("true_cut"), PyInt_FromLong(true_cut) );
321
322 status = PyList_SetItem(entry_list, 0, id_py );
323 status = PyList_SetItem(entry_list, 1, PyInt_FromLong(chr) );
324 status = PyList_SetItem(entry_list, 2, PyString_FromString(seq) );
325 status = PyList_SetItem(entry_list, 3, strand_py );
326 status = PyList_SetItem(entry_list, 4, PyInt_FromLong(splitpos) );
327 status = PyList_SetItem(entry_list, 5, PyInt_FromLong(size) );
328 status = PyList_SetItem(entry_list, 6, PyInt_FromLong(true_cut) );
329
330 PyObject* prb_list = PyList_New(size);
331 PyObject* cal_prb_list = PyList_New(size);
332 PyObject* chastity_list = PyList_New(size);
333
334 #if WITH_QUALITIES
335 for(idx=0;idx<size;idx++) {
336 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(prb[idx]-50) );
337 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(cal_prb[idx]-64) );
338 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(chastity[idx]+10) );
339 }
340
341 status = PyList_SetItem(entry_list, 7, prb_list );
342 status = PyList_SetItem(entry_list, 8, cal_prb_list );
343 status = PyList_SetItem(entry_list, 9, chastity_list );
344 #else
345 status = PyList_SetItem(entry_list, 7, PyString_FromString("") );
346 status = PyList_SetItem(entry_list, 8, PyString_FromString("") );
347 status = PyList_SetItem(entry_list, 9, PyString_FromString("") );
348 #endif
349
350
351 status = PyList_SetItem(entry_list, 10, PyString_FromString(geneId) );
352 status = PyList_SetItem(entry_list, 11, PyInt_FromLong(p_start) );
353 status = PyList_SetItem(entry_list, 12, PyInt_FromLong(exon_stop) );
354 status = PyList_SetItem(entry_list, 13, PyInt_FromLong(exon_start) );
355 status = PyList_SetItem(entry_list, 14, PyInt_FromLong(p_stop) );
356
357 //status = PyDict_SetItem(entry_dict, PyString_FromString("prb"), prb_list );
358 //status = PyDict_SetItem(entry_dict, PyString_FromString("cal_prb"), cal_prb_list );
359 //status = PyDict_SetItem(entry_dict, PyString_FromString("chastity"), chastity_list );
360 //status = PyDict_SetItem(entry_dict, PyString_FromString("gene_id"), PyString_FromString(geneId) );
361 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_start"), PyInt_FromLong(p_start) );
362 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_stop"), PyInt_FromLong(exon_stop) );
363 //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
364 //status = PyDict_SetItem(entry_dict, PyString_FromString("p_stop"), PyInt_FromLong(p_stop) );
365
366 // now save the dictionary representing one line in the dictionary
367 // representing the whole file
368 //status = PyDict_SetItem(result_dict, id_py, entry_dict);
369
370 status = PyDict_SetItem(result_dict, id_py, entry_list);
371 if (status != 0) {
372 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
373 }
374
375 Py_DECREF( entry_list );
376 // decrement the reference count as we are finished with the local
377 // modification of the object
378 Py_DECREF( result_dict );
379
380 return status;
381 }
382 */