+ changed dataset compilation to support new framework
[qpalma.git] / cparser / qparser.c
1 #include <Python.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6
7 #include "read.h"
8
9 #define WITH_QUALITIES 0
10
11 // these two array store the reads respective their ids
12 unsigned long *id_map;
13
14 int map_idx;
15
16 const int offset = 1000000000000;
17
18 Read **read_array;
19
20 int num_reads;
21
22 const char* pos_strand = "+";
23 const char* neg_strand = "-";
24
25
26 // the line format is defined as follows
27 // id chr strand seq splitpos size q1 q2 q3 geneId p1 p2 p3 p4 true_cut
28 const char* line_format = "%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n";
29
30 int create_read_from_line(Read* newRead, const char* current_line) {
31 //printf("current line is %s\n",current_line);
32
33 int entries_found = sscanf(current_line,line_format,&(newRead->id),
34 &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
35 newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
36 &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
37
38 if (entries_found != 15) {
39 return -1;
40 }
41
42 // make sequence lowercase but don't destroy brackets
43 Py_ssize_t idx;
44 for(idx=0;idx<strlen(newRead->seq);idx++) {
45 if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
46 newRead->seq[idx] = newRead->seq[idx]+32;
47 }
48
49 if ( newRead->strand == 'D' )
50 newRead->strand = '+';
51
52 if ( newRead->strand == 'P' )
53 newRead->strand = '-';
54
55 return 0;
56 }
57
58
59 static void Py_free_everything() {
60
61 size_t idx;
62 for(idx=0;idx<num_reads;idx++)
63 free_read(read_array[idx]);
64
65 free(read_array);
66 free(id_map);
67 }
68
69
70 static int set_item_from_line(const char* current_line) {
71 Read* current_read = read_alloc();
72 int status = create_read_from_line(current_read,current_line);
73 if(status != 0)
74 return status;
75
76 //print_read(current_read);
77
78 //int read_idx = current_read->id - offset;
79 //if (read_idx > num_reads)
80 // printf("offset error\n");
81 //read_array[read_idx] = current_read;
82
83 read_array[num_reads] = current_read;
84 id_map[num_reads] = current_read->id;
85 num_reads++;
86
87 return 0;
88 }
89
90
91
92 static void assign_read(PyObject *read_dict, int index) {
93
94 int status;
95
96 Read* current_read = read_array[index];
97
98 status = PyDict_SetItem(read_dict, PyString_FromString("id"), PyInt_FromLong(current_read->id) );
99 status = PyDict_SetItem(read_dict, PyString_FromString("chr"), PyInt_FromLong(current_read->chr) );
100 status = PyDict_SetItem(read_dict, PyString_FromString("seq"), PyString_FromString(current_read->seq) );
101
102 if ( current_read->strand == '+' )
103 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(pos_strand) );
104
105 if ( current_read->strand == '-' )
106 status = PyDict_SetItem(read_dict, PyString_FromString("strand"), PyString_FromString(neg_strand) );
107
108 status = PyDict_SetItem(read_dict, PyString_FromString("splitpos"), PyInt_FromLong(current_read->splitpos) );
109 status = PyDict_SetItem(read_dict, PyString_FromString("read_size"), PyInt_FromLong(current_read->size) );
110 status = PyDict_SetItem(read_dict, PyString_FromString("true_cut"), PyInt_FromLong(current_read->true_cut) );
111
112 PyObject* prb_list = PyList_New(current_read->size);
113 PyObject* cal_prb_list = PyList_New(current_read->size);
114 PyObject* chastity_list = PyList_New(current_read->size);
115
116 size_t idx;
117 for(idx=0;idx<current_read->size;idx++) {
118 status = PyList_SetItem( prb_list, idx, PyInt_FromLong(current_read->prb[idx]-50) );
119 status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(current_read->cal_prb[idx]-64) );
120 status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(current_read->chastity[idx]+10) );
121 }
122
123 status = PyDict_SetItem(read_dict, PyString_FromString("prb"), prb_list );
124 status = PyDict_SetItem(read_dict, PyString_FromString("cal_prb"), cal_prb_list );
125 status = PyDict_SetItem(read_dict, PyString_FromString("chastity"), chastity_list );
126 status = PyDict_SetItem(read_dict, PyString_FromString("gene_id"), PyString_FromString(current_read->gene_id) );
127 status = PyDict_SetItem(read_dict, PyString_FromString("p_start"), PyInt_FromLong(current_read->p_start) );
128 status = PyDict_SetItem(read_dict, PyString_FromString("exon_stop"), PyInt_FromLong(current_read->exon_stop) );
129 status = PyDict_SetItem(read_dict, PyString_FromString("exon_start"), PyInt_FromLong(current_read->exon_start) );
130 status = PyDict_SetItem(read_dict, PyString_FromString("p_stop"), PyInt_FromLong(current_read->p_stop) );
131
132 }
133
134
135 static PyObject * Py_fetch_read(PyObject *obj, PyObject *args) {
136 // first define some constant strings
137 unsigned long read_id;
138 unsigned long current_read_id;
139
140 if (!PyArg_ParseTuple(args, "l", &read_id)) {
141 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
142 return NULL;
143 }
144
145 //printf("got index %lu\n",read_id);
146
147 PyObject* read_dict = PyDict_New();
148
149 //size_t idx = read_id - offset;
150 //assign_read(read_dict,idx);
151
152 size_t idx;
153 for(idx=0;idx<num_reads;idx++) {
154 current_read_id = id_map[idx];
155 if (current_read_id == read_id) {
156 assign_read(read_dict,idx);
157 }
158 }
159
160 return read_dict;
161 }
162
163
164 /*
165 * This function parses the original reads file and stores the lines in a
166 * dictionary indexed by the key.
167 *
168 */
169
170 static PyObject * Py_parse_reads(PyObject *obj, PyObject *args) {
171
172 // first define some constant strings
173 const char* reads_filename;
174
175 if (!PyArg_ParseTuple(args, "s", &reads_filename)) {
176 PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Invalid parameters.");
177 return NULL;
178 }
179
180 FILE *reads_fs = fopen(reads_filename,"r");
181
182 if(reads_fs == NULL) {
183 printf("Error: Could not open file: %s",reads_filename);
184 exit(EXIT_FAILURE);
185 }
186
187 int reads_fid = fileno(reads_fs);
188 struct stat reads_stat;
189 if ( fstat(reads_fid,&reads_stat) == -1) {
190 perror("fstat");
191 exit(EXIT_FAILURE);
192 }
193
194 off_t reads_filesize = reads_stat.st_size;
195 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
196
197 // ATTENTION this is an overestimator of the reads in the file
198 // it is NOT the exact number
199 int num_init_reads = reads_filesize / 200.0;
200
201 read_array = malloc(sizeof(Read*)*num_init_reads);
202 id_map = malloc(sizeof(unsigned long)*num_init_reads);
203
204 //printf("Found %d reads.",numReads);
205
206 // try to acquire file using mmap
207 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
208 if (reads_area == MAP_FAILED) {
209 perror("mmap");
210 exit(EXIT_FAILURE);
211 }
212
213 close(reads_fid);
214 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
215
216 char* lineBeginPtr = (char*) reads_area;
217 char* lineEndPtr = (char*) reads_area;
218 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
219
220 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
221 lineEndPtr++;
222
223 char* current_line = malloc(sizeof(char)*512);
224 memset(current_line,0,512);
225
226 unsigned long line_size = lineEndPtr - lineBeginPtr;
227 strncpy(current_line,lineBeginPtr,line_size);
228 current_line[line_size] = '\0';
229
230 int readCtr = 0;
231 int status = 0;
232
233 num_reads = 0;
234 map_idx = 0;
235
236 while(1) {
237 if (strcmp(current_line,"") == 0)
238 break;
239
240 status = set_item_from_line(current_line);
241 if (status != 0 )
242 printf("Error while parsing line (status=%d).",status);
243
244 lineBeginPtr = lineEndPtr;
245 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
246 lineEndPtr++;
247
248 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
249 current_line[lineEndPtr-lineBeginPtr] = '\0';
250
251 readCtr += 1;
252 }
253
254 // clean up
255 status = munmap(reads_area,reads_filesize);
256 if(status != 0)
257 perror("munmap");
258
259 free(current_line);
260
261 return PyInt_FromLong(num_reads);
262 }
263
264
265 static PyMethodDef qparserMethods[] = {
266 {"parse_reads", Py_parse_reads, METH_VARARGS,"Test UInt8 behaviour."},
267 {"fetch_read", Py_fetch_read, METH_VARARGS,"Test UInt8 behaviour."},
268 //{"free_everything", Py_free_everything, 0,"Test UInt8 behaviour."},
269 {NULL, NULL, 0, NULL} /* Sentinel */
270 };
271
272
273 PyMODINIT_FUNC initqparser(void) {
274 (void) Py_InitModule("qparser", qparserMethods);
275 }
276
277
278 int main(int argc, char *argv[]) {
279 Py_SetProgramName(argv[0]);
280 Py_Initialize();
281 initqparser();
282 }
283