+ compacted parser code
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15
16 #include "datastructures.h"
17
18 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
19
20 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
21
22 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
23
24 static char *info = "Usage is:\n./filterReads gff reads";
25
26 int main(int argc, char* argv[]) {
27
28 if(argc != 3) {
29 printf("%s\n",info);
30 exit(EXIT_FAILURE);
31 }
32
33 int status;
34 int filenameSize = 256;
35 char *gff_filename = malloc(sizeof(char)*filenameSize);
36 char *reads_filename = malloc(sizeof(char)*filenameSize);
37
38 strncpy(gff_filename,argv[1],filenameSize);
39 strncpy(reads_filename,argv[2],filenameSize);
40
41 FILE *gff_fs = fopen(gff_filename,"r");
42 FILE *reads_fs = fopen(reads_filename,"r");
43
44 if(gff_fs == NULL) {
45 printf("Error: Could not open file: %s",gff_filename);
46 exit(EXIT_FAILURE);
47 }
48
49 if(reads_fs == NULL) {
50 printf("Error: Could not open file: %s",reads_filename);
51 exit(EXIT_FAILURE);
52 }
53
54 struct gene** allGenes;
55 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
56 status = fclose(gff_fs);
57 if(status != 0)
58 printf("closing of gff filestream failed!\n");
59
60 process_reads(reads_fs,&allGenes,numGenes);
61
62 status += fclose(reads_fs);
63 if(status != 0)
64 printf("closing of filestreams failed!\n");
65
66 //free(allGenes);
67 free(gff_filename);
68 free(reads_filename);
69 return 0;
70 }
71
72
73 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
74 int status;
75
76 int buffer_size= 64;
77 int chr = 0;
78 int pos = 0;
79 char* seq = malloc(sizeof(char)*buffer_size);
80 int id = 0;
81 char strand = ' ';
82 int mismatch = 0;
83 int repetition = 0;
84 int size = 0;
85 int cut = 0;
86 char* prb = malloc(sizeof(char)*buffer_size);
87 char* cal_prb = malloc(sizeof(char)*buffer_size);
88 char* chastity = malloc(sizeof(char)*buffer_size);
89
90 int reads_fid = fileno(reads_fs);
91 struct stat* reads_stat = malloc(sizeof(struct stat));
92 status = fstat(reads_fid,reads_stat);
93 if(status != 0)
94 printf("could not stat reads file");
95
96 off_t reads_filesize = reads_stat->st_size;
97 free(reads_stat);
98
99 const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
100 printf("page_size is %d\n",page_size);
101 printf("reads file is of size %d bytes\n",reads_filesize);
102 size_t mmapAreaSize = (reads_filesize / page_size)+1 ;
103
104 void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
105 if((long int) reads_area == -1)
106 printf("mmapping failed!\n");
107
108 void* linePtr = reads_area;
109 char* current_line = malloc(sizeof(char)*256);
110
111 int SIZE = 500;
112 // initialize boundary arrays
113 void** upstream_end = malloc(sizeof(void*)*SIZE);
114 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
115 void** downstream_start = malloc(sizeof(void*)*SIZE);
116 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
117 int ue,uo,ds,dov;
118 ue = uo = ds = dov = 0;
119
120 int skippedLinesCounter = 0;
121
122 int prev_exon_stop = -1;
123 int cur_exon_start = -1;
124
125 current_line = strncpy(current_line,linePtr,256);
126 int gene_idx = 0;
127 int exon_idx = 1;
128 struct gene* currentGene = (*allGenes)[gene_idx];
129 while(1) {
130 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
131 break;
132
133 label:
134 status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
135 &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
136 if (status < 12) {
137 skippedLinesCounter++;
138 }
139
140 printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
141
142 if (!(currentGene->start <= pos && pos+35 <= currentGene->stop)) {
143
144 // go to next gene
145 if ( currentGene->stop < pos+35 ) {
146 gene_idx++;
147 exon_idx = 1;
148 currentGene = (*allGenes)[gene_idx];
149 continue;
150 }
151
152 // go to next line
153 if ( pos < currentGene->start ) {
154 while (*(char*)linePtr != '\n') linePtr++;
155 linePtr++;
156
157 current_line = strncpy(current_line,linePtr,256);
158 continue;
159 }
160
161 } else {
162
163 exon_label:
164 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
165 cur_exon_start = currentGene->exon_starts[exon_idx];
166 printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
167 printf("exon_idx %d\n",exon_idx);
168
169 if (exon_idx == currentGene->num_exons)
170 break;
171
172 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start+36 < pos ) {
173 exon_idx++;
174 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
175 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
176 ue = uo = ds = dov = 0;
177 goto exon_label;
178 }
179
180 if ( pos < prev_exon_stop - 36 ) {
181 while (*(char*)linePtr != '\n') linePtr++;
182 linePtr++;
183
184 current_line = strncpy(current_line,linePtr,256);
185 continue;
186 }
187
188 // upstream ending
189 if (pos+35 == prev_exon_stop) {
190 upstream_end[ue] = linePtr;
191 ue++;
192 } else {
193
194 // upstream overlapping
195 if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
196 upstream_overlap[uo] = linePtr;
197 uo++;
198 }
199 }
200
201 // downstream starting
202 if (pos == cur_exon_start) {
203 downstream_start[ds] = linePtr;
204 ds++;
205 } else {
206
207 // downstream overlapping
208 if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
209 downstream_overlap[dov] = linePtr;
210 dov++;
211 }
212 }
213
214 while (*(char*)linePtr != '\n') linePtr++;
215 linePtr++;
216 current_line = strncpy(current_line,linePtr,256);
217 continue;
218 }
219
220 gene_idx++;
221 }
222
223 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
224 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
225
226 free(upstream_end);
227 free(upstream_overlap);
228 free(downstream_start);
229 free(downstream_overlap);
230
231 printf("skipped %d lines.\n",skippedLinesCounter);
232
233 status = munmap(reads_area,mmapAreaSize);
234 if(status != 0)
235 printf("munmap failed!\n");
236
237 free(seq);
238 free(prb);
239 free(cal_prb);
240 free(chastity);
241 }
242
243 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
244 printf("up_/down_size %d %d\n",up_size,down_size);
245
246 if (up_size == 0 || down_size == 0 || exon_stop == -1)
247 return;
248
249 int up_idx, down_idx, status;
250 char* upstream_line = malloc(sizeof(char)*256);
251 char* downstream_line = malloc(sizeof(char)*256);
252
253 int buffer_size= 64;
254
255 int up_chr = 0;
256 int up_pos = 0;
257 char* up_seq = malloc(sizeof(char)*buffer_size);
258 int up_id = 0;
259 char up_strand = ' ';
260 int up_mismatch = 0;
261 int up_repetition = 0;
262 int up_sz = 0;
263 int up_cut = 0;
264 char* up_prb = malloc(sizeof(char)*buffer_size);
265 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
266 char* up_chastity = malloc(sizeof(char)*buffer_size);
267
268 int down_chr = 0;
269 int down_pos = 0;
270 char* down_seq = malloc(sizeof(char)*buffer_size);
271 int down_id = 0;
272 char down_strand = ' ';
273 int down_mismatch = 0;
274 int down_repetition = 0;
275 int down_sz = 0;
276 int down_cut = 0;
277 char* down_prb = malloc(sizeof(char)*buffer_size);
278 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
279 char* down_chastity = malloc(sizeof(char)*buffer_size);
280
281 int new_chr = 0;
282 int new_pos = 0;
283 char* new_seq = malloc(sizeof(char)*buffer_size);
284 int new_id = 0;
285 char new_strand = ' ';
286 char* new_prb = malloc(sizeof(char)*buffer_size);
287 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
288 char* new_chastity = malloc(sizeof(char)*buffer_size);
289
290 up_idx=0;
291 down_idx=0;
292 while(1) {
293 //printf("up_/down_idx %d %d\n",up_idx,down_idx);
294
295 if (up_idx == up_size || down_idx == down_size)
296 break;
297
298 strncpy(upstream_line,upstream[up_idx],256);
299 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
300 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
301 &up_cut,up_prb,up_cal_prb,up_chastity);
302
303 strncpy(downstream_line,downstream[down_idx],256);
304 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
305 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
306 &down_cut,down_prb,down_cal_prb,down_chastity);
307
308 new_prb[0] = '\0';
309 new_cal_prb[0] = '\0';
310 new_chastity[0] = '\0';
311
312 // is downstream overlapping
313 if (up_pos+35 == exon_stop) {
314 int overlap = exon_start - down_pos;
315 //printf("overlap is %d\n",overlap);
316 //printf("pos are: %d %d\n",up_pos,down_pos);
317
318 strncat(new_prb,up_prb+(36-overlap),overlap);
319 strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
320 strncat(new_chastity,up_chastity+(36-overlap),overlap);
321
322 strncat(new_prb,down_prb+overlap,36-overlap);
323 strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
324 strncat(new_chastity,down_chastity+overlap,36-overlap);
325 // is upstream overlapping
326 }
327
328 if (down_pos == exon_start) {
329 int overlap = up_pos+36 - exon_stop;
330 //printf("overlap is %d\n",overlap);
331 //printf("pos are: %d %d\n",up_pos,down_pos);
332
333 strncat(new_prb,up_prb,(36-overlap));
334 strncat(new_cal_prb,up_cal_prb,(36-overlap));
335 strncat(new_chastity,up_chastity,(36-overlap));
336
337 strncat(new_prb,down_prb,overlap);
338 strncat(new_cal_prb,down_cal_prb,overlap);
339 strncat(new_chastity,down_chastity,overlap);
340 }
341
342 printf("up_prb: %s\n",up_prb);
343 printf("down_prb: %s\n",down_prb);
344 printf("new_prb: %s\n",new_prb);
345
346 up_idx++;
347 down_idx++;
348 }
349 }
350
351 /*
352 * TODO
353 * - Check strand
354 * - check repetition
355 *
356 */