+ added support for writing in result file
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
21
22 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
23
24 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
25
26 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
27
28 static char *info = "Usage is:\n./filterReads gff reads output";
29
30 const int read_size = 36;
31
32 int main(int argc, char* argv[]) {
33
34 if(argc != 4) {
35 printf("%s\n",info);
36 exit(EXIT_FAILURE);
37 }
38
39 int status;
40 int filenameSize = 256;
41 char *gff_filename = malloc(sizeof(char)*filenameSize);
42 char *reads_filename = malloc(sizeof(char)*filenameSize);
43 char *output_filename = malloc(sizeof(char)*filenameSize);
44
45 strncpy(gff_filename,argv[1],filenameSize);
46 strncpy(reads_filename,argv[2],filenameSize);
47 strncpy(output_filename,argv[3],filenameSize);
48
49 FILE *gff_fs = fopen(gff_filename,"r");
50 FILE *reads_fs = fopen(reads_filename,"r");
51 FILE *out_fs = fopen(output_filename,"w");
52
53 if(gff_fs == NULL) {
54 printf("Error: Could not open file: %s",gff_filename);
55 exit(EXIT_FAILURE);
56 }
57
58 if(reads_fs == NULL) {
59 printf("Error: Could not open file: %s",reads_filename);
60 exit(EXIT_FAILURE);
61 }
62
63 if(out_fs == NULL) {
64 printf("Error: Could not open file: %s",output_filename);
65 exit(EXIT_FAILURE);
66 }
67
68 struct gene** allGenes;
69 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
70 status = fclose(gff_fs);
71 if(status != 0)
72 printf("closing of gff filestream failed!\n");
73
74 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
75
76 process_reads(reads_fs,&allGenes,numGenes,out_fs);
77
78 status = fclose(reads_fs);
79 status += fclose(out_fs);
80 if(status != 0)
81 printf("closing of filestreams failed!\n");
82
83 //free(allGenes);
84 free(gff_filename);
85 free(reads_filename);
86 return 0;
87 }
88
89 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
90 int status;
91
92 int buffer_size= 64;
93 int chr = 0;
94 int pos = 0;
95 char* seq = malloc(sizeof(char)*buffer_size);
96 unsigned long id = 0;
97 char strand = ' ';
98 int mismatch = 0;
99 int occurrence = 0;
100 int size = 0;
101 int cut = 0;
102 char* prb = malloc(sizeof(char)*buffer_size);
103 char* cal_prb = malloc(sizeof(char)*buffer_size);
104 char* chastity = malloc(sizeof(char)*buffer_size);
105
106 int reads_fid = fileno(reads_fs);
107 struct stat reads_stat;
108 if ( fstat(reads_fid,&reads_stat) == -1) {
109 perror("fstat");
110 exit(EXIT_FAILURE);
111 }
112 off_t reads_filesize = reads_stat.st_size;
113
114 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
115
116 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
117 if (reads_area == MAP_FAILED) {
118 perror("mmap");
119 exit(EXIT_FAILURE);
120 }
121 close(reads_fid);
122
123 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
124
125 void* linePtr = reads_area;
126 char* current_line = malloc(sizeof(char)*256);
127
128 int SIZE = 500;
129 // initialize boundary arrays
130 void** upstream_end = malloc(sizeof(void*)*SIZE);
131 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
132 void** downstream_start = malloc(sizeof(void*)*SIZE);
133 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
134 int ue,uo,ds,dov;
135 ue = uo = ds = dov = 0;
136
137 int skippedLinesCounter = 0;
138
139 int prev_exon_stop = -1;
140 int cur_exon_start = -1;
141
142 current_line = strncpy(current_line,linePtr,256);
143 int gene_idx = 0;
144 int exon_idx = 1;
145 struct gene* currentGene = (*allGenes)[gene_idx];
146
147 int readCtr = 0;
148 // start of the parsing loop
149 while(1) {
150 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
151 break;
152
153 if (readCtr != 0 && readCtr % 100000 == 0)
154 printf("Processed %d reads.\n",readCtr);
155
156 //if (readCtr == 10000000)
157 // exit(EXIT_FAILURE);
158
159 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
160 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
161 if (status < 12) {
162 skippedLinesCounter++;
163 }
164
165 //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
166
167 // if the read is occurring several times elsewhere then get rid of it
168 if(occurrence != 1) {
169 while (*(char*)linePtr != '\n') linePtr++;
170 linePtr++;
171 readCtr += 1;
172 current_line = strncpy(current_line,linePtr,256);
173 continue;
174 }
175
176 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
177
178 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
179 gene_idx++;
180 exon_idx = 1;
181 currentGene = (*allGenes)[gene_idx];
182 continue;
183 }
184
185 if ( pos < currentGene->start ) { // go to next read
186 next_read:
187
188 while (*(char*)linePtr != '\n') linePtr++;
189 linePtr++;
190 readCtr += 1;
191 current_line = strncpy(current_line,linePtr,256);
192 continue;
193 }
194
195 } else { // read IS within gene borders
196
197 exon_label:
198 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
199 cur_exon_start = currentGene->exon_starts[exon_idx];
200 //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
201 //printf("exon_idx %d\n",exon_idx);
202
203 if (exon_idx == currentGene->num_exons) {
204 gene_idx++;
205 exon_idx = 1;
206 currentGene = (*allGenes)[gene_idx];
207 continue;
208 }
209
210 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
211 exon_idx++;
212 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
213 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
214 ue = uo = ds = dov = 0;
215 goto exon_label;
216 }
217
218 if ( pos < prev_exon_stop - read_size + 1 ) { // go to next read
219 goto next_read;
220 }
221
222 // if this position is reached the read is somehow overlapping or
223 // exactly on the exon boundary. now determine where exactly:
224
225 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
226 upstream_end[ue] = linePtr;
227 ue++;
228 goto next_read;
229 }
230
231 if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
232 upstream_overlap[uo] = linePtr;
233 uo++;
234 goto next_read;
235 }
236
237 if (pos == cur_exon_start) { // read starts at current exon start
238 downstream_start[ds] = linePtr;
239 ds++;
240 goto next_read;
241 }
242
243 if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
244 downstream_overlap[dov] = linePtr;
245 dov++;
246 goto next_read;
247 }
248
249 goto next_read;
250 }
251 }
252
253 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
254 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
255
256 free(upstream_end);
257 free(upstream_overlap);
258 free(downstream_start);
259 free(downstream_overlap);
260
261 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
262
263 status = munmap(reads_area,reads_filesize);
264 if(status != 0)
265 printf("munmap failed!\n");
266
267 free(seq);
268 free(prb);
269 free(cal_prb);
270 free(chastity);
271 }
272
273 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
274 //printf("up_/down_size %d %d\n",up_size,down_size);
275
276 if (up_size == 0 || down_size == 0 || exon_stop == -1)
277 return;
278
279 int up_idx, down_idx, status;
280 char* upstream_line = malloc(sizeof(char)*256);
281 char* downstream_line = malloc(sizeof(char)*256);
282
283 int buffer_size= 64;
284
285 int up_chr = 0;
286 int up_pos = 0;
287 char* up_seq = malloc(sizeof(char)*buffer_size);
288 int up_id = 0;
289 char up_strand = ' ';
290 int up_mismatch = 0;
291 int up_occurrence = 0;
292 int up_sz = 0;
293 int up_cut = 0;
294 char* up_prb = malloc(sizeof(char)*buffer_size);
295 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
296 char* up_chastity = malloc(sizeof(char)*buffer_size);
297
298 int down_chr = 0;
299 int down_pos = 0;
300 char* down_seq = malloc(sizeof(char)*buffer_size);
301 int down_id = 0;
302 char down_strand = ' ';
303 int down_mismatch = 0;
304 int down_occurrence = 0;
305 int down_sz = 0;
306 int down_cut = 0;
307 char* down_prb = malloc(sizeof(char)*buffer_size);
308 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
309 char* down_chastity = malloc(sizeof(char)*buffer_size);
310
311 int new_chr = 0;
312 char* new_seq = malloc(sizeof(char)*buffer_size);
313 char new_strand = ' ';
314 char* new_prb = malloc(sizeof(char)*buffer_size);
315 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
316 char* new_chastity = malloc(sizeof(char)*buffer_size);
317
318 up_idx=0;
319 down_idx=0;
320 while(1) {
321 //printf("up_/down_idx %d %d\n",up_idx,down_idx);
322
323 if (up_idx == up_size || down_idx == down_size)
324 break;
325
326 strncpy(upstream_line,upstream[up_idx],256);
327 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
328 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
329 &up_cut,up_prb,up_cal_prb,up_chastity);
330
331 strncpy(downstream_line,downstream[down_idx],256);
332 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
333 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
334 &down_cut,down_prb,down_cal_prb,down_chastity);
335
336 new_prb[0] = '\0';
337 new_cal_prb[0] = '\0';
338 new_chastity[0] = '\0';
339
340 int fit;
341 int w_size = 5;
342 if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
343 int overlap = exon_start - down_pos;
344 //printf("overlap is %d\n",overlap);
345 //printf("pos are: %d %d\n",up_pos,down_pos);
346
347 fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
348 if (fit != 1)
349 goto end;
350
351 new_chr = up_chr;
352 new_strand = up_strand;
353
354 strncat(new_seq,up_seq+(36-overlap),overlap);
355 strncat(new_prb,up_prb+(36-overlap),overlap);
356 strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
357 strncat(new_chastity,up_chastity+(36-overlap),overlap);
358
359 strncat(new_seq,down_seq+overlap,36-overlap);
360 strncat(new_prb,down_prb+overlap,36-overlap);
361 strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
362 strncat(new_chastity,down_chastity+overlap,36-overlap);
363
364 } // merge with read which is upstream overlapping
365
366 if (down_pos == exon_start) {
367 int overlap = up_pos+36 - exon_stop;
368 //printf("overlap is %d\n",overlap);
369 //printf("pos are: %d %d\n",up_pos,down_pos);
370 fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
371 if (fit != 1)
372 goto end;
373
374 new_chr = up_chr;
375 new_strand = up_strand;
376
377 strncat(new_seq,up_seq,(36-overlap));
378 strncat(new_prb,up_prb,(36-overlap));
379 strncat(new_cal_prb,up_cal_prb,(36-overlap));
380 strncat(new_chastity,up_chastity,(36-overlap));
381
382 strncat(new_seq,down_seq,overlap);
383 strncat(new_prb,down_prb,overlap);
384 strncat(new_cal_prb,down_cal_prb,overlap);
385 strncat(new_chastity,down_chastity,overlap);
386 }
387
388 //printf("New entry!\n");
389 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
390 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
391
392 end:
393
394 up_idx++;
395 down_idx++;
396 }
397 }
398
399 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
400 double epsilon_mean = 10.0;
401 double epsilon_var = 10.0;
402 double mean_up = 0;
403 double variance_up = 0;
404 double mean_down = 0;
405 double variance_down = 0;
406
407 char *up_ptr = up_prb;
408 char *down_ptr = down_prb;
409
410 int w_size = 0;
411 while(up_ptr != up_prb_end) {
412 mean_up += (*up_ptr)-50;
413 mean_down += (*down_ptr)-50;
414 w_size++;
415 up_ptr++;
416 down_ptr++;
417 }
418 mean_up /= w_size;
419 mean_down /= w_size;
420
421 //printf("means: %f %f\n",mean_up,mean_down);
422
423 up_ptr = up_prb;
424 down_ptr = down_prb;
425 w_size = 0;
426 while(up_ptr != up_prb_end) {
427 variance_up += pow((*up_prb)-50 - mean_up,2);
428 variance_down += pow((*down_prb)-50 - mean_down,2);
429 w_size++;
430 up_ptr++;
431 down_ptr++;
432 }
433 variance_up /= (w_size-1);
434 variance_down /= (w_size-1);
435
436 //printf("variances: %f %f\n",variance_up,variance_down);
437
438 if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
439 return 1;
440
441 return 0;
442 }
443
444 /*
445 * TODO
446 * - Check strand
447 * - check for [AC] and similar entries
448 */