+ loosened filtering criterion
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
21
22 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
23
24 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
25
26 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
27
28 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
29
30 static char *info = "Usage is:\n./filterReads gff reads output";
31
32 const int read_size = 36;
33
34 int main(int argc, char* argv[]) {
35
36 if(argc != 4) {
37 printf("%s\n",info);
38 exit(EXIT_FAILURE);
39 }
40
41 int status;
42 int filenameSize = 256;
43 char* gff_filename = malloc(sizeof(char)*filenameSize);
44 char* reads_filename = malloc(sizeof(char)*filenameSize);
45 char* output_filename = malloc(sizeof(char)*filenameSize);
46
47 strncpy(gff_filename,argv[1],filenameSize);
48 strncpy(reads_filename,argv[2],filenameSize);
49 strncpy(output_filename,argv[3],filenameSize);
50
51 FILE *gff_fs = fopen(gff_filename,"r");
52 FILE *reads_fs = fopen(reads_filename,"r");
53 FILE *out_fs = fopen(output_filename,"w");
54
55 if(gff_fs == NULL) {
56 printf("Error: Could not open file: %s",gff_filename);
57 exit(EXIT_FAILURE);
58 }
59
60 if(reads_fs == NULL) {
61 printf("Error: Could not open file: %s",reads_filename);
62 exit(EXIT_FAILURE);
63 }
64
65 if(out_fs == NULL) {
66 printf("Error: Could not open file: %s",output_filename);
67 exit(EXIT_FAILURE);
68 }
69
70 struct gene** allGenes;
71 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
72 status = fclose(gff_fs);
73 free(gff_filename);
74 if(status != 0)
75 printf("closing of gff filestream failed!\n");
76
77 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
78
79 process_reads(reads_fs,&allGenes,numGenes,out_fs);
80 status = fclose(reads_fs);
81 status = fclose(out_fs);
82 if(status != 0)
83 perror("fclose");
84
85 free(reads_filename);
86 free(output_filename);
87 return 0;
88 }
89
90 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
91 int status;
92
93 int buffer_size= 64;
94 int chr = 0;
95 int pos = 0;
96 char* seq = malloc(sizeof(char)*buffer_size);
97 unsigned long id = 0;
98 char strand = ' ';
99 int mismatch = 0;
100 int occurrence = 0;
101 int size = 0;
102 int cut = 0;
103 char* prb = malloc(sizeof(char)*buffer_size);
104 char* cal_prb = malloc(sizeof(char)*buffer_size);
105 char* chastity = malloc(sizeof(char)*buffer_size);
106
107 int reads_fid = fileno(reads_fs);
108 struct stat reads_stat;
109 if ( fstat(reads_fid,&reads_stat) == -1) {
110 perror("fstat");
111 exit(EXIT_FAILURE);
112 }
113 off_t reads_filesize = reads_stat.st_size;
114 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
115 int numReads = reads_filesize / 178.0;
116
117 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
118 if (reads_area == MAP_FAILED) {
119 perror("mmap");
120 exit(EXIT_FAILURE);
121 }
122 close(reads_fid);
123 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
124
125 void* linePtr = reads_area;
126 char* current_line = malloc(sizeof(char)*256);
127
128 int SIZE = 500;
129 // initialize boundary arrays
130 void** upstream_end = malloc(sizeof(void*)*SIZE);
131 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
132 void** downstream_start = malloc(sizeof(void*)*SIZE);
133 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
134 int ue,uo,ds,dov;
135 ue = uo = ds = dov = 0;
136
137 int skippedLinesCounter = 0;
138
139 int prev_exon_start = -1;
140 int prev_exon_stop = -1;
141 int cur_exon_start = -1;
142
143 current_line = strncpy(current_line,linePtr,256);
144 int gene_idx = 0;
145 int exon_idx = 1;
146 struct gene* currentGene = (*allGenes)[gene_idx];
147
148 char* disamb_seq = malloc(sizeof(char)*read_size);
149
150 int readCtr = 0;
151 // start of the parsing loop
152 while(1) {
153 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
154 break;
155
156 if (readCtr != 0 && readCtr % 1000000 == 0)
157 printf("Processed %d reads. Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
158
159 //if (gene_idx >= 1833)
160 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
161 //if (readCtr == 2000000)
162 // exit(EXIT_SUCCESS);
163
164 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
165 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
166 if (status < 12) {
167 skippedLinesCounter++;
168 }
169 //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
170 // if the read is occurring several times elsewhere then get rid of it
171 if(occurrence != 1) {
172 while (*(char*)linePtr != '\n') linePtr++;
173 linePtr++;
174 readCtr += 1;
175 current_line = strncpy(current_line,linePtr,256);
176 continue;
177 }
178
179 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
180
181 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
182 gene_idx++;
183 exon_idx = 1;
184 currentGene = (*allGenes)[gene_idx];
185 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
186 ue = uo = ds = dov = 0;
187 continue;
188 }
189
190 if ( pos < currentGene->start ) { // go to next read
191 next_read:
192
193 while (*(char*)linePtr != '\n') linePtr++;
194 linePtr++;
195 readCtr += 1;
196 current_line = strncpy(current_line,linePtr,256);
197 continue;
198 }
199
200 } else { // read IS within gene borders
201
202 exon_label:
203
204 if (exon_idx == currentGene->num_exons) {
205 gene_idx++;
206 exon_idx = 1;
207 currentGene = (*allGenes)[gene_idx];
208 continue;
209 }
210
211 prev_exon_start = currentGene->exon_starts[exon_idx-1];
212 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
213 cur_exon_start = currentGene->exon_starts[exon_idx];
214
215 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
216
217 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
218 exon_idx++;
219
220 if (ue != 0 && dov != 0)
221 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
222
223 if (uo != 0 && ds != 0)
224 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
225
226 ue = uo = ds = dov = 0;
227 goto exon_label;
228 }
229
230 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
231 remove_ambiguities(seq,strlen(seq),disamb_seq);
232 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
233 goto next_read;
234 }
235
236 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
237 goto next_read;
238
239 // if this position is reached the read is somehow overlapping or
240 // exactly on the exon boundary. now determine where exactly:
241 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
242 upstream_end[ue] = linePtr;
243 ue++;
244 goto next_read;
245 }
246
247 if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
248 upstream_overlap[uo] = linePtr;
249 uo++;
250 goto next_read;
251 }
252
253 if ( pos == cur_exon_start ) { // read starts at current exon start
254 downstream_start[ds] = linePtr;
255 ds++;
256 goto next_read;
257 }
258
259 if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
260 downstream_overlap[dov] = linePtr;
261 dov++;
262 goto next_read;
263 }
264
265 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
266 }
267 }
268
269 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
270 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
271
272 free(upstream_end);
273 free(upstream_overlap);
274 free(downstream_start);
275 free(downstream_overlap);
276
277 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
278
279 status = munmap(reads_area,reads_filesize);
280 if(status != 0)
281 perror("munmap");
282
283 //free(current_line);
284 free(seq);
285 free(prb);
286 free(cal_prb);
287 free(chastity);
288 }
289
290 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
291 int up_idx, down_idx, status;
292 char* upstream_line = malloc(sizeof(char)*256);
293 char* downstream_line = malloc(sizeof(char)*256);
294
295 int buffer_size= 64;
296
297 int up_chr = 0;
298 int up_pos = 0;
299 char* up_seq = malloc(sizeof(char)*buffer_size);
300 int up_id = 0;
301 char up_strand = ' ';
302 int up_mismatch = 0;
303 int up_occurrence = 0;
304 int up_sz = 0;
305 int up_cut = 0;
306 char* up_prb = malloc(sizeof(char)*buffer_size);
307 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
308 char* up_chastity = malloc(sizeof(char)*buffer_size);
309
310 int down_chr = 0;
311 int down_pos = 0;
312 char* down_seq = malloc(sizeof(char)*buffer_size);
313 int down_id = 0;
314 char down_strand = ' ';
315 int down_mismatch = 0;
316 int down_occurrence = 0;
317 int down_sz = 0;
318 int down_cut = 0;
319 char* down_prb = malloc(sizeof(char)*buffer_size);
320 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
321 char* down_chastity = malloc(sizeof(char)*buffer_size);
322
323 int new_chr = 0;
324 char* new_seq = malloc(sizeof(char)*buffer_size);
325 char new_strand = ' ';
326 char* new_prb = malloc(sizeof(char)*buffer_size);
327 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
328 char* new_chastity = malloc(sizeof(char)*buffer_size);
329 char* new_up_seq = malloc(sizeof(char)*read_size);
330 char* new_down_seq = malloc(sizeof(char)*read_size);
331
332 up_idx=0;
333 down_idx=0;
334 while(1) {
335 if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
336 break;
337
338 strncpy(upstream_line,upstream[up_idx],256);
339 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
340 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
341 &up_cut,up_prb,up_cal_prb,up_chastity);
342
343 strncpy(downstream_line,downstream[down_idx],256);
344 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
345 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
346 &down_cut,down_prb,down_cal_prb,down_chastity);
347
348 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
349 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
350
351 new_seq[0] = '\0';
352 new_prb[0] = '\0';
353 new_cal_prb[0] = '\0';
354 new_chastity[0] = '\0';
355
356 int fit;
357 int w_size = 6;
358 int overlap = 0;
359 if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
360 overlap = exon_start - down_pos;
361
362 fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
363 if (fit != 1)
364 goto end;
365
366 new_chr = up_chr;
367 new_strand = up_strand;
368
369 strncat(new_seq,new_up_seq+(36-overlap),overlap);
370 strncat(new_prb,up_prb+(36-overlap),overlap);
371 strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
372 strncat(new_chastity,up_chastity+(36-overlap),overlap);
373
374 strncat(new_seq,new_down_seq+overlap,36-overlap);
375 strncat(new_prb,down_prb+overlap,36-overlap);
376 strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
377 strncat(new_chastity,down_chastity+overlap,36-overlap);
378
379 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
380
381 } // merge with read which is upstream overlapping
382
383 if (down_pos == exon_start) {
384 overlap = up_pos+36 - exon_stop;
385 //printf("overlap is %d\n",overlap);
386 //printf("pos are: %d %d\n",up_pos,down_pos);
387 fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
388 if (fit != 1)
389 goto end;
390
391 new_chr = up_chr;
392 new_strand = up_strand;
393
394 strncat(new_seq,new_up_seq,(36-overlap));
395 strncat(new_prb,up_prb,(36-overlap));
396 strncat(new_cal_prb,up_cal_prb,(36-overlap));
397 strncat(new_chastity,up_chastity,(36-overlap));
398
399 strncat(new_seq,new_down_seq,overlap);
400 strncat(new_prb,down_prb,overlap);
401 strncat(new_cal_prb,down_cal_prb,overlap);
402 strncat(new_chastity,down_chastity,overlap);
403
404 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
405 }
406
407 if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
408 printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
409
410 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
411 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
412
413 end:
414
415 up_idx++;
416 down_idx++;
417 }
418
419 free(upstream_line);
420 free(downstream_line);
421
422 free(new_up_seq);
423 free(new_down_seq);
424
425 free(up_prb);
426 free(up_cal_prb);
427 free(up_chastity);
428
429 free(down_prb);
430 free(down_cal_prb);
431 free(down_chastity);
432
433 free(new_prb);
434 free(new_cal_prb);
435 free(new_chastity);
436
437 }
438
439 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
440 double epsilon_mean = 15.0;
441 double epsilon_var = 10.0;
442 double mean_up = 0;
443 double variance_up = 0;
444 double mean_down = 0;
445 double variance_down = 0;
446
447 char *up_ptr = up_prb;
448 char *down_ptr = down_prb;
449
450 int w_size = 0;
451 while(up_ptr != up_prb_end) {
452 mean_up += (*up_ptr)-50;
453 mean_down += (*down_ptr)-50;
454 w_size++;
455 up_ptr++;
456 down_ptr++;
457 }
458 mean_up /= w_size;
459 mean_down /= w_size;
460
461
462 up_ptr = up_prb;
463 down_ptr = down_prb;
464 w_size = 0;
465 while(up_ptr != up_prb_end) {
466 variance_up += pow((*up_prb)-50 - mean_up,2);
467 variance_down += pow((*down_prb)-50 - mean_down,2);
468 w_size++;
469 up_ptr++;
470 down_ptr++;
471 }
472 variance_up /= (w_size-1);
473 variance_down /= (w_size-1);
474
475 //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
476
477 if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
478 return 1;
479
480 return 0;
481 }
482
483 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
484 //printf("old seq: %s\n",old_seq);
485 //printf("new seq: %s\n",new_seq);
486
487 int idx=0;
488 int new_idx = 0;
489 while(idx<old_seq_size) {
490 if (old_seq[idx] == '[') {
491 new_seq[new_idx] = old_seq[++idx];
492 new_idx++;
493 idx += 3;
494 continue;
495 }
496
497 new_seq[new_idx] = old_seq[idx++];
498 new_idx++;
499 }
500 //printf("old seq: %s\n",old_seq);
501 //printf("new seq: %s\n",new_seq);
502 }
503
504 /*
505 * TODO:
506 * - Check strand -> done simple (only if equal)
507 * - check for [AC] and similar entries -> done simple (see function
508 * remove_ambiguities (exchanges [XY] by the first entry)
509 */