+ added missing check for strand orientation
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
21
22 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
23
24 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs);
25
26 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
27
28 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
29
30 static char *info = "Usage is:\n./filterReads gff reads output";
31
32 const int read_size = 36;
33
34 int main(int argc, char* argv[]) {
35
36 if(argc != 4) {
37 printf("%s\n",info);
38 exit(EXIT_FAILURE);
39 }
40
41 int status;
42 int filenameSize = 256;
43 char *gff_filename = malloc(sizeof(char)*filenameSize);
44 char *reads_filename = malloc(sizeof(char)*filenameSize);
45 char *output_filename = malloc(sizeof(char)*filenameSize);
46
47 strncpy(gff_filename,argv[1],filenameSize);
48 strncpy(reads_filename,argv[2],filenameSize);
49 strncpy(output_filename,argv[3],filenameSize);
50
51 FILE *gff_fs = fopen(gff_filename,"r");
52 FILE *reads_fs = fopen(reads_filename,"r");
53 FILE *out_fs = fopen(output_filename,"w");
54
55 if(gff_fs == NULL) {
56 printf("Error: Could not open file: %s",gff_filename);
57 exit(EXIT_FAILURE);
58 }
59
60 if(reads_fs == NULL) {
61 printf("Error: Could not open file: %s",reads_filename);
62 exit(EXIT_FAILURE);
63 }
64
65 if(out_fs == NULL) {
66 printf("Error: Could not open file: %s",output_filename);
67 exit(EXIT_FAILURE);
68 }
69
70 struct gene** allGenes;
71 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
72 status = fclose(gff_fs);
73 if(status != 0)
74 printf("closing of gff filestream failed!\n");
75
76 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
77
78 process_reads(reads_fs,&allGenes,numGenes,out_fs);
79
80 status = fclose(reads_fs);
81 status += fclose(out_fs);
82 if(status != 0)
83 printf("closing of filestreams failed!\n");
84
85 //free(allGenes);
86 free(gff_filename);
87 free(reads_filename);
88 return 0;
89 }
90
91 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
92 int status;
93
94 int buffer_size= 64;
95 int chr = 0;
96 int pos = 0;
97 char* seq = malloc(sizeof(char)*buffer_size);
98 unsigned long id = 0;
99 char strand = ' ';
100 int mismatch = 0;
101 int occurrence = 0;
102 int size = 0;
103 int cut = 0;
104 char* prb = malloc(sizeof(char)*buffer_size);
105 char* cal_prb = malloc(sizeof(char)*buffer_size);
106 char* chastity = malloc(sizeof(char)*buffer_size);
107
108 int reads_fid = fileno(reads_fs);
109 struct stat reads_stat;
110 if ( fstat(reads_fid,&reads_stat) == -1) {
111 perror("fstat");
112 exit(EXIT_FAILURE);
113 }
114 off_t reads_filesize = reads_stat.st_size;
115
116 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
117
118 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
119 if (reads_area == MAP_FAILED) {
120 perror("mmap");
121 exit(EXIT_FAILURE);
122 }
123 close(reads_fid);
124
125 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
126
127 void* linePtr = reads_area;
128 char* current_line = malloc(sizeof(char)*256);
129
130 int SIZE = 500;
131 // initialize boundary arrays
132 void** upstream_end = malloc(sizeof(void*)*SIZE);
133 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
134 void** downstream_start = malloc(sizeof(void*)*SIZE);
135 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
136 int ue,uo,ds,dov;
137 ue = uo = ds = dov = 0;
138
139 int skippedLinesCounter = 0;
140
141 int prev_exon_stop = -1;
142 int cur_exon_start = -1;
143
144 current_line = strncpy(current_line,linePtr,256);
145 int gene_idx = 0;
146 int exon_idx = 1;
147 struct gene* currentGene = (*allGenes)[gene_idx];
148
149 int readCtr = 0;
150 // start of the parsing loop
151 while(1) {
152 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
153 break;
154
155 if (readCtr != 0 && readCtr % 100000 == 0)
156 printf("Processed %d reads.\n",readCtr);
157
158 //if (readCtr == 10000000)
159 // exit(EXIT_FAILURE);
160
161 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
162 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
163 if (status < 12) {
164 skippedLinesCounter++;
165 }
166
167 //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
168
169 // if the read is occurring several times elsewhere then get rid of it
170 if(occurrence != 1) {
171 while (*(char*)linePtr != '\n') linePtr++;
172 linePtr++;
173 readCtr += 1;
174 current_line = strncpy(current_line,linePtr,256);
175 continue;
176 }
177
178 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
179
180 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
181 gene_idx++;
182 exon_idx = 1;
183 currentGene = (*allGenes)[gene_idx];
184 continue;
185 }
186
187 if ( pos < currentGene->start ) { // go to next read
188 next_read:
189
190 while (*(char*)linePtr != '\n') linePtr++;
191 linePtr++;
192 readCtr += 1;
193 current_line = strncpy(current_line,linePtr,256);
194 continue;
195 }
196
197 } else { // read IS within gene borders
198
199 exon_label:
200 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
201 cur_exon_start = currentGene->exon_starts[exon_idx];
202 //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
203 //printf("exon_idx %d\n",exon_idx);
204
205 if (exon_idx == currentGene->num_exons) {
206 gene_idx++;
207 exon_idx = 1;
208 currentGene = (*allGenes)[gene_idx];
209 continue;
210 }
211
212 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
213 exon_idx++;
214 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
215 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
216 ue = uo = ds = dov = 0;
217 goto exon_label;
218 }
219
220 if ( pos < prev_exon_stop - read_size + 1 ) { // go to next read
221 goto next_read;
222 }
223
224 // if this position is reached the read is somehow overlapping or
225 // exactly on the exon boundary. now determine where exactly:
226
227 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
228 upstream_end[ue] = linePtr;
229 ue++;
230 goto next_read;
231 }
232
233 if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
234 upstream_overlap[uo] = linePtr;
235 uo++;
236 goto next_read;
237 }
238
239 if (pos == cur_exon_start) { // read starts at current exon start
240 downstream_start[ds] = linePtr;
241 ds++;
242 goto next_read;
243 }
244
245 if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
246 downstream_overlap[dov] = linePtr;
247 dov++;
248 goto next_read;
249 }
250
251 goto next_read;
252 }
253 }
254
255 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
256 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
257
258 free(upstream_end);
259 free(upstream_overlap);
260 free(downstream_start);
261 free(downstream_overlap);
262
263 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
264
265 status = munmap(reads_area,reads_filesize);
266 if(status != 0)
267 printf("munmap failed!\n");
268
269 free(seq);
270 free(prb);
271 free(cal_prb);
272 free(chastity);
273 }
274
275 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
276 //printf("up_/down_size %d %d\n",up_size,down_size);
277
278 if (up_size == 0 || down_size == 0 || exon_stop == -1)
279 return;
280
281 int up_idx, down_idx, status;
282 char* upstream_line = malloc(sizeof(char)*256);
283 char* downstream_line = malloc(sizeof(char)*256);
284
285 int buffer_size= 64;
286
287 int up_chr = 0;
288 int up_pos = 0;
289 char* up_seq = malloc(sizeof(char)*buffer_size);
290 int up_id = 0;
291 char up_strand = ' ';
292 int up_mismatch = 0;
293 int up_occurrence = 0;
294 int up_sz = 0;
295 int up_cut = 0;
296 char* up_prb = malloc(sizeof(char)*buffer_size);
297 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
298 char* up_chastity = malloc(sizeof(char)*buffer_size);
299
300 int down_chr = 0;
301 int down_pos = 0;
302 char* down_seq = malloc(sizeof(char)*buffer_size);
303 int down_id = 0;
304 char down_strand = ' ';
305 int down_mismatch = 0;
306 int down_occurrence = 0;
307 int down_sz = 0;
308 int down_cut = 0;
309 char* down_prb = malloc(sizeof(char)*buffer_size);
310 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
311 char* down_chastity = malloc(sizeof(char)*buffer_size);
312
313 int new_chr = 0;
314 char* new_seq = malloc(sizeof(char)*buffer_size);
315 char new_strand = ' ';
316 char* new_prb = malloc(sizeof(char)*buffer_size);
317 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
318 char* new_chastity = malloc(sizeof(char)*buffer_size);
319
320 up_idx=0;
321 down_idx=0;
322 while(1) {
323 //printf("up_/down_idx %d %d\n",up_idx,down_idx);
324
325 if (up_idx == up_size || down_idx == down_size)
326 break;
327
328 if ( up_strand != down_strand )
329 break;
330
331 strncpy(upstream_line,upstream[up_idx],256);
332 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
333 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
334 &up_cut,up_prb,up_cal_prb,up_chastity);
335
336 strncpy(downstream_line,downstream[down_idx],256);
337 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
338 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
339 &down_cut,down_prb,down_cal_prb,down_chastity);
340
341 char* new_up_seq = malloc(sizeof(char)*read_size);
342 char* new_down_seq = malloc(sizeof(char)*read_size);
343
344 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
345 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
346
347 new_seq[0] = '\0';
348 new_prb[0] = '\0';
349 new_cal_prb[0] = '\0';
350 new_chastity[0] = '\0';
351
352 int fit;
353 int w_size = 5;
354 if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
355 int overlap = exon_start - down_pos;
356 //printf("overlap is %d\n",overlap);
357 //printf("pos are: %d %d\n",up_pos,down_pos);
358
359 fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
360 if (fit != 1)
361 goto end;
362
363 new_chr = up_chr;
364 new_strand = up_strand;
365
366 strncat(new_seq,new_up_seq+(36-overlap),overlap);
367 strncat(new_prb,up_prb+(36-overlap),overlap);
368 strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
369 strncat(new_chastity,up_chastity+(36-overlap),overlap);
370
371 strncat(new_seq,new_down_seq+overlap,36-overlap);
372 strncat(new_prb,down_prb+overlap,36-overlap);
373 strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
374 strncat(new_chastity,down_chastity+overlap,36-overlap);
375
376 } // merge with read which is upstream overlapping
377
378 if (down_pos == exon_start) {
379 int overlap = up_pos+36 - exon_stop;
380 //printf("overlap is %d\n",overlap);
381 //printf("pos are: %d %d\n",up_pos,down_pos);
382 fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
383 if (fit != 1)
384 goto end;
385
386 new_chr = up_chr;
387 new_strand = up_strand;
388
389 strncat(new_seq,new_up_seq,(36-overlap));
390 strncat(new_prb,up_prb,(36-overlap));
391 strncat(new_cal_prb,up_cal_prb,(36-overlap));
392 strncat(new_chastity,up_chastity,(36-overlap));
393
394 strncat(new_seq,new_down_seq,overlap);
395 strncat(new_prb,down_prb,overlap);
396 strncat(new_cal_prb,down_cal_prb,overlap);
397 strncat(new_chastity,down_chastity,overlap);
398 }
399
400 //printf("New entry!\n");
401 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
402 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
403
404 end:
405
406 up_idx++;
407 down_idx++;
408
409 free(new_up_seq);
410 free(new_down_seq);
411 }
412 }
413
414 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
415 double epsilon_mean = 10.0;
416 double epsilon_var = 10.0;
417 double mean_up = 0;
418 double variance_up = 0;
419 double mean_down = 0;
420 double variance_down = 0;
421
422 char *up_ptr = up_prb;
423 char *down_ptr = down_prb;
424
425 int w_size = 0;
426 while(up_ptr != up_prb_end) {
427 mean_up += (*up_ptr)-50;
428 mean_down += (*down_ptr)-50;
429 w_size++;
430 up_ptr++;
431 down_ptr++;
432 }
433 mean_up /= w_size;
434 mean_down /= w_size;
435
436 //printf("means: %f %f\n",mean_up,mean_down);
437
438 up_ptr = up_prb;
439 down_ptr = down_prb;
440 w_size = 0;
441 while(up_ptr != up_prb_end) {
442 variance_up += pow((*up_prb)-50 - mean_up,2);
443 variance_down += pow((*down_prb)-50 - mean_down,2);
444 w_size++;
445 up_ptr++;
446 down_ptr++;
447 }
448 variance_up /= (w_size-1);
449 variance_down /= (w_size-1);
450
451 //printf("variances: %f %f\n",variance_up,variance_down);
452
453 if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
454 return 1;
455
456 return 0;
457 }
458
459 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
460 //printf("old seq: %s\n",old_seq);
461 //printf("new seq: %s\n",new_seq);
462
463 int idx=0;
464 int new_idx = 0;
465 while(idx<old_seq_size) {
466 if (old_seq[idx] == '[') {
467 new_seq[new_idx] = old_seq[++idx];
468 new_idx++;
469 idx += 3;
470 continue;
471 }
472
473 new_seq[new_idx] = old_seq[idx++];
474 new_idx++;
475 }
476 //printf("old seq: %s\n",old_seq);
477 //printf("new seq: %s\n",new_seq);
478 }
479
480 /*
481 * TODO:
482 * - Check strand -> done simple (only if equal)
483 * - check for [AC] and similar entries -> done simple (see function
484 * remove_ambiguities (exchanges [XY] by the first entry)
485 */