+ refactored code further
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 typedef struct tuple {
22 int first;
23 int second;
24 } Tuple;
25
26
27 int compare_gene_struct(struct gene* a, struct gene* b) {
28 return a->stop - b->start;
29 }
30
31 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
32 void sort_genes(struct gene*** allGenes, int numGenes);
33 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
34 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
35 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
36 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
37 void remove_ambiguities(char * old_seq, char* new_seq);
38 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
39
40 static char *info = "Usage is:\n./filterReads gff reads output";
41
42 /*
43 * Some constants specifying the exact behavior of the filter
44 *
45 */
46
47 #define _FDEBUG_ 1
48
49 #define DEBUG_READ 38603
50
51 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
52
53 const int read_size = 36;
54
55 const int min_overlap = 1;
56 const int max_overlap = 35;
57
58 unsigned long read_nr = 1;
59
60 int combined_reads = 0;
61
62
63 int main(int argc, char* argv[]) {
64
65 if(argc != 5) {
66 printf("%s\n",info);
67 exit(EXIT_FAILURE);
68 }
69
70 int status;
71 int filenameSize = 256;
72 char* gff_filename = malloc(sizeof(char)*filenameSize);
73 char* reads_filename = malloc(sizeof(char)*filenameSize);
74 char* output_filename = malloc(sizeof(char)*filenameSize);
75
76
77 strncpy(gff_filename,argv[1],filenameSize);
78 strncpy(reads_filename,argv[2],filenameSize);
79 strncpy(output_filename,argv[3],filenameSize);
80
81 FILE *gff_fs = fopen(gff_filename,"r");
82 FILE *reads_fs = fopen(reads_filename,"r");
83 FILE *out_fs = fopen(output_filename,"w");
84
85 read_nr = strtoul(argv[4],NULL,10);
86 read_nr++;
87
88 if(gff_fs == NULL) {
89 printf("Error: Could not open file: %s",gff_filename);
90 exit(EXIT_FAILURE);
91 }
92
93 if(reads_fs == NULL) {
94 printf("Error: Could not open file: %s",reads_filename);
95 exit(EXIT_FAILURE);
96 }
97
98 if(out_fs == NULL) {
99 printf("Error: Could not open file: %s",output_filename);
100 exit(EXIT_FAILURE);
101 }
102
103 struct gene** allGenes;
104 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
105 status = fclose(gff_fs);
106 free(gff_filename);
107 if(status != 0)
108 printf("closing of gff filestream failed!\n");
109
110 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
111
112 // check if allGenes is sorted. if not throw away those genes that do not
113 // occur in the sorted order
114 int g_idx;
115 struct gene* currentGene = 0;
116 int nulled_genes=0;
117 int old_gene_stop = -1;
118 for(g_idx=0;g_idx<numGenes;g_idx++) {
119 currentGene = allGenes[g_idx];
120
121 if (! (currentGene->start < currentGene->stop))
122 printf("Invalid positions for gene %s!\n",currentGene->id);
123
124 if (! (old_gene_stop < currentGene->start ) ) {
125 old_gene_stop = currentGene->stop;
126 allGenes[g_idx] = 0;
127 nulled_genes++;
128 continue;
129 }
130 old_gene_stop = currentGene->stop;
131 }
132
133 printf("Found %d unordered genes.\n",nulled_genes);
134 int gidx, eidx;
135 int exon_cov = 0;
136 for(gidx=0;gidx<numGenes;gidx++) {
137 if (allGenes[gidx] == 0)
138 continue;
139
140 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
141 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
142 }}
143 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
144
145 process_reads(reads_fs,&allGenes,numGenes,out_fs);
146
147 for(g_idx=0;g_idx<numGenes;g_idx++) {
148 if(allGenes[g_idx] != 0)
149 free_gene(allGenes[g_idx]);
150 free(allGenes[g_idx]);
151
152 }
153 free(allGenes);
154
155 status = fclose(reads_fs);
156 status = fclose(out_fs);
157 if(status != 0)
158 perror("fclose");
159
160 free(reads_filename);
161 free(output_filename);
162 return 0;
163 }
164
165 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
166 int status;
167
168 int buffer_size= 64;
169 int chr = 0;
170 int pos = 0;
171 char* seq = malloc(sizeof(char)*buffer_size);
172 unsigned long id = 0;
173 char strand = ' ';
174 int mismatch = 0;
175 int occurrence = 0;
176 int size = 0;
177 int cut = 0;
178 char* prb = malloc(sizeof(char)*buffer_size);
179 char* cal_prb = malloc(sizeof(char)*buffer_size);
180 char* chastity = malloc(sizeof(char)*buffer_size);
181
182 int reads_fid = fileno(reads_fs);
183 struct stat reads_stat;
184 if ( fstat(reads_fid,&reads_stat) == -1) {
185 perror("fstat");
186 exit(EXIT_FAILURE);
187 }
188 off_t reads_filesize = reads_stat.st_size;
189 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
190 int numReads = reads_filesize / 178.0;
191
192 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
193 if (reads_area == MAP_FAILED) {
194 perror("mmap");
195 exit(EXIT_FAILURE);
196 }
197 close(reads_fid);
198 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
199
200 char* lineBeginPtr = (char*) reads_area;
201 char* lineEndPtr = (char*) reads_area;
202 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
203
204 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
205 lineEndPtr++;
206
207 char* current_line = malloc(sizeof(char)*512);
208 memset(current_line,0,512);
209
210 int SIZE = 500;
211 // initialize boundary arrays
212 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
213 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
214 int uov,dov;
215 uov = dov = 0;
216
217 int skippedLinesCounter = 0;
218
219 int prev_exon_start = -1;
220 int prev_exon_stop = -1;
221 int cur_exon_start = -1;
222
223 unsigned long line_size = lineEndPtr - lineBeginPtr;
224 //printf("distance is %lu\n",line_size);
225 strncpy(current_line,lineBeginPtr,line_size);
226 current_line[line_size] = '\0';
227 //printf("%s test",current_line);
228
229 int gene_idx = 0;
230 int exon_idx = 1;
231 struct gene* currentGene = (*allGenes)[gene_idx];
232 char* gene_id = currentGene->id;
233
234 int skippedReadCtr = 0;
235 int uselessReadCtr = 0;
236 int exonicReadCtr = 0;
237 int endPrevCtr = 0;
238 int prevOverlapCtr = 0;
239 int currentStartCtr = 0;
240 int currentOverlapCtr = 0;
241 int multioccurReadCtr = 0;
242
243 Read* currentRead;
244 int up_idx, down_idx;
245
246 int readCtr = 0;
247 // start of the parsing loop
248 while(1) {
249 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
250 break;
251
252 gene_id = currentGene->id;
253
254 if (readCtr != 0 && readCtr % 1000000 == 0)
255 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
256
257 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
258 if (status < 12) {
259 skippedLinesCounter++;
260 goto next_read;
261 }
262
263 // if the read is occurring several times elsewhere then get rid of it
264 if ( occurrence != 1 ) {
265 multioccurReadCtr++;
266 goto next_read;
267 }
268
269 FA(strlen(seq) >= read_size);
270
271 FA(currentGene != 0);
272
273 if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is within gene borders
274
275 exon_label:
276
277 if (exon_idx == currentGene->num_exons) {
278 gene_idx++;
279 exon_idx = 1;
280 currentGene = (*allGenes)[gene_idx];
281 while( currentGene == 0 && gene_idx < numGenes) {
282 currentGene = (*allGenes)[gene_idx];
283 gene_idx++;
284 }
285 continue;
286 }
287
288 prev_exon_start = currentGene->exon_starts[exon_idx-1];
289 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
290 cur_exon_start = currentGene->exon_starts[exon_idx];
291
292 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
293
294 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
295
296 if (uov != 0 && dov != 0)
297 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
298
299 for(up_idx=0;up_idx<uov;up_idx++) {
300 free_read(upstream_overlap[up_idx]);
301 free(upstream_overlap[up_idx]);
302 }
303
304 for(down_idx=0;down_idx<dov;down_idx++) {
305 free_read(downstream_overlap[down_idx]);
306 free(downstream_overlap[down_idx]);
307 }
308
309 uov = dov = 0;
310
311 exon_idx++;
312 goto exon_label;
313 }
314
315 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
316 exonicReadCtr++;
317 goto next_read;
318 }
319
320 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
321 goto next_read;
322
323 // if this position is reached the read is somehow overlapping or
324 // exactly on the exon boundary.
325 if ( (prev_exon_stop - pos + 1) >= min_overlap && (prev_exon_stop - pos + 1) <= max_overlap ) { // read overlaps with previous exon boundary
326 //printf("%s\n",current_line);
327 prevOverlapCtr++;
328 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
329 upstream_overlap[uov] = currentRead;
330 uov++;
331 goto next_read;
332 }
333
334 int end_pos = pos+read_size-1;
335 if ( (end_pos - cur_exon_start + 1) >= min_overlap && (end_pos - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
336 //printf("%s\n",current_line);
337 currentOverlapCtr++;
338 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
339 downstream_overlap[dov] = currentRead;
340 dov++;
341 goto next_read;
342 }
343
344 uselessReadCtr++;
345 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
346
347 } else { // read is not within gene borders
348
349 if (uov != 0 && dov != 0)
350 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
351
352 for(up_idx=0;up_idx<uov;up_idx++) {
353 free_read(upstream_overlap[up_idx]);
354 free(upstream_overlap[up_idx]);
355 }
356
357 for(down_idx=0;down_idx<dov;down_idx++) {
358 free_read(downstream_overlap[down_idx]);
359 free(downstream_overlap[down_idx]);
360 }
361
362 uov = dov = 0;
363
364 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
365 gene_idx++;
366 exon_idx = 1;
367 currentGene = (*allGenes)[gene_idx];
368 while( currentGene == 0 && gene_idx < numGenes) {
369 currentGene = (*allGenes)[gene_idx];
370 gene_idx++;
371 }
372 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
373 continue;
374 }
375
376 if ( pos < currentGene->start ) { // go to next read
377 skippedReadCtr++;
378
379 next_read:
380
381 lineBeginPtr = lineEndPtr;
382 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
383 lineEndPtr++;
384 readCtr += 1;
385 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
386 current_line[lineEndPtr-lineBeginPtr] = '\0';
387 continue;
388 }
389 }
390 }
391
392 if (uov != 0 && dov != 0)
393 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
394
395 for(up_idx=0;up_idx<uov;up_idx++) {
396 free_read(upstream_overlap[up_idx]);
397 free(upstream_overlap[up_idx]);
398 }
399
400 for(down_idx=0;down_idx<dov;down_idx++) {
401 free_read(downstream_overlap[down_idx]);
402 free(downstream_overlap[down_idx]);
403 }
404
405 uov = dov = 0;
406
407 free(upstream_overlap);
408 free(downstream_overlap);
409
410 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
411 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
412 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
413 printf("%d reads were totally exonic\n",exonicReadCtr);
414 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
415 printf("%d reads where newly combined from two original reads\n",combined_reads);
416 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
417
418 status = munmap(reads_area,reads_filesize);
419 if(status != 0)
420 perror("munmap");
421
422 free(current_line);
423 free(seq);
424 free(prb);
425 free(cal_prb);
426 free(chastity);
427 }
428
429 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
430
431 //printf("up/down size is %d/%d\n",up_size,down_size);
432
433 int up_idx, down_idx, success;
434
435 char* up_used_flag = calloc(up_size,sizeof(char));
436 char* down_used_flag = calloc(down_size,sizeof(char));
437
438 Read* currentUpRead;
439 Read* currentDownRead;
440
441 for(up_idx=0;up_idx<up_size;up_idx++) {
442 if( up_used_flag[up_idx] == 1)
443 continue;
444
445 currentUpRead = upstream[up_idx];
446
447 for(down_idx=0;down_idx<down_size;down_idx++) {
448
449 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
450 continue;
451
452 currentDownRead = downstream[down_idx];
453
454 if(currentUpRead->strand != currentDownRead->strand)
455 continue;
456
457 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
458
459 if(success == 1) {
460 up_used_flag[up_idx] = 1;
461 down_used_flag[down_idx] = 1;
462 }
463
464 }
465 }
466
467 free(up_used_flag);
468 free(down_used_flag);
469 }
470
471 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
472 // range of possible sequence length on exon side
473 int up_range = exon_stop - up_read->pos+1;
474 int down_range = down_read->pos+(read_size-1) - exon_start+1;
475 int retval;
476
477 int u_off, d_off, u_size, d_size;
478 u_size = d_size = -1;
479
480 //printf("possible range %d %d\n",up_range,down_range);
481
482 if(up_range+down_range < read_size)
483 return 0;
484
485 // both overlap more than a half of the new read
486 //if(up_range >= half_read && down_range >= half_read) {
487
488 // u_size = half_read;
489 // d_size = half_read;
490
491 //} else {
492
493 //if(up_range < down_range) {
494 // u_size = up_range;
495 // d_size = read_size - u_size;
496 //}
497
498 //if(up_range > down_range) {
499 // d_size = down_range;
500 // u_size = read_size - d_size;
501 //}
502
503 //if(up_range < down_range) {
504 if (read_nr % 2 != 0) {
505 d_size = down_range;
506 u_size = read_size - d_size;
507 }
508
509 //if(up_range >= down_range) {
510 if (read_nr % 2 == 0) {
511 u_size = up_range;
512 d_size = read_size - u_size;
513 }
514 //}
515
516 int p_start = exon_stop - u_size + 1;
517 int p_stop = exon_start + d_size - 1;
518
519 u_off = p_start - up_read->pos;
520 d_off = exon_start - down_read->pos;
521
522 FA(u_size != -1);
523 FA(d_size != -1);
524 FA(u_size + d_size == read_size);
525
526 int buf_size = 4*read_size;
527 char* new_up_seq = malloc(sizeof(char)*buf_size);
528 char* new_down_seq = malloc(sizeof(char)*buf_size);
529 char* new_seq = malloc(sizeof(char)*buf_size);
530
531 memset(new_up_seq,0,sizeof(char)*buf_size);
532 memset(new_down_seq,0,sizeof(char)*buf_size);
533 memset(new_seq,0,sizeof(char)*buf_size);
534
535 //remove_ambiguities(up_read->seq,new_up_seq);
536 //remove_ambiguities(down_read->seq,new_down_seq);
537 //
538 strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
539 strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
540
541 Tuple jinfo = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
542 int cut_pos = jinfo.first;
543 int additional_pos = jinfo.second;
544
545 buf_size = read_size+1+additional_pos;
546
547 char* new_prb = malloc(sizeof(char)*buf_size);
548 char* new_cal_prb = malloc(sizeof(char)*buf_size);
549 char* new_chastity = malloc(sizeof(char)*buf_size);
550
551 if( additional_pos > (down_range - d_size) ) {
552 retval = 0;
553 goto free;
554 }
555
556 strncpy(new_prb, up_read->prb+u_off, u_size);
557 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
558 new_prb[read_size+additional_pos] = '\0';
559
560 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
561 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
562 new_cal_prb[read_size+additional_pos] = '\0';
563
564 strncpy(new_chastity, up_read->chastity+u_off, u_size);
565 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
566 new_chastity[read_size+additional_pos] = '\0';
567
568 #ifdef _FDEBUG_
569 if(read_nr == DEBUG_READ) {
570 printf("read nr %lu",read_nr);
571 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
572 printf("u/d range: %d %d\n",up_range,down_range);
573 printf("u/d size: %d %d\n",u_size,d_size);
574 printf("u/d off: %d %d\n",u_off,d_off);
575 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
576
577 printf("******************\n");
578
579 printf("%s\n",up_read->seq);
580 printf("%s\n",new_up_seq);
581
582 printf("******************\n");
583
584 printf("%s\n",down_read->seq);
585 printf("%s\n",new_down_seq);
586
587 printf("******************\n");
588 printf("%s\n",new_seq);
589 printf("%s\n",new_prb);
590 printf("%s\n",new_cal_prb);
591 printf("%s\n",new_chastity);
592 }
593 #endif // _FDEBUG_
594
595 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
596
597 if(status != 1) {
598 retval = 0;
599 goto free;
600 }
601
602 retval = status;
603
604 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
605 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
606
607 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
608 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
609
610 read_nr++;
611 combined_reads++;
612
613 free:
614 free(new_up_seq);
615 free(new_down_seq);
616
617 free(new_seq);
618 free(new_prb);
619 free(new_cal_prb);
620 free(new_chastity);
621
622 return retval;
623 }
624
625 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
626 double current_mean_up = 0;
627 double current_mean_down = 0;
628
629 int w_size = 2;
630
631 int idx;
632
633 //printf("prb %s\n",up_prb);
634 //printf("prb %s\n",down_prb);
635
636 for(idx=0;idx<w_size;idx++) {
637 current_mean_up += up_prb[u_off+u_size+idx]-50;
638 current_mean_up += up_prb[u_off+u_size-idx]-50;
639 current_mean_up -= up_prb[idx]-50;
640
641 current_mean_down += up_prb[d_off+idx]-50;
642 current_mean_down += up_prb[d_off+idx]-50;
643 current_mean_down -= up_prb[idx]-50;
644 }
645
646 current_mean_up /= (2*w_size+1);
647 current_mean_down /= (2*w_size+1);
648
649 double ratio;
650 if( current_mean_up > current_mean_down)
651 ratio = current_mean_down / current_mean_up;
652 else
653 ratio = current_mean_up / current_mean_down;
654
655 //printf("ratio is %f\n",ratio);
656
657 if (ratio < 0.35)
658 return 0;
659
660 return 1;
661 }
662
663 /*
664 * As we can have 3 kinds of ambiguities
665 *
666 * [AG]
667 * [A-]
668 * [-A]
669 *
670 * we want to remove/store these ambiguities in the filtered reads
671 */
672
673 void remove_ambiguities(char * old_seq, char* new_seq) {
674
675 FA(0);
676
677 int idx=0;
678 int new_idx = 0;
679 int old_seq_size = strlen(old_seq);
680
681 while(idx<old_seq_size) {
682 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
683 if (old_seq[idx] == ']') {
684 idx++;
685 continue;
686 }
687
688 if (old_seq[idx] == '[') {
689
690 // we have a indel either on the dna or read sequence
691 if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
692
693
694 while(1) {
695 new_seq[new_idx] = old_seq[idx];
696 if(old_seq[idx] == ']')
697 break;
698 new_idx++;
699 idx++;
700 }
701
702 } else {
703 idx += 2;
704 continue;
705 }
706 }
707
708 new_seq[new_idx] = old_seq[idx];
709 idx++;
710 new_idx++;
711 }
712
713 new_seq[new_idx] = '\0';
714 //printf("new_seq is %d :: %s\n",new_idx,new_seq);
715 }
716
717
718 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
719 int new_idx, idx;
720 new_idx = idx = 0;
721 int elem_ctr = 0;
722 int read_del_ctr = 0;
723
724 int u_idx = 0;
725 int skipped_pos_ctr = 0;
726 while(1) {
727 if( u_idx >= read_size || skipped_pos_ctr == u_off)
728 break;
729
730 if(up_seq[u_idx] == '[') {
731 u_idx += 4;
732 skipped_pos_ctr++;
733 } else {
734 u_idx++;
735 skipped_pos_ctr++;
736 }
737 }
738
739 if(read_nr == DEBUG_READ) {
740 printf("u_off is %d\n",u_off);
741 printf("u_idx is %d\n",u_idx);
742 }
743 u_off = u_idx;
744
745
746 if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
747 idx = -1;
748
749 if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
750 idx = -2;
751
752 if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
753 idx = -3;
754
755 while(1) {
756 if (elem_ctr == u_size)
757 break;
758
759 if(up_seq[u_off+idx] == '[') {
760
761 if(up_seq[u_off+idx+2] == '-') // del in read seq.
762 read_del_ctr++;
763
764 while(1) {
765 new_seq[new_idx] = up_seq[u_off+idx];
766 new_idx++;
767 idx++;
768 if(up_seq[u_off+idx-1] == ']')
769 break;
770 }
771 elem_ctr++;
772 continue;
773 }
774
775 new_seq[new_idx] = up_seq[u_off+idx];
776 idx++;
777 new_idx++;
778 elem_ctr++;
779 }
780
781 elem_ctr = 0;
782 idx = 0;
783
784 Tuple tuple;
785 tuple.first = new_idx;
786
787 int d_idx = 0;
788 skipped_pos_ctr = 0;
789 while(1) {
790 if( d_idx >= read_size || skipped_pos_ctr == d_off)
791 break;
792
793 if(down_seq[d_idx] == '[') {
794 d_idx += 4;
795 skipped_pos_ctr++;
796 } else {
797 d_idx++;
798 skipped_pos_ctr++;
799 }
800 }
801
802 if(read_nr == DEBUG_READ) {
803 printf("d_off is %d\n",d_off);
804 printf("d_idx is %d\n",d_idx);
805 }
806 d_off = d_idx;
807 //if(read_nr == DEBUG_READ) {
808 // printf("%s\n",new_seq);
809 // printf("%c\n",down_seq[d_off+idx]);
810 //}
811
812 if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
813 idx = -1;
814
815 if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
816 idx = -2;
817
818 if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
819 idx = -3;
820
821 while(1) {
822 if (elem_ctr == d_size)
823 break;
824
825 if(down_seq[d_off+idx] == '[') {
826
827 if(down_seq[d_off+idx+2] == '-') // del in read seq.
828 read_del_ctr++;
829
830 while(1) {
831 new_seq[new_idx] = down_seq[d_off+idx];
832 new_idx++;
833 idx++;
834 if(down_seq[d_off+idx-1] == ']')
835 break;
836 }
837 elem_ctr++;
838 continue;
839 }
840
841 new_seq[new_idx] = down_seq[d_off+idx];
842 idx++;
843 new_idx++;
844 elem_ctr++;
845 }
846
847 elem_ctr = 0;
848
849 #ifdef _FDEBUG_
850 if(read_nr == DEBUG_READ) {
851 printf("read_del_ctr %d\n",read_del_ctr);
852 printf("idx/new_idx: %d %d\n",idx,new_idx);
853 printf("next char: %c\n",down_seq[d_off+idx]);
854 printf("down_seq: %s\n",down_seq);
855 }
856 #endif // _FDEBUG_
857
858 while(1) {
859 if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
860 break;
861
862 if(down_seq[d_off+idx] == '[') {
863
864 while(1) {
865 new_seq[new_idx] = down_seq[d_off+idx];
866 new_idx++;
867 idx++;
868 if(down_seq[d_off+idx-1] == ']')
869 break;
870 }
871 elem_ctr++;
872 continue;
873 }
874
875 #ifdef _FDEBUG_
876 if(read_nr == DEBUG_READ)
877 printf("current char: %c\n",down_seq[d_off+idx]);
878 #endif // _FDEBUG_
879
880 new_seq[new_idx] = down_seq[d_off+idx];
881 new_idx++;
882 idx++;
883 elem_ctr++;
884 }
885
886 new_seq[new_idx+1] = '\0';
887
888 #ifdef _FDEBUG_
889 if(read_nr == DEBUG_READ)
890 printf("idx/new_idx: %d %d\n",idx,new_idx);
891 #endif // _FDEBUG_
892
893 tuple.second = read_del_ctr;
894 return tuple;
895 }
896
897 void print_read(Read* cRead) {
898
899 printf(line_format,
900 cRead->chr, cRead->pos, cRead->seq, cRead->id,
901 cRead->strand, cRead->mismatch, cRead->occurrence,
902 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
903 cRead->chastity);
904 }
905
906 /*
907 * TODO:
908 * - Check strand -> done simple (only if equal)
909 * - check for [AC] and similar entries -> done simple (see function
910 * - remove_ambiguities (exchanges [XY] by the second entry)
911 */
912
913 //if( read_nr == 13 || read_nr == 14 ) {
914 // printf("read nr %d",read_nr);
915 // printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
916 // printf("u/d range: %d %d\n",up_range,down_range);
917 // printf("u/d size: %d %d\n",u_size,d_size);
918 // printf("u/d off: %d %d\n",u_off,d_off);
919 // printf("******************\n");
920
921 // printf("%s\n",up_read->seq);
922 // printf("%s\n",down_read->seq);
923
924 // printf("******************\n");
925
926 // printf("%s\n",new_up_seq);
927 // printf("%s\n",new_down_seq);
928
929 // printf("******************\n");
930 // printf("%s\n",new_seq);
931 // printf("%s\n",new_prb);
932 // printf("%s\n",new_cal_prb);
933 // printf("%s\n",new_chastity);
934 //}
935
936 //// The four last integers specify the positions of
937 //// p_start : the position in the dna where the first read starts
938 //// exons_stop : the position in the dna where the first exons ends
939 //// exons_start : the position in the dna where the second exons starts
940 //// p_stop : the position in the dna where the (truncated) second read ends