+ fixed index of reads in filterReads
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 typedef struct tuple {
22 int first;
23 int second;
24 } Tuple;
25
26
27 int compare_gene_struct(struct gene* a, struct gene* b) {
28 return a->stop - b->start;
29 }
30
31 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
32 void sort_genes(struct gene*** allGenes, int numGenes);
33 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
34 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
35 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
36 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
37 void remove_ambiguities(char * old_seq, char* new_seq);
38 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
39
40 static char *info = "Usage is:\n./filterReads gff reads output";
41
42 /*
43 * Some constants specifying the exact behavior of the filter
44 *
45 */
46
47 #define _FDEBUG_ 0
48
49 #define DEBUG_READ 38603
50
51 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
52
53 const int read_size = 36;
54
55 const int min_overlap = 1;
56 const int max_overlap = 35;
57
58 unsigned long read_nr = 1;
59
60 int combined_reads = 0;
61
62
63 int main(int argc, char* argv[]) {
64
65 if(argc != 5) {
66 printf("%s\n",info);
67 exit(EXIT_FAILURE);
68 }
69
70 int status;
71 int filenameSize = 256;
72 char* gff_filename = malloc(sizeof(char)*filenameSize);
73 char* reads_filename = malloc(sizeof(char)*filenameSize);
74 char* output_filename = malloc(sizeof(char)*filenameSize);
75
76
77 strncpy(gff_filename,argv[1],filenameSize);
78 strncpy(reads_filename,argv[2],filenameSize);
79 strncpy(output_filename,argv[3],filenameSize);
80
81 FILE *gff_fs = fopen(gff_filename,"r");
82 FILE *reads_fs = fopen(reads_filename,"r");
83 FILE *out_fs = fopen(output_filename,"w");
84
85 read_nr = strtoul(argv[4],NULL,10);
86 read_nr++;
87
88 if(gff_fs == NULL) {
89 printf("Error: Could not open file: %s",gff_filename);
90 exit(EXIT_FAILURE);
91 }
92
93 if(reads_fs == NULL) {
94 printf("Error: Could not open file: %s",reads_filename);
95 exit(EXIT_FAILURE);
96 }
97
98 if(out_fs == NULL) {
99 printf("Error: Could not open file: %s",output_filename);
100 exit(EXIT_FAILURE);
101 }
102
103 struct gene** allGenes;
104 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
105 status = fclose(gff_fs);
106 free(gff_filename);
107 if(status != 0)
108 printf("closing of gff filestream failed!\n");
109
110 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
111
112 // check if allGenes is sorted. if not throw away those genes that do not
113 // occur in the sorted order
114 int g_idx;
115 struct gene* currentGene = 0;
116 int nulled_genes=0;
117 int old_gene_stop = -1;
118 for(g_idx=0;g_idx<numGenes;g_idx++) {
119 currentGene = allGenes[g_idx];
120
121 if (! (currentGene->start < currentGene->stop))
122 printf("Invalid positions for gene %s!\n",currentGene->id);
123
124 if (! (old_gene_stop < currentGene->start ) ) {
125 old_gene_stop = currentGene->stop;
126 allGenes[g_idx] = 0;
127 nulled_genes++;
128 continue;
129 }
130 old_gene_stop = currentGene->stop;
131 }
132
133 printf("Found %d unordered genes.\n",nulled_genes);
134 int gidx, eidx;
135 int exon_cov = 0;
136 for(gidx=0;gidx<numGenes;gidx++) {
137 if (allGenes[gidx] == 0)
138 continue;
139
140 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
141 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
142 }}
143 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
144
145 process_reads(reads_fs,&allGenes,numGenes,out_fs);
146
147 for(g_idx=0;g_idx<numGenes;g_idx++) {
148 if(allGenes[g_idx] != 0)
149 free_gene(allGenes[g_idx]);
150 free(allGenes[g_idx]);
151
152 }
153 free(allGenes);
154
155 status = fclose(reads_fs);
156 status = fclose(out_fs);
157 if(status != 0)
158 perror("fclose");
159
160 free(reads_filename);
161 free(output_filename);
162 return 0;
163 }
164
165 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
166 int status;
167
168 int buffer_size= 64;
169 int chr = 0;
170 int pos = 0;
171 char* seq = malloc(sizeof(char)*buffer_size);
172 unsigned long id = 0;
173 char strand = ' ';
174 int mismatch = 0;
175 int occurrence = 0;
176 int size = 0;
177 int cut = 0;
178 char* prb = malloc(sizeof(char)*buffer_size);
179 char* cal_prb = malloc(sizeof(char)*buffer_size);
180 char* chastity = malloc(sizeof(char)*buffer_size);
181
182 int reads_fid = fileno(reads_fs);
183 struct stat reads_stat;
184 if ( fstat(reads_fid,&reads_stat) == -1) {
185 perror("fstat");
186 exit(EXIT_FAILURE);
187 }
188 off_t reads_filesize = reads_stat.st_size;
189 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
190 int numReads = reads_filesize / 178.0;
191
192 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
193 if (reads_area == MAP_FAILED) {
194 perror("mmap");
195 exit(EXIT_FAILURE);
196 }
197 close(reads_fid);
198 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
199
200 char* lineBeginPtr = (char*) reads_area;
201 char* lineEndPtr = (char*) reads_area;
202 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
203
204 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
205 lineEndPtr++;
206
207 char* current_line = malloc(sizeof(char)*512);
208 memset(current_line,0,512);
209
210 int SIZE = 500;
211 // initialize boundary arrays
212 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
213 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
214 int uov,dov;
215 uov = dov = 0;
216
217 int skippedLinesCounter = 0;
218
219 int prev_exon_start = -1;
220 int prev_exon_stop = -1;
221 int cur_exon_start = -1;
222
223 unsigned long line_size = lineEndPtr - lineBeginPtr;
224 //printf("distance is %lu\n",line_size);
225 strncpy(current_line,lineBeginPtr,line_size);
226 current_line[line_size] = '\0';
227 //printf("%s test",current_line);
228
229 int gene_idx = 0;
230 int exon_idx = 1;
231 struct gene* currentGene = (*allGenes)[gene_idx];
232 char* gene_id = currentGene->id;
233
234 int skippedReadCtr = 0;
235 int uselessReadCtr = 0;
236 int exonicReadCtr = 0;
237 int endPrevCtr = 0;
238 int prevOverlapCtr = 0;
239 int currentStartCtr = 0;
240 int currentOverlapCtr = 0;
241 int multioccurReadCtr = 0;
242
243 Read* currentRead;
244 int up_idx, down_idx;
245
246 int readCtr = 0;
247 // start of the parsing loop
248 while(1) {
249 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
250 break;
251
252 gene_id = currentGene->id;
253
254 if (readCtr != 0 && readCtr % 1000000 == 0)
255 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
256
257 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
258 if (status < 12) {
259 skippedLinesCounter++;
260 goto next_read;
261 }
262
263 // if the read is occurring several times elsewhere then get rid of it
264 if ( occurrence != 1 ) {
265 multioccurReadCtr++;
266 goto next_read;
267 }
268
269 if( strand != 'D' )
270 goto next_read;
271
272 FA(strlen(seq) >= read_size);
273
274 FA(currentGene != 0);
275
276 if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is within gene borders
277
278 exon_label:
279
280 if (exon_idx == currentGene->num_exons) {
281 gene_idx++;
282 exon_idx = 1;
283 currentGene = (*allGenes)[gene_idx];
284 while( currentGene == 0 && gene_idx < numGenes) {
285 currentGene = (*allGenes)[gene_idx];
286 gene_idx++;
287 }
288 continue;
289 }
290
291 prev_exon_start = currentGene->exon_starts[exon_idx-1];
292 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
293 cur_exon_start = currentGene->exon_starts[exon_idx];
294
295 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
296
297 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
298
299 if (uov != 0 && dov != 0)
300 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
301
302 for(up_idx=0;up_idx<uov;up_idx++) {
303 free_read(upstream_overlap[up_idx]);
304 free(upstream_overlap[up_idx]);
305 }
306
307 for(down_idx=0;down_idx<dov;down_idx++) {
308 free_read(downstream_overlap[down_idx]);
309 free(downstream_overlap[down_idx]);
310 }
311
312 uov = dov = 0;
313
314 exon_idx++;
315 goto exon_label;
316 }
317
318 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
319 exonicReadCtr++;
320 goto next_read;
321 }
322
323 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
324 goto next_read;
325
326 // if this position is reached the read is somehow overlapping or
327 // exactly on the exon boundary.
328 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap ) { // read overlaps with previous exon boundary
329 //printf("%s\n",current_line);
330 prevOverlapCtr++;
331 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
332 upstream_overlap[uov] = currentRead;
333 uov++;
334 goto next_read;
335 }
336
337 int end_pos = pos+read_size-1;
338 if ( (end_pos - cur_exon_start + 1) >= min_overlap && (end_pos - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
339 //printf("%s\n",current_line);
340 currentOverlapCtr++;
341 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
342 downstream_overlap[dov] = currentRead;
343 dov++;
344 goto next_read;
345 }
346
347 uselessReadCtr++;
348 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
349
350 } else { // read is not within gene borders
351
352 if (uov != 0 && dov != 0)
353 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
354
355 for(up_idx=0;up_idx<uov;up_idx++) {
356 free_read(upstream_overlap[up_idx]);
357 free(upstream_overlap[up_idx]);
358 }
359
360 for(down_idx=0;down_idx<dov;down_idx++) {
361 free_read(downstream_overlap[down_idx]);
362 free(downstream_overlap[down_idx]);
363 }
364
365 uov = dov = 0;
366
367 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
368 gene_idx++;
369 exon_idx = 1;
370 currentGene = (*allGenes)[gene_idx];
371 while( currentGene == 0 && gene_idx < numGenes) {
372 currentGene = (*allGenes)[gene_idx];
373 gene_idx++;
374 }
375 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
376 continue;
377 }
378
379 if ( pos < currentGene->start ) { // go to next read
380 skippedReadCtr++;
381
382 next_read:
383
384 lineBeginPtr = lineEndPtr;
385 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
386 lineEndPtr++;
387 readCtr += 1;
388 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
389 current_line[lineEndPtr-lineBeginPtr] = '\0';
390 continue;
391 }
392 }
393 }
394
395 if (uov != 0 && dov != 0)
396 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
397
398 for(up_idx=0;up_idx<uov;up_idx++) {
399 free_read(upstream_overlap[up_idx]);
400 free(upstream_overlap[up_idx]);
401 }
402
403 for(down_idx=0;down_idx<dov;down_idx++) {
404 free_read(downstream_overlap[down_idx]);
405 free(downstream_overlap[down_idx]);
406 }
407
408 uov = dov = 0;
409
410 free(upstream_overlap);
411 free(downstream_overlap);
412
413 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
414 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
415 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
416 printf("%d reads were totally exonic\n",exonicReadCtr);
417 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
418 printf("%d reads where newly combined from two original reads\n",combined_reads);
419 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
420
421 status = munmap(reads_area,reads_filesize);
422 if(status != 0)
423 perror("munmap");
424
425 free(current_line);
426 free(seq);
427 free(prb);
428 free(cal_prb);
429 free(chastity);
430 }
431
432 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
433
434 //printf("up/down size is %d/%d\n",up_size,down_size);
435
436 int up_idx, down_idx, success;
437
438 char* up_used_flag = calloc(up_size,sizeof(char));
439 char* down_used_flag = calloc(down_size,sizeof(char));
440
441 Read* currentUpRead;
442 Read* currentDownRead;
443
444 for(up_idx=0;up_idx<up_size;up_idx++) {
445 if( up_used_flag[up_idx] == 1)
446 continue;
447
448 currentUpRead = upstream[up_idx];
449
450 for(down_idx=0;down_idx<down_size;down_idx++) {
451
452 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
453 continue;
454
455 currentDownRead = downstream[down_idx];
456
457 if(currentUpRead->strand != currentDownRead->strand)
458 continue;
459
460 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
461
462 if(success == 1) {
463 up_used_flag[up_idx] = 1;
464 down_used_flag[down_idx] = 1;
465 }
466
467 }
468 }
469
470 free(up_used_flag);
471 free(down_used_flag);
472 }
473
474 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
475 // range of possible sequence length on exon side
476 int up_range = exon_stop - up_read->pos;
477 int down_range = down_read->pos+(read_size-1) - exon_start+1;
478 int retval;
479
480 int u_size, d_size;
481 u_size = d_size = -1;
482
483 if(up_range+down_range < read_size)
484 return 0;
485
486 //if(up_range < down_range) {
487 if (read_nr % 2 != 0) {
488 d_size = down_range;
489 u_size = read_size - d_size;
490 }
491
492 //if(up_range >= down_range) {
493 if (read_nr % 2 == 0) {
494 u_size = up_range;
495 d_size = read_size - u_size;
496 }
497
498 const int p_start = exon_stop - u_size;
499 const int p_stop = exon_start + d_size - 1;
500
501 const int u_off = p_start - up_read->pos;
502 const int d_off = exon_start - down_read->pos;
503
504 FA(u_size != -1);
505 FA(d_size != -1);
506 FA(u_size + d_size == read_size);
507
508 int buf_size = 4*read_size;
509 char* new_up_seq = malloc(sizeof(char)*buf_size);
510 char* new_down_seq = malloc(sizeof(char)*buf_size);
511 char* new_seq = malloc(sizeof(char)*buf_size);
512
513 memset(new_up_seq,0,sizeof(char)*buf_size);
514 memset(new_down_seq,0,sizeof(char)*buf_size);
515 memset(new_seq,0,sizeof(char)*buf_size);
516
517 strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
518 strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
519
520 Tuple jinfo = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
521 int cut_pos = jinfo.first;
522 int additional_pos = jinfo.second;
523
524 buf_size = read_size+1+additional_pos;
525
526 char* new_prb = malloc(sizeof(char)*buf_size);
527 char* new_cal_prb = malloc(sizeof(char)*buf_size);
528 char* new_chastity = malloc(sizeof(char)*buf_size);
529
530 if( additional_pos > (down_range - d_size) ) {
531 retval = 0;
532 goto free;
533 }
534
535 strncpy(new_prb, up_read->prb+u_off, u_size);
536 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
537 new_prb[read_size+additional_pos] = '\0';
538
539 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
540 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
541 new_cal_prb[read_size+additional_pos] = '\0';
542
543 strncpy(new_chastity, up_read->chastity+u_off, u_size);
544 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
545 new_chastity[read_size+additional_pos] = '\0';
546
547 #ifdef _FDEBUG_
548 if(read_nr == DEBUG_READ) {
549 printf("read nr %lu",read_nr);
550 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
551 printf("u/d range: %d %d\n",up_range,down_range);
552 printf("u/d size: %d %d\n",u_size,d_size);
553 printf("u/d off: %d %d\n",u_off,d_off);
554 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
555
556 printf("******************\n");
557
558 printf("%s\n",up_read->seq);
559 printf("%s\n",new_up_seq);
560
561 printf("******************\n");
562
563 printf("%s\n",down_read->seq);
564 printf("%s\n",new_down_seq);
565
566 printf("******************\n");
567 printf("%s\n",new_seq);
568 printf("%s\n",new_prb);
569 printf("%s\n",new_cal_prb);
570 printf("%s\n",new_chastity);
571 }
572 #endif // _FDEBUG_
573
574 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
575
576 if(status != 1) {
577 retval = 0;
578 goto free;
579 }
580
581 retval = status;
582
583 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
584 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
585
586 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
587 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
588
589 read_nr++;
590 combined_reads++;
591
592 free:
593 free(new_up_seq);
594 free(new_down_seq);
595
596 free(new_seq);
597 free(new_prb);
598 free(new_cal_prb);
599 free(new_chastity);
600
601 return retval;
602 }
603
604
605 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
606 int new_idx, idx;
607 new_idx = idx = 0;
608 int elem_ctr = 0;
609 int read_del_ctr = 0;
610
611 int u_idx = 0;
612 int skipped_pos_ctr = 0;
613 while(1) {
614 if( u_idx >= read_size || skipped_pos_ctr == u_off)
615 break;
616
617 if(up_seq[u_idx] == '[') {
618 u_idx += 4;
619 skipped_pos_ctr++;
620 } else {
621 u_idx++;
622 skipped_pos_ctr++;
623 }
624 }
625
626 if(read_nr == DEBUG_READ) {
627 printf("u_off is %d\n",u_off);
628 printf("u_idx is %d\n",u_idx);
629 }
630 u_off = u_idx+1;
631
632
633 if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
634 idx = -1;
635
636 if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
637 idx = -2;
638
639 if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
640 idx = -3;
641
642 while(1) {
643 if (elem_ctr == u_size)
644 break;
645
646 if(up_seq[u_off+idx] == '[') {
647
648 if(up_seq[u_off+idx+2] == '-') // del in read seq.
649 read_del_ctr++;
650
651 while(1) {
652 new_seq[new_idx] = up_seq[u_off+idx];
653 new_idx++;
654 idx++;
655 if(up_seq[u_off+idx-1] == ']')
656 break;
657 }
658 elem_ctr++;
659 continue;
660 }
661
662 new_seq[new_idx] = up_seq[u_off+idx];
663 idx++;
664 new_idx++;
665 elem_ctr++;
666 }
667
668 elem_ctr = 0;
669 idx = 0;
670
671 Tuple tuple;
672 tuple.first = new_idx;
673
674 int d_idx = 0;
675 skipped_pos_ctr = 0;
676 while(1) {
677 if( d_idx >= read_size || skipped_pos_ctr == d_off)
678 break;
679
680 if(down_seq[d_idx] == '[') {
681 d_idx += 4;
682 skipped_pos_ctr++;
683 } else {
684 d_idx++;
685 skipped_pos_ctr++;
686 }
687 }
688
689 if(read_nr == DEBUG_READ) {
690 printf("d_off is %d\n",d_off);
691 printf("d_idx is %d\n",d_idx);
692 }
693 d_off = d_idx+1;
694 //if(read_nr == DEBUG_READ) {
695 // printf("%s\n",new_seq);
696 // printf("%c\n",down_seq[d_off+idx]);
697 //}
698
699 if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
700 idx = -1;
701
702 if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
703 idx = -2;
704
705 if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
706 idx = -3;
707
708 while(1) {
709 if (elem_ctr == d_size)
710 break;
711
712 if(down_seq[d_off+idx] == '[') {
713
714 if(down_seq[d_off+idx+2] == '-') // del in read seq.
715 read_del_ctr++;
716
717 while(1) {
718 new_seq[new_idx] = down_seq[d_off+idx];
719 new_idx++;
720 idx++;
721 if(down_seq[d_off+idx-1] == ']')
722 break;
723 }
724 elem_ctr++;
725 continue;
726 }
727
728 new_seq[new_idx] = down_seq[d_off+idx];
729 idx++;
730 new_idx++;
731 elem_ctr++;
732 }
733
734 elem_ctr = 0;
735
736 #ifdef _FDEBUG_
737 if(read_nr == DEBUG_READ) {
738 printf("read_del_ctr %d\n",read_del_ctr);
739 printf("idx/new_idx: %d %d\n",idx,new_idx);
740 printf("next char: %c\n",down_seq[d_off+idx]);
741 printf("down_seq: %s\n",down_seq);
742 }
743 #endif // _FDEBUG_
744
745 while(1) {
746 if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
747 break;
748
749 if(down_seq[d_off+idx] == '[') {
750
751 while(1) {
752 new_seq[new_idx] = down_seq[d_off+idx];
753 new_idx++;
754 idx++;
755 if(down_seq[d_off+idx-1] == ']')
756 break;
757 }
758 elem_ctr++;
759 continue;
760 }
761
762 #ifdef _FDEBUG_
763 if(read_nr == DEBUG_READ)
764 printf("current char: %c\n",down_seq[d_off+idx]);
765 #endif // _FDEBUG_
766
767 new_seq[new_idx] = down_seq[d_off+idx];
768 new_idx++;
769 idx++;
770 elem_ctr++;
771 }
772
773 new_seq[new_idx+1] = '\0';
774
775 #ifdef _FDEBUG_
776 if(read_nr == DEBUG_READ)
777 printf("idx/new_idx: %d %d\n",idx,new_idx);
778 #endif // _FDEBUG_
779
780 tuple.second = read_del_ctr;
781 return tuple;
782 }
783
784 void print_read(Read* cRead) {
785
786 printf(line_format,
787 cRead->chr, cRead->pos, cRead->seq, cRead->id,
788 cRead->strand, cRead->mismatch, cRead->occurrence,
789 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
790 cRead->chastity);
791 }
792
793 /*
794 * TODO:
795 * - Check strand -> done simple (only if equal)
796 * - check for [AC] and similar entries -> done simple (see function
797 * - remove_ambiguities (exchanges [XY] by the second entry)
798 */