+ changed filter behavior (tendency towards smaller read fragments)
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 typedef struct tuple {
22 int first;
23 int second;
24 } Tuple;
25
26
27 int compare_gene_struct(struct gene* a, struct gene* b) {
28 return a->stop - b->start;
29 }
30
31 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
32 void sort_genes(struct gene*** allGenes, int numGenes);
33 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
34 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
35 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
36 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
37 void remove_ambiguities(char * old_seq, char* new_seq);
38 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
39
40 static char *info = "Usage is:\n./filterReads gff reads output";
41
42 /*
43 * Some constants specifying the exact behavior of the filter
44 *
45 */
46
47 #define _FDEBUG_ 1
48
49 #define DEBUG_READ 38603
50
51 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
52
53 const int read_size = 36;
54
55 const int min_overlap = 1;
56 const int max_overlap = 35;
57
58 int read_nr = 1;
59
60 int combined_reads = 0;
61
62 int main(int argc, char* argv[]) {
63
64 if(argc != 5) {
65 printf("%s\n",info);
66 exit(EXIT_FAILURE);
67 }
68
69 int status;
70 int filenameSize = 256;
71 char* gff_filename = malloc(sizeof(char)*filenameSize);
72 char* reads_filename = malloc(sizeof(char)*filenameSize);
73 char* output_filename = malloc(sizeof(char)*filenameSize);
74
75
76 strncpy(gff_filename,argv[1],filenameSize);
77 strncpy(reads_filename,argv[2],filenameSize);
78 strncpy(output_filename,argv[3],filenameSize);
79
80 FILE *gff_fs = fopen(gff_filename,"r");
81 FILE *reads_fs = fopen(reads_filename,"r");
82 FILE *out_fs = fopen(output_filename,"w");
83
84 read_nr = atoi(argv[4]);
85 read_nr++;
86
87 if(gff_fs == NULL) {
88 printf("Error: Could not open file: %s",gff_filename);
89 exit(EXIT_FAILURE);
90 }
91
92 if(reads_fs == NULL) {
93 printf("Error: Could not open file: %s",reads_filename);
94 exit(EXIT_FAILURE);
95 }
96
97 if(out_fs == NULL) {
98 printf("Error: Could not open file: %s",output_filename);
99 exit(EXIT_FAILURE);
100 }
101
102 struct gene** allGenes;
103 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
104 status = fclose(gff_fs);
105 free(gff_filename);
106 if(status != 0)
107 printf("closing of gff filestream failed!\n");
108
109 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
110
111 // check if allGenes is sorted. if not throw away those genes that do not
112 // occur in the sorted order
113 int g_idx;
114 struct gene* currentGene = 0;
115 int nulled_genes=0;
116 int old_gene_stop = -1;
117 for(g_idx=0;g_idx<numGenes;g_idx++) {
118 currentGene = allGenes[g_idx];
119
120 if (! (currentGene->start < currentGene->stop))
121 printf("Invalid positions for gene %s!\n",currentGene->id);
122
123 if (! (old_gene_stop < currentGene->start ) ) {
124 old_gene_stop = currentGene->stop;
125 allGenes[g_idx] = 0;
126 nulled_genes++;
127 continue;
128 }
129 old_gene_stop = currentGene->stop;
130 }
131
132 printf("Found %d unordered genes.\n",nulled_genes);
133 int gidx, eidx;
134 int exon_cov = 0;
135 for(gidx=0;gidx<numGenes;gidx++) {
136 if (allGenes[gidx] == 0)
137 continue;
138
139 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
140 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
141 }}
142 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
143
144 process_reads(reads_fs,&allGenes,numGenes,out_fs);
145
146 for(g_idx=0;g_idx<numGenes;g_idx++) {
147 if(allGenes[g_idx] != 0)
148 free_gene(allGenes[g_idx]);
149 free(allGenes[g_idx]);
150
151 }
152 free(allGenes);
153
154 status = fclose(reads_fs);
155 status = fclose(out_fs);
156 if(status != 0)
157 perror("fclose");
158
159 free(reads_filename);
160 free(output_filename);
161 return 0;
162 }
163
164 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
165 int status;
166
167 int buffer_size= 64;
168 int chr = 0;
169 int pos = 0;
170 char* seq = malloc(sizeof(char)*buffer_size);
171 unsigned long id = 0;
172 char strand = ' ';
173 int mismatch = 0;
174 int occurrence = 0;
175 int size = 0;
176 int cut = 0;
177 char* prb = malloc(sizeof(char)*buffer_size);
178 char* cal_prb = malloc(sizeof(char)*buffer_size);
179 char* chastity = malloc(sizeof(char)*buffer_size);
180
181 int reads_fid = fileno(reads_fs);
182 struct stat reads_stat;
183 if ( fstat(reads_fid,&reads_stat) == -1) {
184 perror("fstat");
185 exit(EXIT_FAILURE);
186 }
187 off_t reads_filesize = reads_stat.st_size;
188 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
189 int numReads = reads_filesize / 178.0;
190
191 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
192 if (reads_area == MAP_FAILED) {
193 perror("mmap");
194 exit(EXIT_FAILURE);
195 }
196 close(reads_fid);
197 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
198
199 char* lineBeginPtr = (char*) reads_area;
200 char* lineEndPtr = (char*) reads_area;
201 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
202
203 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
204 lineEndPtr++;
205
206 char* current_line = malloc(sizeof(char)*512);
207 memset(current_line,0,512);
208
209 int SIZE = 500;
210 // initialize boundary arrays
211 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
212 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
213 int uov,dov;
214 uov = dov = 0;
215
216 int skippedLinesCounter = 0;
217
218 int prev_exon_start = -1;
219 int prev_exon_stop = -1;
220 int cur_exon_start = -1;
221
222 unsigned long line_size = lineEndPtr - lineBeginPtr;
223 //printf("distance is %lu\n",line_size);
224 strncpy(current_line,lineBeginPtr,line_size);
225 current_line[line_size] = '\0';
226 //printf("%s test",current_line);
227
228 int gene_idx = 0;
229 int exon_idx = 1;
230 struct gene* currentGene = (*allGenes)[gene_idx];
231 char* gene_id = currentGene->id;
232
233 int skippedReadCtr = 0;
234 int uselessReadCtr = 0;
235 int exonicReadCtr = 0;
236 int endPrevCtr = 0;
237 int prevOverlapCtr = 0;
238 int currentStartCtr = 0;
239 int currentOverlapCtr = 0;
240 int multioccurReadCtr = 0;
241
242 Read* currentRead;
243 int up_idx, down_idx;
244
245 int readCtr = 0;
246 // start of the parsing loop
247 while(1) {
248 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
249 break;
250
251 gene_id = currentGene->id;
252
253 if (readCtr != 0 && readCtr % 1000000 == 0)
254 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
255
256 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
257 if (status < 12) {
258 skippedLinesCounter++;
259 goto next_read;
260 }
261
262 // if the read is occurring several times elsewhere then get rid of it
263 if ( occurrence != 1 ) {
264 multioccurReadCtr++;
265 goto next_read;
266 }
267
268 FA(strlen(seq) >= read_size);
269
270 FA(currentGene != 0);
271
272 if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is within gene borders
273
274 exon_label:
275
276 if (exon_idx == currentGene->num_exons) {
277 gene_idx++;
278 exon_idx = 1;
279 currentGene = (*allGenes)[gene_idx];
280 while( currentGene == 0 && gene_idx < numGenes) {
281 currentGene = (*allGenes)[gene_idx];
282 gene_idx++;
283 }
284 continue;
285 }
286
287 prev_exon_start = currentGene->exon_starts[exon_idx-1];
288 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
289 cur_exon_start = currentGene->exon_starts[exon_idx];
290
291 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
292
293 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
294
295 if (uov != 0 && dov != 0)
296 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
297
298 for(up_idx=0;up_idx<uov;up_idx++) {
299 free_read(upstream_overlap[up_idx]);
300 free(upstream_overlap[up_idx]);
301 }
302
303 for(down_idx=0;down_idx<dov;down_idx++) {
304 free_read(downstream_overlap[down_idx]);
305 free(downstream_overlap[down_idx]);
306 }
307
308 uov = dov = 0;
309
310 exon_idx++;
311 goto exon_label;
312 }
313
314 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
315 exonicReadCtr++;
316 goto next_read;
317 }
318
319 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
320 goto next_read;
321
322 // if this position is reached the read is somehow overlapping or
323 // exactly on the exon boundary.
324 if ( (prev_exon_stop - pos + 1) >= min_overlap && (prev_exon_stop - pos + 1) <= max_overlap ) { // read overlaps with previous exon boundary
325 //printf("%s\n",current_line);
326 prevOverlapCtr++;
327 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
328 upstream_overlap[uov] = currentRead;
329 uov++;
330 goto next_read;
331 }
332
333 int end_pos = pos+read_size-1;
334 if ( (end_pos - cur_exon_start + 1) >= min_overlap && (end_pos - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
335 //printf("%s\n",current_line);
336 currentOverlapCtr++;
337 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
338 downstream_overlap[dov] = currentRead;
339 dov++;
340 goto next_read;
341 }
342
343 uselessReadCtr++;
344 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
345
346 } else { // read is not within gene borders
347
348 if (uov != 0 && dov != 0)
349 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
350
351 for(up_idx=0;up_idx<uov;up_idx++) {
352 free_read(upstream_overlap[up_idx]);
353 free(upstream_overlap[up_idx]);
354 }
355
356 for(down_idx=0;down_idx<dov;down_idx++) {
357 free_read(downstream_overlap[down_idx]);
358 free(downstream_overlap[down_idx]);
359 }
360
361 uov = dov = 0;
362
363 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
364 gene_idx++;
365 exon_idx = 1;
366 currentGene = (*allGenes)[gene_idx];
367 while( currentGene == 0 && gene_idx < numGenes) {
368 currentGene = (*allGenes)[gene_idx];
369 gene_idx++;
370 }
371 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
372 continue;
373 }
374
375 if ( pos < currentGene->start ) { // go to next read
376 skippedReadCtr++;
377
378 next_read:
379
380 lineBeginPtr = lineEndPtr;
381 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
382 lineEndPtr++;
383 readCtr += 1;
384 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
385 current_line[lineEndPtr-lineBeginPtr] = '\0';
386 continue;
387 }
388 }
389 }
390
391 if (uov != 0 && dov != 0)
392 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
393
394 for(up_idx=0;up_idx<uov;up_idx++) {
395 free_read(upstream_overlap[up_idx]);
396 free(upstream_overlap[up_idx]);
397 }
398
399 for(down_idx=0;down_idx<dov;down_idx++) {
400 free_read(downstream_overlap[down_idx]);
401 free(downstream_overlap[down_idx]);
402 }
403
404 uov = dov = 0;
405
406 free(upstream_overlap);
407 free(downstream_overlap);
408
409 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
410 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
411 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
412 printf("%d reads were totally exonic\n",exonicReadCtr);
413 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
414 printf("%d reads where newly combined from two original reads\n",combined_reads);
415 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
416
417 status = munmap(reads_area,reads_filesize);
418 if(status != 0)
419 perror("munmap");
420
421 free(current_line);
422 free(seq);
423 free(prb);
424 free(cal_prb);
425 free(chastity);
426 }
427
428 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
429
430 //printf("up/down size is %d/%d\n",up_size,down_size);
431
432 int up_idx, down_idx, success;
433
434 char* up_used_flag = calloc(up_size,sizeof(char));
435 char* down_used_flag = calloc(down_size,sizeof(char));
436
437 Read* currentUpRead;
438 Read* currentDownRead;
439
440 for(up_idx=0;up_idx<up_size;up_idx++) {
441 if( up_used_flag[up_idx] == 1)
442 continue;
443
444 currentUpRead = upstream[up_idx];
445
446 for(down_idx=0;down_idx<down_size;down_idx++) {
447
448 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
449 continue;
450
451 currentDownRead = downstream[down_idx];
452
453 if(currentUpRead->strand != currentDownRead->strand)
454 continue;
455
456 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
457
458 if(success == 1) {
459 up_used_flag[up_idx] = 1;
460 down_used_flag[down_idx] = 1;
461 }
462
463 }
464 }
465
466 free(up_used_flag);
467 free(down_used_flag);
468 }
469
470 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
471 // range of possible sequence length on exon side
472 int up_range = exon_stop - up_read->pos+1;
473 int down_range = down_read->pos+(read_size-1) - exon_start+1;
474 int retval;
475
476 int u_off, d_off, u_size, d_size;
477 u_size = d_size = -1;
478
479 //printf("possible range %d %d\n",up_range,down_range);
480
481 if(up_range+down_range < read_size)
482 return 0;
483
484 int half_read = read_size / 2;
485
486 // both overlap more than a half of the new read
487 if(up_range >= half_read && down_range >= half_read) {
488
489 u_size = half_read;
490 d_size = half_read;
491
492 } else {
493
494 //if(up_range < down_range) {
495 // u_size = up_range;
496 // d_size = read_size - u_size;
497 //}
498
499 //if(up_range > down_range) {
500 // d_size = down_range;
501 // u_size = read_size - d_size;
502 //}
503
504 if(up_range < down_range) {
505 d_size = down_range;
506 u_size = read_size - d_size;
507 }
508
509 if(up_range > down_range) {
510 u_size = up_range;
511 d_size = read_size - u_size;
512 }
513 }
514
515 int p_start = exon_stop - u_size + 1;
516 int p_stop = exon_start + d_size - 1;
517
518 u_off = p_start - up_read->pos;
519 d_off = exon_start - down_read->pos;
520
521 FA(u_size != -1);
522 FA(d_size != -1);
523 FA(u_size + d_size == read_size);
524
525 int buf_size = 4*read_size;
526 char* new_up_seq = malloc(sizeof(char)*buf_size);
527 char* new_down_seq = malloc(sizeof(char)*buf_size);
528 char* new_seq = malloc(sizeof(char)*buf_size);
529
530 memset(new_up_seq,0,sizeof(char)*buf_size);
531 memset(new_down_seq,0,sizeof(char)*buf_size);
532 memset(new_seq,0,sizeof(char)*buf_size);
533
534 //remove_ambiguities(up_read->seq,new_up_seq);
535 //remove_ambiguities(down_read->seq,new_down_seq);
536 //
537 strncpy(new_up_seq,up_read->seq,strlen(up_read->seq));
538 strncpy(new_down_seq,down_read->seq,strlen(down_read->seq));
539
540 Tuple jinfo = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
541 int cut_pos = jinfo.first;
542 int additional_pos = jinfo.second;
543
544 buf_size = read_size+1+additional_pos;
545
546 char* new_prb = malloc(sizeof(char)*buf_size);
547 char* new_cal_prb = malloc(sizeof(char)*buf_size);
548 char* new_chastity = malloc(sizeof(char)*buf_size);
549
550 if( additional_pos > (down_range - d_size) ) {
551 retval = 0;
552 goto free;
553 }
554
555 strncpy(new_prb, up_read->prb+u_off, u_size);
556 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
557 new_prb[read_size+additional_pos] = '\0';
558
559 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
560 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
561 new_cal_prb[read_size+additional_pos] = '\0';
562
563 strncpy(new_chastity, up_read->chastity+u_off, u_size);
564 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
565 new_chastity[read_size+additional_pos] = '\0';
566
567 #ifdef _FDEBUG_
568 if(read_nr == DEBUG_READ) {
569 printf("read nr %d",read_nr);
570 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
571 printf("u/d range: %d %d\n",up_range,down_range);
572 printf("u/d size: %d %d\n",u_size,d_size);
573 printf("u/d off: %d %d\n",u_off,d_off);
574 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
575
576 printf("******************\n");
577
578 printf("%s\n",up_read->seq);
579 printf("%s\n",new_up_seq);
580
581 printf("******************\n");
582
583 printf("%s\n",down_read->seq);
584 printf("%s\n",new_down_seq);
585
586 printf("******************\n");
587 printf("%s\n",new_seq);
588 printf("%s\n",new_prb);
589 printf("%s\n",new_cal_prb);
590 printf("%s\n",new_chastity);
591 }
592 #endif // _FDEBUG_
593
594 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
595
596 if(status != 1) {
597 retval = 0;
598 goto free;
599 }
600
601 retval = status;
602
603 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
604 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
605
606 read_nr++;
607 combined_reads++;
608
609 free:
610 free(new_up_seq);
611 free(new_down_seq);
612
613 free(new_seq);
614 free(new_prb);
615 free(new_cal_prb);
616 free(new_chastity);
617
618 return retval;
619 }
620
621 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
622 double current_mean_up = 0;
623 double current_mean_down = 0;
624
625 int w_size = 2;
626
627 int idx;
628
629 //printf("prb %s\n",up_prb);
630 //printf("prb %s\n",down_prb);
631
632 for(idx=0;idx<w_size;idx++) {
633 current_mean_up += up_prb[u_off+u_size+idx]-50;
634 current_mean_up += up_prb[u_off+u_size-idx]-50;
635 current_mean_up -= up_prb[idx]-50;
636
637 current_mean_down += up_prb[d_off+idx]-50;
638 current_mean_down += up_prb[d_off+idx]-50;
639 current_mean_down -= up_prb[idx]-50;
640 }
641
642 current_mean_up /= (2*w_size+1);
643 current_mean_down /= (2*w_size+1);
644
645 double ratio;
646 if( current_mean_up > current_mean_down)
647 ratio = current_mean_down / current_mean_up;
648 else
649 ratio = current_mean_up / current_mean_down;
650
651 //printf("ratio is %f\n",ratio);
652
653 if (ratio < 0.35)
654 return 0;
655
656 return 1;
657 }
658
659 /*
660 * As we can have 3 kinds of ambiguities
661 *
662 * [AG]
663 * [A-]
664 * [-A]
665 *
666 * we want to remove/store these ambiguities in the filtered reads
667 */
668
669 void remove_ambiguities(char * old_seq, char* new_seq) {
670
671 FA(0);
672
673 int idx=0;
674 int new_idx = 0;
675 int old_seq_size = strlen(old_seq);
676
677 while(idx<old_seq_size) {
678 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
679 if (old_seq[idx] == ']') {
680 idx++;
681 continue;
682 }
683
684 if (old_seq[idx] == '[') {
685
686 // we have a indel either on the dna or read sequence
687 if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
688
689
690 while(1) {
691 new_seq[new_idx] = old_seq[idx];
692 if(old_seq[idx] == ']')
693 break;
694 new_idx++;
695 idx++;
696 }
697
698 } else {
699 idx += 2;
700 continue;
701 }
702 }
703
704 new_seq[new_idx] = old_seq[idx];
705 idx++;
706 new_idx++;
707 }
708
709 new_seq[new_idx] = '\0';
710 //printf("new_seq is %d :: %s\n",new_idx,new_seq);
711 }
712
713
714 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
715 int new_idx, idx;
716 new_idx = idx = 0;
717 int elem_ctr = 0;
718 int read_del_ctr = 0;
719
720 int u_idx = 0;
721 int skipped_pos_ctr = 0;
722 while(1) {
723 if( u_idx >= read_size || skipped_pos_ctr == u_off)
724 break;
725
726 if(up_seq[u_idx] == '[') {
727 u_idx += 4;
728 skipped_pos_ctr++;
729 } else {
730 u_idx++;
731 skipped_pos_ctr++;
732 }
733 }
734
735 if(read_nr == DEBUG_READ) {
736 printf("u_off is %d\n",u_off);
737 printf("u_idx is %d\n",u_idx);
738 }
739 u_off = u_idx;
740
741
742 if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
743 idx = -1;
744
745 if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
746 idx = -2;
747
748 if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
749 idx = -3;
750
751 while(1) {
752 if (elem_ctr == u_size)
753 break;
754
755 if(up_seq[u_off+idx] == '[') {
756
757 if(up_seq[u_off+idx+2] == '-') // del in read seq.
758 read_del_ctr++;
759
760 while(1) {
761 new_seq[new_idx] = up_seq[u_off+idx];
762 new_idx++;
763 idx++;
764 if(up_seq[u_off+idx-1] == ']')
765 break;
766 }
767 elem_ctr++;
768 continue;
769 }
770
771 new_seq[new_idx] = up_seq[u_off+idx];
772 idx++;
773 new_idx++;
774 elem_ctr++;
775 }
776
777 elem_ctr = 0;
778 idx = 0;
779
780 Tuple tuple;
781 tuple.first = new_idx;
782
783 int d_idx = 0;
784 skipped_pos_ctr = 0;
785 while(1) {
786 if( d_idx >= read_size || skipped_pos_ctr == d_off)
787 break;
788
789 if(down_seq[d_idx] == '[') {
790 d_idx += 4;
791 skipped_pos_ctr++;
792 } else {
793 d_idx++;
794 skipped_pos_ctr++;
795 }
796 }
797
798 if(read_nr == DEBUG_READ) {
799 printf("d_off is %d\n",d_off);
800 printf("d_idx is %d\n",d_idx);
801 }
802 d_off = d_idx;
803 //if(read_nr == DEBUG_READ) {
804 // printf("%s\n",new_seq);
805 // printf("%c\n",down_seq[d_off+idx]);
806 //}
807
808 if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
809 idx = -1;
810
811 if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
812 idx = -2;
813
814 if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
815 idx = -3;
816
817 while(1) {
818 if (elem_ctr == d_size)
819 break;
820
821 if(down_seq[d_off+idx] == '[') {
822
823 if(down_seq[d_off+idx+2] == '-') // del in read seq.
824 read_del_ctr++;
825
826 while(1) {
827 new_seq[new_idx] = down_seq[d_off+idx];
828 new_idx++;
829 idx++;
830 if(down_seq[d_off+idx-1] == ']')
831 break;
832 }
833 elem_ctr++;
834 continue;
835 }
836
837 new_seq[new_idx] = down_seq[d_off+idx];
838 idx++;
839 new_idx++;
840 elem_ctr++;
841 }
842
843 elem_ctr = 0;
844
845 #ifdef _FDEBUG_
846 if(read_nr == DEBUG_READ) {
847 printf("read_del_ctr %d\n",read_del_ctr);
848 printf("idx/new_idx: %d %d\n",idx,new_idx);
849 printf("next char: %c\n",down_seq[d_off+idx]);
850 printf("down_seq: %s\n",down_seq);
851 }
852 #endif // _FDEBUG_
853
854 while(1) {
855 if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
856 break;
857
858 if(down_seq[d_off+idx] == '[') {
859
860 while(1) {
861 new_seq[new_idx] = down_seq[d_off+idx];
862 new_idx++;
863 idx++;
864 if(down_seq[d_off+idx-1] == ']')
865 break;
866 }
867 elem_ctr++;
868 continue;
869 }
870
871 #ifdef _FDEBUG_
872 if(read_nr == DEBUG_READ)
873 printf("current char: %c\n",down_seq[d_off+idx]);
874 #endif // _FDEBUG_
875
876 new_seq[new_idx] = down_seq[d_off+idx];
877 new_idx++;
878 idx++;
879 elem_ctr++;
880 }
881
882 new_seq[new_idx+1] = '\0';
883
884 #ifdef _FDEBUG_
885 if(read_nr == DEBUG_READ)
886 printf("idx/new_idx: %d %d\n",idx,new_idx);
887 #endif // _FDEBUG_
888
889 tuple.second = read_del_ctr;
890 return tuple;
891 }
892
893 void print_read(Read* cRead) {
894
895 printf(line_format,
896 cRead->chr, cRead->pos, cRead->seq, cRead->id,
897 cRead->strand, cRead->mismatch, cRead->occurrence,
898 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
899 cRead->chastity);
900 }
901
902 /*
903 * TODO:
904 * - Check strand -> done simple (only if equal)
905 * - check for [AC] and similar entries -> done simple (see function
906 * - remove_ambiguities (exchanges [XY] by the second entry)
907 */
908
909 //if( read_nr == 13 || read_nr == 14 ) {
910 // printf("read nr %d",read_nr);
911 // printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
912 // printf("u/d range: %d %d\n",up_range,down_range);
913 // printf("u/d size: %d %d\n",u_size,d_size);
914 // printf("u/d off: %d %d\n",u_off,d_off);
915 // printf("******************\n");
916
917 // printf("%s\n",up_read->seq);
918 // printf("%s\n",down_read->seq);
919
920 // printf("******************\n");
921
922 // printf("%s\n",new_up_seq);
923 // printf("%s\n",new_down_seq);
924
925 // printf("******************\n");
926 // printf("%s\n",new_seq);
927 // printf("%s\n",new_prb);
928 // printf("%s\n",new_cal_prb);
929 // printf("%s\n",new_chastity);
930 //}
931
932 //// The four last integers specify the positions of
933 //// p_start : the position in the dna where the first read starts
934 //// exons_stop : the position in the dna where the first exons ends
935 //// exons_start : the position in the dna where the second exons starts
936 //// p_stop : the position in the dna where the (truncated) second read ends