+ fixed bug in the combination of reads with deletions
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26 void sort_genes(struct gene*** allGenes, int numGenes);
27 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
28 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
29 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
30 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
31 void remove_ambiguities(char * old_seq, char* new_seq);
32 int join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
33
34 static char *info = "Usage is:\n./filterReads gff reads output";
35
36 /*
37 * Some constants specifying the exact behavior of the filter
38 *
39 */
40
41 #define _FDEBUG_ 1
42
43 #define DEBUG_READ 15324
44
45 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
46
47 const int read_size = 36;
48
49 const int min_overlap = 6;
50 const int max_overlap = 30;
51
52 int read_nr = 1;
53
54 int combined_reads = 0;
55
56 int main(int argc, char* argv[]) {
57
58 if(argc != 5) {
59 printf("%s\n",info);
60 exit(EXIT_FAILURE);
61 }
62
63 int status;
64 int filenameSize = 256;
65 char* gff_filename = malloc(sizeof(char)*filenameSize);
66 char* reads_filename = malloc(sizeof(char)*filenameSize);
67 char* output_filename = malloc(sizeof(char)*filenameSize);
68
69
70 strncpy(gff_filename,argv[1],filenameSize);
71 strncpy(reads_filename,argv[2],filenameSize);
72 strncpy(output_filename,argv[3],filenameSize);
73
74 FILE *gff_fs = fopen(gff_filename,"r");
75 FILE *reads_fs = fopen(reads_filename,"r");
76 FILE *out_fs = fopen(output_filename,"w");
77
78 read_nr = atoi(argv[4]);
79 read_nr++;
80
81 if(gff_fs == NULL) {
82 printf("Error: Could not open file: %s",gff_filename);
83 exit(EXIT_FAILURE);
84 }
85
86 if(reads_fs == NULL) {
87 printf("Error: Could not open file: %s",reads_filename);
88 exit(EXIT_FAILURE);
89 }
90
91 if(out_fs == NULL) {
92 printf("Error: Could not open file: %s",output_filename);
93 exit(EXIT_FAILURE);
94 }
95
96 struct gene** allGenes;
97 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
98 status = fclose(gff_fs);
99 free(gff_filename);
100 if(status != 0)
101 printf("closing of gff filestream failed!\n");
102
103 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
104
105 // check if allGenes is sorted. if not throw away those genes that do not
106 // occur in the sorted order
107 int g_idx;
108 struct gene* currentGene = 0;
109 int nulled_genes=0;
110 int old_gene_stop = -1;
111 for(g_idx=0;g_idx<numGenes;g_idx++) {
112 currentGene = allGenes[g_idx];
113
114 if (! (currentGene->start < currentGene->stop))
115 printf("Invalid positions for gene %s!\n",currentGene->id);
116
117 if (! (old_gene_stop < currentGene->start ) ) {
118 old_gene_stop = currentGene->stop;
119 allGenes[g_idx] = 0;
120 nulled_genes++;
121 continue;
122 }
123 old_gene_stop = currentGene->stop;
124 }
125
126 printf("Found %d unordered genes.\n",nulled_genes);
127 int gidx, eidx;
128 int exon_cov = 0;
129 for(gidx=0;gidx<numGenes;gidx++) {
130 if (allGenes[gidx] == 0)
131 continue;
132
133 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
134 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
135 }}
136 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
137
138 process_reads(reads_fs,&allGenes,numGenes,out_fs);
139
140 for(g_idx=0;g_idx<numGenes;g_idx++) {
141 if(allGenes[g_idx] != 0)
142 free_gene(allGenes[g_idx]);
143 free(allGenes[g_idx]);
144
145 }
146 free(allGenes);
147
148 status = fclose(reads_fs);
149 status = fclose(out_fs);
150 if(status != 0)
151 perror("fclose");
152
153 free(reads_filename);
154 free(output_filename);
155 return 0;
156 }
157
158 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
159 int status;
160
161 int buffer_size= 64;
162 int chr = 0;
163 int pos = 0;
164 char* seq = malloc(sizeof(char)*buffer_size);
165 unsigned long id = 0;
166 char strand = ' ';
167 int mismatch = 0;
168 int occurrence = 0;
169 int size = 0;
170 int cut = 0;
171 char* prb = malloc(sizeof(char)*buffer_size);
172 char* cal_prb = malloc(sizeof(char)*buffer_size);
173 char* chastity = malloc(sizeof(char)*buffer_size);
174
175 int reads_fid = fileno(reads_fs);
176 struct stat reads_stat;
177 if ( fstat(reads_fid,&reads_stat) == -1) {
178 perror("fstat");
179 exit(EXIT_FAILURE);
180 }
181 off_t reads_filesize = reads_stat.st_size;
182 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
183 int numReads = reads_filesize / 178.0;
184
185 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
186 if (reads_area == MAP_FAILED) {
187 perror("mmap");
188 exit(EXIT_FAILURE);
189 }
190 close(reads_fid);
191 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
192
193 char* lineBeginPtr = (char*) reads_area;
194 char* lineEndPtr = (char*) reads_area;
195 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
196
197 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
198 lineEndPtr++;
199
200 char* current_line = malloc(sizeof(char)*512);
201 memset(current_line,0,512);
202
203 int SIZE = 500;
204 // initialize boundary arrays
205 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
206 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
207 int uov,dov;
208 uov = dov = 0;
209
210 int skippedLinesCounter = 0;
211
212 int prev_exon_start = -1;
213 int prev_exon_stop = -1;
214 int cur_exon_start = -1;
215
216 unsigned long line_size = lineEndPtr - lineBeginPtr;
217 //printf("distance is %lu\n",line_size);
218 strncpy(current_line,lineBeginPtr,line_size);
219 current_line[line_size] = '\0';
220 //printf("%s test",current_line);
221
222 int gene_idx = 0;
223 int exon_idx = 1;
224 struct gene* currentGene = (*allGenes)[gene_idx];
225 char* gene_id = currentGene->id;
226
227 int skippedReadCtr = 0;
228 int uselessReadCtr = 0;
229 int exonicReadCtr = 0;
230 int endPrevCtr = 0;
231 int prevOverlapCtr = 0;
232 int currentStartCtr = 0;
233 int currentOverlapCtr = 0;
234 int multioccurReadCtr = 0;
235
236 Read* currentRead;
237 int up_idx, down_idx;
238
239 int readCtr = 0;
240 // start of the parsing loop
241 while(1) {
242 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
243 break;
244
245 gene_id = currentGene->id;
246
247 if (readCtr != 0 && readCtr % 1000000 == 0)
248 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
249
250 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
251 if (status < 12) {
252 skippedLinesCounter++;
253 goto next_read;
254 }
255
256 // if the read is occurring several times elsewhere then get rid of it
257 if ( occurrence != 1 ) {
258 multioccurReadCtr++;
259 goto next_read;
260 }
261
262 FA(strlen(seq) >= read_size);
263
264 FA(currentGene != 0);
265
266 if ( currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop) { // read is not within gene borders
267
268 exon_label:
269
270 if (exon_idx == currentGene->num_exons) {
271 gene_idx++;
272 exon_idx = 1;
273 currentGene = (*allGenes)[gene_idx];
274 while( currentGene == 0 && gene_idx < numGenes) {
275 currentGene = (*allGenes)[gene_idx];
276 gene_idx++;
277 }
278 continue;
279 }
280
281 prev_exon_start = currentGene->exon_starts[exon_idx-1];
282 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
283 cur_exon_start = currentGene->exon_starts[exon_idx];
284
285 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
286
287 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
288
289 if (uov != 0 && dov != 0)
290 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
291
292 for(up_idx=0;up_idx<uov;up_idx++) {
293 free_read(upstream_overlap[up_idx]);
294 free(upstream_overlap[up_idx]);
295 }
296
297 for(down_idx=0;down_idx<dov;down_idx++) {
298 free_read(downstream_overlap[down_idx]);
299 free(downstream_overlap[down_idx]);
300 }
301
302 uov = dov = 0;
303
304 exon_idx++;
305 goto exon_label;
306 }
307
308 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
309 exonicReadCtr++;
310 goto next_read;
311 }
312
313 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
314 goto next_read;
315
316 // if this position is reached the read is somehow overlapping or
317 // exactly on the exon boundary.
318 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
319 //printf("%s\n",current_line);
320 prevOverlapCtr++;
321 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
322 upstream_overlap[uov] = currentRead;
323 uov++;
324 goto next_read;
325 }
326
327 int end_pos = pos+read_size-1;
328 if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
329 //printf("%s\n",current_line);
330 currentOverlapCtr++;
331 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
332 downstream_overlap[dov] = currentRead;
333 dov++;
334 goto next_read;
335 }
336
337 uselessReadCtr++;
338 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
339
340 } else { // read is not within gene borders
341
342 if (uov != 0 && dov != 0)
343 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
344
345 for(up_idx=0;up_idx<uov;up_idx++) {
346 free_read(upstream_overlap[up_idx]);
347 free(upstream_overlap[up_idx]);
348 }
349
350 for(down_idx=0;down_idx<dov;down_idx++) {
351 free_read(downstream_overlap[down_idx]);
352 free(downstream_overlap[down_idx]);
353 }
354
355 uov = dov = 0;
356
357 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
358 gene_idx++;
359 exon_idx = 1;
360 currentGene = (*allGenes)[gene_idx];
361 while( currentGene == 0 && gene_idx < numGenes) {
362 currentGene = (*allGenes)[gene_idx];
363 gene_idx++;
364 }
365 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
366 continue;
367 }
368
369 if ( pos < currentGene->start ) { // go to next read
370 skippedReadCtr++;
371
372 next_read:
373
374 lineBeginPtr = lineEndPtr;
375 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
376 lineEndPtr++;
377 readCtr += 1;
378 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
379 current_line[lineEndPtr-lineBeginPtr] = '\0';
380 continue;
381 }
382 }
383 }
384
385 if (uov != 0 && dov != 0)
386 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
387
388 for(up_idx=0;up_idx<uov;up_idx++) {
389 free_read(upstream_overlap[up_idx]);
390 free(upstream_overlap[up_idx]);
391 }
392
393 for(down_idx=0;down_idx<dov;down_idx++) {
394 free_read(downstream_overlap[down_idx]);
395 free(downstream_overlap[down_idx]);
396 }
397
398 uov = dov = 0;
399
400 free(upstream_overlap);
401 free(downstream_overlap);
402
403 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
404 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
405 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
406 printf("%d reads were totally exonic\n",exonicReadCtr);
407 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
408 printf("%d reads where newly combined from two original reads\n",combined_reads);
409 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
410
411 status = munmap(reads_area,reads_filesize);
412 if(status != 0)
413 perror("munmap");
414
415 free(current_line);
416 free(seq);
417 free(prb);
418 free(cal_prb);
419 free(chastity);
420 }
421
422 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
423
424 //printf("up/down size is %d/%d\n",up_size,down_size);
425
426 int up_idx, down_idx, success;
427
428 char* up_used_flag = calloc(up_size,sizeof(char));
429 char* down_used_flag = calloc(down_size,sizeof(char));
430
431 Read* currentUpRead;
432 Read* currentDownRead;
433
434 for(up_idx=0;up_idx<up_size;up_idx++) {
435 if( up_used_flag[up_idx] == 1)
436 continue;
437
438 currentUpRead = upstream[up_idx];
439
440 for(down_idx=0;down_idx<down_size;down_idx++) {
441
442 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
443 continue;
444
445 currentDownRead = downstream[down_idx];
446
447 if(currentUpRead->strand != currentDownRead->strand)
448 continue;
449
450 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
451
452 if(success == 1) {
453 up_used_flag[up_idx] = 1;
454 down_used_flag[down_idx] = 1;
455 }
456
457 }
458 }
459
460 free(up_used_flag);
461 free(down_used_flag);
462 }
463
464 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
465 // range of possible sequence length on exon side
466 int up_range = exon_stop - up_read->pos+1;
467 int down_range = down_read->pos+(read_size-1) - exon_start+1;
468 int retval;
469
470 int u_off, d_off, u_size, d_size;
471 u_size = d_size = -1;
472
473 //printf("possible range %d %d\n",up_range,down_range);
474
475 if(up_range+down_range < read_size)
476 return 0;
477
478 int half_read = read_size / 2;
479
480 // both overlap more than a half of the new read
481 if(up_range >= half_read && down_range >= half_read) {
482
483 u_size = half_read;
484 d_size = half_read;
485
486 } else {
487
488 if(up_range < down_range) {
489 u_size = up_range;
490 d_size = read_size - u_size;
491 }
492
493 if(up_range > down_range) {
494 d_size = down_range;
495 u_size = read_size - d_size;
496 }
497 }
498
499 int p_start = exon_stop - u_size + 1;
500 int p_stop = exon_start + d_size - 1;
501
502 u_off = p_start - up_read->pos;
503 d_off = exon_start - down_read->pos;
504
505 FA(u_size != -1);
506 FA(d_size != -1);
507 FA(u_size + d_size == read_size);
508
509 int buf_size = 4*read_size;
510 char* new_up_seq = malloc(sizeof(char)*buf_size);
511 char* new_down_seq = malloc(sizeof(char)*buf_size);
512 char* new_seq = malloc(sizeof(char)*buf_size);
513
514 memset(new_up_seq,0,sizeof(char)*buf_size);
515 memset(new_down_seq,0,sizeof(char)*buf_size);
516 memset(new_seq,0,sizeof(char)*buf_size);
517
518 remove_ambiguities(up_read->seq,new_up_seq);
519 remove_ambiguities(down_read->seq,new_down_seq);
520
521 int additional_pos = join_seq(new_seq,new_up_seq,u_off,u_size,new_down_seq,d_off,d_size,down_range);
522
523 buf_size = read_size+1+additional_pos;
524
525 char* new_prb = malloc(sizeof(char)*buf_size);
526 char* new_cal_prb = malloc(sizeof(char)*buf_size);
527 char* new_chastity = malloc(sizeof(char)*buf_size);
528
529 if( additional_pos > (down_range - d_size) ) {
530 retval = 0;
531 goto free;
532 }
533
534 strncpy(new_prb, up_read->prb+u_off, u_size);
535 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
536 new_prb[read_size+additional_pos] = '\0';
537
538 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
539 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
540 new_cal_prb[read_size+additional_pos] = '\0';
541
542 strncpy(new_chastity, up_read->chastity+u_off, u_size);
543 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
544 new_chastity[read_size+additional_pos] = '\0';
545
546 #ifdef _FDEBUG_
547 if(read_nr == DEBUG_READ) {
548 printf("read nr %d",read_nr);
549 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
550 printf("u/d range: %d %d\n",up_range,down_range);
551 printf("u/d size: %d %d\n",u_size,d_size);
552 printf("u/d off: %d %d\n",u_off,d_off);
553 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
554
555 printf("******************\n");
556
557 printf("%s\n",up_read->seq);
558 printf("%s\n",new_up_seq);
559
560 printf("******************\n");
561
562 printf("%s\n",down_read->seq);
563 printf("%s\n",new_down_seq);
564
565 printf("******************\n");
566 printf("%s\n",new_seq);
567 printf("%s\n",new_prb);
568 printf("%s\n",new_cal_prb);
569 printf("%s\n",new_chastity);
570 }
571 #endif // _FDEBUG_
572
573 int status = fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
574
575 if(status != 1) {
576 retval = 0;
577 goto free;
578 }
579
580 retval = status;
581
582 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
583 read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
584
585 read_nr++;
586 combined_reads++;
587
588 free:
589 free(new_up_seq);
590 free(new_down_seq);
591
592 free(new_seq);
593 free(new_prb);
594 free(new_cal_prb);
595 free(new_chastity);
596
597 return retval;
598 }
599
600 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
601 double current_mean_up = 0;
602 double current_mean_down = 0;
603
604 int w_size = 2;
605
606 int idx;
607
608 //printf("prb %s\n",up_prb);
609 //printf("prb %s\n",down_prb);
610
611 for(idx=0;idx<w_size;idx++) {
612 current_mean_up += up_prb[u_off+u_size+idx]-50;
613 current_mean_up += up_prb[u_off+u_size-idx]-50;
614 current_mean_up -= up_prb[idx]-50;
615
616 current_mean_down += up_prb[d_off+idx]-50;
617 current_mean_down += up_prb[d_off+idx]-50;
618 current_mean_down -= up_prb[idx]-50;
619 }
620
621 current_mean_up /= (2*w_size+1);
622 current_mean_down /= (2*w_size+1);
623
624 double ratio;
625 if( current_mean_up > current_mean_down)
626 ratio = current_mean_down / current_mean_up;
627 else
628 ratio = current_mean_up / current_mean_down;
629
630 //printf("ratio is %f\n",ratio);
631
632 if (ratio < 0.35)
633 return 0;
634
635 return 1;
636 }
637
638 /*
639 * As we can have 3 kinds of ambiguities
640 *
641 * [AG]
642 * [A-]
643 * [-A]
644 *
645 * we want to remove/store these ambiguities in the filtered reads
646 */
647
648 void remove_ambiguities(char * old_seq, char* new_seq) {
649 int idx=0;
650 int new_idx = 0;
651 int old_seq_size = strlen(old_seq);
652
653 while(idx<old_seq_size) {
654 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
655 if (old_seq[idx] == ']') {
656 idx++;
657 continue;
658 }
659
660 if (old_seq[idx] == '[') {
661
662 // we have a indel either on the dna or read sequence
663 if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
664
665
666 while(1) {
667 new_seq[new_idx] = old_seq[idx];
668 if(old_seq[idx] == ']')
669 break;
670 new_idx++;
671 idx++;
672 }
673
674 } else {
675 idx += 2;
676 continue;
677 }
678 }
679
680 new_seq[new_idx] = old_seq[idx];
681 idx++;
682 new_idx++;
683 }
684
685 new_seq[new_idx] = '\0';
686 //printf("new_seq is %d :: %s\n",new_idx,new_seq);
687 }
688
689
690 int join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range) {
691 int new_idx, idx;
692 new_idx = idx = 0;
693 int elem_ctr = 0;
694 int read_del_ctr = 0;
695
696 int u_idx = 0;
697 int skipped_pos_ctr = 0;
698 while(1) {
699 if( u_idx >= read_size || skipped_pos_ctr == u_off)
700 break;
701
702 if(up_seq[u_idx] == '[') {
703 u_idx += 4;
704 skipped_pos_ctr++;
705 } else {
706 u_idx++;
707 skipped_pos_ctr++;
708 }
709 }
710
711 if(read_nr == DEBUG_READ) {
712 printf("u_off is %d\n",u_off);
713 printf("u_idx is %d\n",u_idx);
714 }
715 u_off = u_idx;
716
717
718 if( u_off > 0 && up_seq[u_off+idx-1] == '[' )
719 idx = -1;
720
721 if( u_off > 1 && up_seq[u_off+idx-2] == '[' )
722 idx = -2;
723
724 if( u_off > 2 && up_seq[u_off+idx-3] == '[' )
725 idx = -3;
726
727 while(1) {
728 if (elem_ctr == u_size)
729 break;
730
731 if(up_seq[u_off+idx] == '[') {
732
733 if(up_seq[u_off+idx+2] == '-') // del in read seq.
734 read_del_ctr++;
735
736
737 while(1) {
738 new_seq[new_idx] = up_seq[u_off+idx];
739 new_idx++;
740 idx++;
741 if(up_seq[u_off+idx-1] == ']')
742 break;
743 }
744 elem_ctr++;
745 continue;
746 }
747
748 new_seq[new_idx] = up_seq[u_off+idx];
749 idx++;
750 new_idx++;
751 elem_ctr++;
752 }
753
754 elem_ctr = 0;
755 idx = 0;
756
757 int d_idx = 0;
758 skipped_pos_ctr = 0;
759 while(1) {
760 if( d_idx >= read_size || skipped_pos_ctr == d_off)
761 break;
762
763 if(down_seq[d_idx] == '[') {
764 d_idx += 4;
765 skipped_pos_ctr++;
766 } else {
767 d_idx++;
768 skipped_pos_ctr++;
769 }
770 }
771
772 if(read_nr == DEBUG_READ) {
773 printf("d_off is %d\n",d_off);
774 printf("d_idx is %d\n",d_idx);
775 }
776 d_off = d_idx;
777 //if(read_nr == DEBUG_READ) {
778 // printf("%s\n",new_seq);
779 // printf("%c\n",down_seq[d_off+idx]);
780 //}
781
782 if( d_off > 0 && down_seq[d_off+idx-1] == '[' )
783 idx = -1;
784
785 if( d_off > 1 && down_seq[d_off+idx-2] == '[' )
786 idx = -2;
787
788 if( d_off > 2 && down_seq[d_off+idx-3] == '[' )
789 idx = -3;
790
791 while(1) {
792 if (elem_ctr == d_size)
793 break;
794
795 if(down_seq[d_off+idx] == '[') {
796
797 if(down_seq[d_off+idx+2] == '-') // del in read seq.
798 read_del_ctr++;
799
800 while(1) {
801 new_seq[new_idx] = down_seq[d_off+idx];
802 new_idx++;
803 idx++;
804 if(down_seq[d_off+idx-1] == ']')
805 break;
806 }
807 elem_ctr++;
808 continue;
809 }
810
811 new_seq[new_idx] = down_seq[d_off+idx];
812 idx++;
813 new_idx++;
814 elem_ctr++;
815 }
816
817 elem_ctr = 0;
818
819 #ifdef _FDEBUG_
820 if(read_nr == DEBUG_READ) {
821 printf("read_del_ctr %d\n",read_del_ctr);
822 printf("idx/new_idx: %d %d\n",idx,new_idx);
823 printf("next char: %c\n",down_seq[d_off+idx]);
824 printf("down_seq: %s\n",down_seq);
825 }
826 #endif // _FDEBUG_
827
828 while(1) {
829 if (elem_ctr == read_del_ctr || d_off+idx >= strlen(down_seq))
830 break;
831
832 if(down_seq[d_off+idx] == '[') {
833
834 while(1) {
835 new_seq[new_idx] = down_seq[d_off+idx];
836 new_idx++;
837 idx++;
838 if(down_seq[d_off+idx-1] == ']')
839 break;
840 }
841 elem_ctr++;
842 continue;
843 }
844
845 #ifdef _FDEBUG_
846 if(read_nr == DEBUG_READ)
847 printf("current char: %c\n",down_seq[d_off+idx]);
848 #endif // _FDEBUG_
849
850 new_seq[new_idx] = down_seq[d_off+idx];
851 new_idx++;
852 idx++;
853 elem_ctr++;
854 }
855
856 new_seq[new_idx+1] = '\0';
857
858 #ifdef _FDEBUG_
859 if(read_nr == DEBUG_READ)
860 printf("idx/new_idx: %d %d\n",idx,new_idx);
861 #endif // _FDEBUG_
862
863 return read_del_ctr;
864 }
865
866 void print_read(Read* cRead) {
867
868 printf(line_format,
869 cRead->chr, cRead->pos, cRead->seq, cRead->id,
870 cRead->strand, cRead->mismatch, cRead->occurrence,
871 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
872 cRead->chastity);
873 }
874
875 /*
876 * TODO:
877 * - Check strand -> done simple (only if equal)
878 * - check for [AC] and similar entries -> done simple (see function
879 * - remove_ambiguities (exchanges [XY] by the second entry)
880 */
881
882 //if( read_nr == 13 || read_nr == 14 ) {
883 // printf("read nr %d",read_nr);
884 // printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
885 // printf("u/d range: %d %d\n",up_range,down_range);
886 // printf("u/d size: %d %d\n",u_size,d_size);
887 // printf("u/d off: %d %d\n",u_off,d_off);
888 // printf("******************\n");
889
890 // printf("%s\n",up_read->seq);
891 // printf("%s\n",down_read->seq);
892
893 // printf("******************\n");
894
895 // printf("%s\n",new_up_seq);
896 // printf("%s\n",new_down_seq);
897
898 // printf("******************\n");
899 // printf("%s\n",new_seq);
900 // printf("%s\n",new_prb);
901 // printf("%s\n",new_cal_prb);
902 // printf("%s\n",new_chastity);
903 //}
904
905 //// The four last integers specify the positions of
906 //// p_start : the position in the dna where the first read starts
907 //// exons_stop : the position in the dna where the first exons ends
908 //// exons_start : the position in the dna where the second exons starts
909 //// p_stop : the position in the dna where the (truncated) second read ends