+ fixed uninitialised values
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26
27 void sort_genes(struct gene*** allGenes, int numGenes);
28
29 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
30
31 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
32
33 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
34
35 int fitting(char* up_prb, char* down_prb);
36
37 void remove_ambiguities(char * old_seq, char* new_seq);
38
39 static char *info = "Usage is:\n./filterReads gff reads output";
40
41 /*
42 * Some constants specifying the exact behavior of the filter
43 *
44 */
45
46 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
47
48 const int read_size = 36;
49
50 const int min_overlap = 6;
51 const int max_overlap = 30;
52
53 int read_nr = 1;
54
55 int combined_reads = 0;
56 int main(int argc, char* argv[]) {
57
58 if(argc != 5) {
59 printf("%s\n",info);
60 exit(EXIT_FAILURE);
61 }
62
63 int status;
64 int filenameSize = 256;
65 char* gff_filename = malloc(sizeof(char)*filenameSize);
66 char* reads_filename = malloc(sizeof(char)*filenameSize);
67 char* output_filename = malloc(sizeof(char)*filenameSize);
68
69
70 strncpy(gff_filename,argv[1],filenameSize);
71 strncpy(reads_filename,argv[2],filenameSize);
72 strncpy(output_filename,argv[3],filenameSize);
73
74 FILE *gff_fs = fopen(gff_filename,"r");
75 FILE *reads_fs = fopen(reads_filename,"r");
76 FILE *out_fs = fopen(output_filename,"w");
77
78 read_nr = atoi(argv[4]);
79 read_nr++;
80
81 if(gff_fs == NULL) {
82 printf("Error: Could not open file: %s",gff_filename);
83 exit(EXIT_FAILURE);
84 }
85
86 if(reads_fs == NULL) {
87 printf("Error: Could not open file: %s",reads_filename);
88 exit(EXIT_FAILURE);
89 }
90
91 if(out_fs == NULL) {
92 printf("Error: Could not open file: %s",output_filename);
93 exit(EXIT_FAILURE);
94 }
95
96 struct gene** allGenes;
97 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
98 status = fclose(gff_fs);
99 free(gff_filename);
100 if(status != 0)
101 printf("closing of gff filestream failed!\n");
102
103 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
104
105 // check if allGenes is sorted. if not throw away those genes that do not
106 // occur in the sorted order
107 int g_idx;
108 struct gene* currentGene = 0;
109 int nulled_genes=0;
110 int old_gene_stop = -1;
111 for(g_idx=0;g_idx<numGenes;g_idx++) {
112 currentGene = allGenes[g_idx];
113
114 if (! (currentGene->start < currentGene->stop))
115 printf("Invalid positions for gene %s!\n",currentGene->id);
116
117 if (! (old_gene_stop < currentGene->start ) ) {
118 old_gene_stop = currentGene->stop;
119 allGenes[g_idx] = 0;
120 nulled_genes++;
121 continue;
122 }
123 old_gene_stop = currentGene->stop;
124 }
125
126 printf("Found %d unordered genes.\n",nulled_genes);
127 int gidx, eidx;
128 int exon_cov = 0;
129 for(gidx=0;gidx<numGenes;gidx++) {
130 if (allGenes[gidx] == 0)
131 continue;
132
133 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
134 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
135 }}
136 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
137
138 process_reads(reads_fs,&allGenes,numGenes,out_fs);
139
140 for(g_idx=0;g_idx<numGenes;g_idx++) {
141 if(allGenes[g_idx] != 0)
142 free_gene(allGenes[g_idx]);
143 free(allGenes[g_idx]);
144
145 }
146 free(allGenes);
147
148 status = fclose(reads_fs);
149 status = fclose(out_fs);
150 if(status != 0)
151 perror("fclose");
152
153 free(reads_filename);
154 free(output_filename);
155 return 0;
156 }
157
158 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
159 int status;
160
161 int buffer_size= 64;
162 int chr = 0;
163 int pos = 0;
164 char* seq = malloc(sizeof(char)*buffer_size);
165 unsigned long id = 0;
166 char strand = ' ';
167 int mismatch = 0;
168 int occurrence = 0;
169 int size = 0;
170 int cut = 0;
171 char* prb = malloc(sizeof(char)*buffer_size);
172 char* cal_prb = malloc(sizeof(char)*buffer_size);
173 char* chastity = malloc(sizeof(char)*buffer_size);
174
175 int reads_fid = fileno(reads_fs);
176 struct stat reads_stat;
177 if ( fstat(reads_fid,&reads_stat) == -1) {
178 perror("fstat");
179 exit(EXIT_FAILURE);
180 }
181 off_t reads_filesize = reads_stat.st_size;
182 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
183 int numReads = reads_filesize / 178.0;
184
185 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
186 if (reads_area == MAP_FAILED) {
187 perror("mmap");
188 exit(EXIT_FAILURE);
189 }
190 close(reads_fid);
191 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
192
193 char* lineBeginPtr = (char*) reads_area;
194 char* lineEndPtr = (char*) reads_area;
195 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
196
197 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
198 lineEndPtr++;
199
200 char* current_line = malloc(sizeof(char)*512);
201 memset(current_line,0,512);
202
203 int SIZE = 500;
204 // initialize boundary arrays
205 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
206 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
207 int uov,dov;
208 uov = dov = 0;
209
210 int skippedLinesCounter = 0;
211
212 int prev_exon_start = -1;
213 int prev_exon_stop = -1;
214 int cur_exon_start = -1;
215
216 unsigned long line_size = lineEndPtr - lineBeginPtr;
217 //printf("distance is %lu\n",line_size);
218 strncpy(current_line,lineBeginPtr,line_size);
219 current_line[line_size] = '\0';
220 //printf("%s test",current_line);
221
222 int gene_idx = 0;
223 int exon_idx = 1;
224 struct gene* currentGene = (*allGenes)[gene_idx];
225 char* gene_id = currentGene->id;
226
227 int skippedReadCtr = 0;
228 int uselessReadCtr = 0;
229 int exonicReadCtr = 0;
230 int endPrevCtr = 0;
231 int prevOverlapCtr = 0;
232 int currentStartCtr = 0;
233 int currentOverlapCtr = 0;
234 int multioccurReadCtr = 0;
235
236 Read* currentRead;
237 int up_idx, down_idx;
238
239 int readCtr = 0;
240 // start of the parsing loop
241 while(1) {
242 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
243 break;
244
245 gene_id = currentGene->id;
246
247 if (readCtr != 0 && readCtr % 1000000 == 0)
248 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
249
250 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
251 if (status < 12) {
252 skippedLinesCounter++;
253 goto next_read;
254 }
255
256 // if the read is occurring several times elsewhere then get rid of it
257 if ( occurrence != 1 ) {
258 multioccurReadCtr++;
259 goto next_read;
260 }
261
262 FA(currentGene != 0);
263
264 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
265
266 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
267
268 if (uov != 0 && dov != 0)
269 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
270
271 for(up_idx=0;up_idx<uov;up_idx++) {
272 free_read(upstream_overlap[up_idx]);
273 free(upstream_overlap[up_idx]);
274 }
275
276 for(down_idx=0;down_idx<dov;down_idx++) {
277 free_read(downstream_overlap[down_idx]);
278 free(downstream_overlap[down_idx]);
279 }
280
281 uov = dov = 0;
282
283 gene_idx++;
284 exon_idx = 1;
285 currentGene = (*allGenes)[gene_idx];
286 while( currentGene == 0 && gene_idx < numGenes) {
287 currentGene = (*allGenes)[gene_idx];
288 gene_idx++;
289 }
290 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
291 continue;
292 }
293
294 if ( pos < currentGene->start ) { // go to next read
295 skippedReadCtr++;
296
297 next_read:
298
299 lineBeginPtr = lineEndPtr;
300 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
301 lineEndPtr++;
302 readCtr += 1;
303 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
304 current_line[lineEndPtr-lineBeginPtr] = '\0';
305 continue;
306 }
307
308 } else { // read IS within gene borders
309
310 exon_label:
311
312 if (exon_idx == currentGene->num_exons) {
313 gene_idx++;
314 exon_idx = 1;
315 currentGene = (*allGenes)[gene_idx];
316 while( currentGene == 0 && gene_idx < numGenes) {
317 currentGene = (*allGenes)[gene_idx];
318 gene_idx++;
319 }
320 continue;
321 }
322
323 prev_exon_start = currentGene->exon_starts[exon_idx-1];
324 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
325 cur_exon_start = currentGene->exon_starts[exon_idx];
326
327 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
328
329 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
330
331 if (uov != 0 && dov != 0)
332 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
333
334 for(up_idx=0;up_idx<uov;up_idx++) {
335 free_read(upstream_overlap[up_idx]);
336 free(upstream_overlap[up_idx]);
337 }
338
339 for(down_idx=0;down_idx<dov;down_idx++) {
340 free_read(downstream_overlap[down_idx]);
341 free(downstream_overlap[down_idx]);
342 }
343 uov = dov = 0;
344
345 exon_idx++;
346 goto exon_label;
347 }
348
349 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
350 exonicReadCtr++;
351 goto next_read;
352 }
353
354 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
355 goto next_read;
356
357 // if this position is reached the read is somehow overlapping or
358 // exactly on the exon boundary.
359 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
360 //printf("%s\n",current_line);
361 prevOverlapCtr++;
362 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
363 upstream_overlap[uov] = currentRead;
364 uov++;
365 goto next_read;
366 }
367
368 int end_pos = pos+read_size-1;
369 if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
370 //printf("%s\n",current_line);
371 currentOverlapCtr++;
372 currentRead = create_read(chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
373 downstream_overlap[dov] = currentRead;
374 dov++;
375 goto next_read;
376 }
377
378 uselessReadCtr++;
379 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
380 }
381 }
382
383 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
384
385 for(up_idx=0;up_idx<uov;up_idx++) {
386 free_read(upstream_overlap[up_idx]);
387 free(upstream_overlap[up_idx]);
388 }
389
390 for(down_idx=0;down_idx<dov;down_idx++) {
391 free_read(downstream_overlap[down_idx]);
392 free(downstream_overlap[down_idx]);
393 }
394
395 uov = dov = 0;
396
397 free(upstream_overlap);
398 free(downstream_overlap);
399
400 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
401 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
402 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
403 printf("%d reads were totally exonic\n",exonicReadCtr);
404 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
405 printf("%d reads where newly combined from two original reads\n",combined_reads);
406 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
407
408 status = munmap(reads_area,reads_filesize);
409 if(status != 0)
410 perror("munmap");
411
412 free(current_line);
413 free(seq);
414 free(prb);
415 free(cal_prb);
416 free(chastity);
417 }
418
419 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
420
421 printf("up/down size is %d/%d\n",up_size,down_size);
422
423 int up_idx, down_idx, overlap, success;
424
425 char* up_used_flag = calloc(up_size,sizeof(char));
426 char* down_used_flag = calloc(down_size,sizeof(char));
427
428 Read* currentUpRead;
429 Read* currentDownRead;
430
431 for(up_idx=0;up_idx<up_size;up_idx++) {
432 if( up_used_flag[up_idx] == 1)
433 continue;
434
435 currentUpRead = upstream[up_idx];
436
437 overlap = exon_stop - currentUpRead->pos;
438
439 for(down_idx=0;down_idx<down_size;down_idx++) {
440 if( down_used_flag[down_idx] == 1)
441 continue;
442
443 currentDownRead = downstream[down_idx];
444
445 if(currentUpRead->strand != currentDownRead->strand)
446 continue;
447
448 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
449
450 if(success) {
451 up_used_flag[up_idx] = 1;
452 down_used_flag[down_idx] = 1;
453 }
454
455 }
456 }
457
458 free(up_used_flag);
459 free(down_used_flag);
460 }
461
462 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
463 int buffer_size = read_size+1;
464
465 char* new_seq = malloc(sizeof(char)*buffer_size);
466 char* new_prb = malloc(sizeof(char)*buffer_size);
467 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
468 char* new_chastity = malloc(sizeof(char)*buffer_size);
469
470 char* new_up_seq = malloc(sizeof(char)*(read_size+1));
471 char* new_down_seq = malloc(sizeof(char)*(read_size+1));
472 memset(new_up_seq,0,sizeof(char)*read_size);
473 memset(new_down_seq,0,sizeof(char)*read_size);
474
475 // range of possible sequence length on exon side
476 int up_range = exon_stop - up_read->pos+1;
477 int down_range = down_read->pos+(read_size-1) - exon_start+1;
478
479 int u_off, d_off, u_size, d_size;
480 u_size = -1;
481 d_size = -1;
482
483 printf("possible range %d %d\n",up_range,down_range);
484
485 if(up_range+down_range < read_size)
486 return 0;
487
488 int half_read = read_size / 2;
489
490 // both overlap more than a half of the new read
491 if(up_range >= half_read && down_range >= half_read) {
492
493 u_size = half_read;
494 d_size = half_read;
495
496 } else {
497
498 if(up_range < down_range) {
499 u_size = up_range;
500 d_size = read_size - u_size;
501 }
502
503 if(up_range > down_range) {
504 d_size = down_range;
505 u_size = read_size - d_size;
506 }
507 }
508
509 int p_start = exon_stop - u_size + 1;
510 int p_stop = exon_start + d_size - 1;
511
512 u_off = p_start - up_read->pos;
513 d_off = exon_start - down_read->pos + 1;
514
515 FA(u_size != -1);
516 FA(d_size != -1);
517 FA(u_size + d_size == read_size);
518
519 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
520 printf("u/d size: %d %d\n",u_size,d_size);
521 printf("u/d off: %d %d\n",u_off,d_off);
522
523 remove_ambiguities(up_read->seq,new_up_seq);
524 remove_ambiguities(down_read->seq,new_down_seq);
525
526 strncpy(new_seq,new_up_seq+u_off,u_size);
527 strncpy(new_seq+u_size,new_down_seq+d_off,d_size);
528 new_seq[read_size] = '\0';
529
530 strncpy(new_prb, up_read->prb+u_off, u_size);
531 strncpy(new_prb+u_size, down_read->prb+d_off, d_size);
532 new_prb[read_size] = '\0';
533
534 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
535 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size);
536 new_cal_prb[read_size] = '\0';
537
538 strncpy(new_chastity, up_read->chastity+u_off, u_size);
539 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size);
540 new_chastity[read_size] = '\0';
541
542 //printf("%s\n",new_seq);
543 //printf("%s\n",new_prb);
544 //printf("%s\n",new_cal_prb);
545 //printf("%s\n",new_chastity);
546
547 //// The four last integers specify the positions of
548 //// p_start : the position in the dna where the first read starts
549 //// exons_stop : the position in the dna where the first exons ends
550 //// exons_start : the position in the dna where the second exons starts
551 //// p_stop : the position in the dna where the (truncated) second read ends
552
553 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
554 read_nr,up_read->chr,up_read->strand,new_seq,u_size,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
555
556 read_nr++;
557
558 free(new_up_seq);
559 free(new_down_seq);
560
561 free(new_seq);
562 free(new_prb);
563 free(new_cal_prb);
564 free(new_chastity);
565
566 return 1;
567 }
568
569 int fitting(char* up_prb, char* down_prb) {
570 double epsilon_mean = 30.0;
571 double epsilon_var = 30.0;
572 int w_size = 6;
573
574 double current_mean_up = 0;
575 double current_variance_up = 0;
576 double current_mean_down = 0;
577 double current_variance_down = 0;
578
579 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
580 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
581 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
582 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
583
584 int iidx = -1;
585 int uidx;
586 int didx;
587
588 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
589 didx = uidx+w_size;
590 current_mean_up += up_prb[uidx]-50;
591 current_mean_down += up_prb[didx]-50;
592 }
593
594 current_mean_up /= w_size;
595 current_mean_down /= w_size;
596
597 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
598 didx = uidx+w_size;
599 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
600 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
601 }
602
603 current_variance_up /= (w_size-1);
604 current_variance_down /= (w_size-1);
605
606 for(iidx=w_size;iidx<30;iidx++) {
607 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
608 didx = uidx+w_size;
609 mean_up[iidx-w_size] += down_prb[uidx]-50;
610 mean_down[iidx-w_size] += down_prb[didx]-50;
611
612 }
613 mean_up[iidx-w_size] /= w_size;
614 mean_down[iidx-w_size] /= w_size;
615
616 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
617 didx = uidx+w_size;
618 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
619 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
620 }
621 variance_up[iidx-w_size] /= (w_size-1);
622 variance_down[iidx-w_size] /= (w_size-1);
623
624 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
625 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
626 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
627 // return iidx;
628 }
629
630 int bidx;
631 int bestIdx = -1;
632 double min = 1000.0;
633 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
634 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
635 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
636 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
637 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
638 bestIdx = bidx;
639 }
640 }
641 }
642
643 free(mean_up);
644 free(variance_up);
645 free(mean_down);
646 free(variance_down);
647
648 return bestIdx;
649 }
650
651 void remove_ambiguities(char * old_seq, char* new_seq) {
652 int idx=0;
653 int new_idx = 0;
654 int old_seq_size = strlen(old_seq);
655
656 while(idx<old_seq_size) {
657 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
658 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
659 idx++;
660 continue;
661 }
662
663 if (old_seq[idx] == '[') {
664 idx += 2;
665 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
666 continue;
667 }
668
669 new_seq[new_idx] = old_seq[idx];
670 idx++;
671 new_idx++;
672 //printf("%s\t%s\n",old_seq,new_seq);
673 //printf("%d\t%d\n",idx,new_idx);
674 }
675
676 if (new_idx != read_size) {
677 printf("Error: Sequence is not of length 36!\n");
678 printf("old seq: %s\n",old_seq);
679 printf("new seq: %s\n",new_seq);
680 exit(EXIT_FAILURE);
681 }
682
683 new_seq[read_size] = '\0';
684 }
685
686 void print_read(Read* cRead) {
687
688 printf(line_format,
689 cRead->chr, cRead->pos, cRead->seq, cRead->id,
690 cRead->strand, cRead->mismatch, cRead->occurrence,
691 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
692 cRead->chastity);
693 }
694
695 /*
696 * TODO:
697 * - Check strand -> done simple (only if equal)
698 * - check for [AC] and similar entries -> done simple (see function
699 * - remove_ambiguities (exchanges [XY] by the second entry)
700 */