+ some fixes of the filterReads and parser functions
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26
27 void sort_genes(struct gene*** allGenes, int numGenes);
28
29 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
30
31 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
32
33 int fitting(char* up_prb, char* down_prb);
34
35 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
36
37 static char *info = "Usage is:\n./filterReads gff reads output";
38
39 /*
40 * Some constants specifying the exact behavior of the filter
41 *
42 */
43
44 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
45
46 const int read_size = 36;
47
48 const int min_overlap = 6;
49 const int max_overlap = 30;
50
51 int read_nr = 1;
52
53 int combined_reads = 0;
54 int main(int argc, char* argv[]) {
55
56 if(argc != 5) {
57 printf("%s\n",info);
58 exit(EXIT_FAILURE);
59 }
60
61 int status;
62 int filenameSize = 256;
63 char* gff_filename = malloc(sizeof(char)*filenameSize);
64 char* reads_filename = malloc(sizeof(char)*filenameSize);
65 char* output_filename = malloc(sizeof(char)*filenameSize);
66
67
68 strncpy(gff_filename,argv[1],filenameSize);
69 strncpy(reads_filename,argv[2],filenameSize);
70 strncpy(output_filename,argv[3],filenameSize);
71
72 FILE *gff_fs = fopen(gff_filename,"r");
73 FILE *reads_fs = fopen(reads_filename,"r");
74 FILE *out_fs = fopen(output_filename,"w");
75
76 read_nr = atoi(argv[4]);
77 read_nr++;
78
79 if(gff_fs == NULL) {
80 printf("Error: Could not open file: %s",gff_filename);
81 exit(EXIT_FAILURE);
82 }
83
84 if(reads_fs == NULL) {
85 printf("Error: Could not open file: %s",reads_filename);
86 exit(EXIT_FAILURE);
87 }
88
89 if(out_fs == NULL) {
90 printf("Error: Could not open file: %s",output_filename);
91 exit(EXIT_FAILURE);
92 }
93
94 struct gene** allGenes;
95 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
96 status = fclose(gff_fs);
97 free(gff_filename);
98 if(status != 0)
99 printf("closing of gff filestream failed!\n");
100
101 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
102
103 // check if allGenes is sorted. if not throw away those genes that do not
104 // occur in the sorted order
105 int g_idx;
106 struct gene* currentGene = 0;
107 int nulled_genes=0;
108 int old_gene_stop = -1;
109 for(g_idx=0;g_idx<numGenes;g_idx++) {
110 currentGene = allGenes[g_idx];
111
112 if (! (currentGene->start < currentGene->stop))
113 printf("Invalid positions for gene %s!\n",currentGene->id);
114
115 if (! (old_gene_stop < currentGene->start ) ) {
116 old_gene_stop = currentGene->stop;
117 allGenes[g_idx] = 0;
118 nulled_genes++;
119 continue;
120 }
121 old_gene_stop = currentGene->stop;
122 }
123
124 printf("Found %d unordered genes.\n",nulled_genes);
125 int gidx, eidx;
126 int exon_cov = 0;
127 for(gidx=0;gidx<numGenes;gidx++) {
128 if (allGenes[gidx] == 0)
129 continue;
130
131 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
132 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
133 }}
134 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
135
136 process_reads(reads_fs,&allGenes,numGenes,out_fs);
137
138 for(g_idx=0;g_idx<numGenes;g_idx++) {
139 if(allGenes[g_idx] != 0)
140 free_gene(allGenes[g_idx]);
141 free(allGenes[g_idx]);
142
143 }
144 free(allGenes);
145
146 status = fclose(reads_fs);
147 status = fclose(out_fs);
148 if(status != 0)
149 perror("fclose");
150
151 free(reads_filename);
152 free(output_filename);
153 return 0;
154 }
155
156 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
157 int status;
158
159 int buffer_size= 64;
160 int chr = 0;
161 int pos = 0;
162 char* seq = malloc(sizeof(char)*buffer_size);
163 unsigned long id = 0;
164 char strand = ' ';
165 int mismatch = 0;
166 int occurrence = 0;
167 int size = 0;
168 int cut = 0;
169 char* prb = malloc(sizeof(char)*buffer_size);
170 char* cal_prb = malloc(sizeof(char)*buffer_size);
171 char* chastity = malloc(sizeof(char)*buffer_size);
172
173 int reads_fid = fileno(reads_fs);
174 struct stat reads_stat;
175 if ( fstat(reads_fid,&reads_stat) == -1) {
176 perror("fstat");
177 exit(EXIT_FAILURE);
178 }
179 off_t reads_filesize = reads_stat.st_size;
180 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
181 int numReads = reads_filesize / 178.0;
182
183 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
184 if (reads_area == MAP_FAILED) {
185 perror("mmap");
186 exit(EXIT_FAILURE);
187 }
188 close(reads_fid);
189 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
190
191 char* lineBeginPtr = (char*) reads_area;
192 char* lineEndPtr = (char*) reads_area;
193 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
194
195 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
196 lineEndPtr++;
197
198 char* current_line = malloc(sizeof(char)*512);
199 memset(current_line,0,512);
200
201 int SIZE = 500;
202 // initialize boundary arrays
203 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
204 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
205 int uov,dov;
206 uov = dov = 0;
207
208 int skippedLinesCounter = 0;
209
210 int prev_exon_start = -1;
211 int prev_exon_stop = -1;
212 int cur_exon_start = -1;
213
214 unsigned long line_size = lineEndPtr - lineBeginPtr;
215 //printf("distance is %lu\n",line_size);
216 strncpy(current_line,lineBeginPtr,line_size);
217 current_line[line_size] = '\0';
218 //printf("%s test",current_line);
219
220 int gene_idx = 0;
221 int exon_idx = 1;
222 struct gene* currentGene = (*allGenes)[gene_idx];
223 char* gene_id = currentGene->id;
224
225 int skippedReadCtr = 0;
226 int uselessReadCtr = 0;
227 int exonicReadCtr = 0;
228 int endPrevCtr = 0;
229 int prevOverlapCtr = 0;
230 int currentStartCtr = 0;
231 int currentOverlapCtr = 0;
232 int multioccurReadCtr = 0;
233
234 int readCtr = 0;
235 // start of the parsing loop
236 while(1) {
237 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
238 break;
239
240 gene_id = currentGene->id;
241
242 if (readCtr != 0 && readCtr % 1000000 == 0)
243 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
244
245 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
246 if (status < 12) {
247 skippedLinesCounter++;
248 goto next_read;
249 }
250
251 // if the read is occurring several times elsewhere then get rid of it
252 if ( occurrence != 1 ) {
253 multioccurReadCtr++;
254 goto next_read;
255 }
256
257 FA(currentGene != 0);
258
259 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
260
261 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
262
263 if (uov != 0 && dov != 0)
264 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
265 uov = dov = 0;
266
267 gene_idx++;
268 exon_idx = 1;
269 currentGene = (*allGenes)[gene_idx];
270 while( currentGene == 0 && gene_idx < numGenes) {
271 currentGene = (*allGenes)[gene_idx];
272 gene_idx++;
273 }
274 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
275 continue;
276 }
277
278 if ( pos < currentGene->start ) { // go to next read
279 skippedReadCtr++;
280
281 next_read:
282
283 lineBeginPtr = lineEndPtr;
284 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
285 lineEndPtr++;
286 readCtr += 1;
287 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
288 current_line[lineEndPtr-lineBeginPtr] = '\0';
289 continue;
290 }
291
292 } else { // read IS within gene borders
293
294 exon_label:
295
296 if (exon_idx == currentGene->num_exons) {
297 gene_idx++;
298 exon_idx = 1;
299 currentGene = (*allGenes)[gene_idx];
300 while( currentGene == 0 && gene_idx < numGenes) {
301 currentGene = (*allGenes)[gene_idx];
302 gene_idx++;
303 }
304 continue;
305 }
306
307 prev_exon_start = currentGene->exon_starts[exon_idx-1];
308 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
309 cur_exon_start = currentGene->exon_starts[exon_idx];
310
311 //printf("exon: %d %d %d\t pos: %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
312
313 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
314
315 if (uov != 0 && dov != 0)
316 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
317 uov = dov = 0;
318
319 exon_idx++;
320 goto exon_label;
321 }
322
323 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
324 exonicReadCtr++;
325 goto next_read;
326 }
327
328 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
329 goto next_read;
330
331 // if this position is reached the read is somehow overlapping or
332 // exactly on the exon boundary.
333 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
334 //printf("%s\n",current_line);
335 prevOverlapCtr++;
336 upstream_overlap[uov] = lineBeginPtr;
337 uov++;
338 goto next_read;
339 }
340
341 int end_pos = pos+read_size-1;
342 if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
343 //printf("%s\n",current_line);
344 currentOverlapCtr++;
345 downstream_overlap[dov] = lineBeginPtr;
346 dov++;
347 goto next_read;
348 }
349
350 uselessReadCtr++;
351 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
352 }
353 }
354
355 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
356
357 free(upstream_overlap);
358 free(downstream_overlap);
359
360 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
361 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
362 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
363 printf("%d reads were totally exonic\n",exonicReadCtr);
364 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
365 printf("%d reads where newly combined from two original reads\n",combined_reads);
366 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
367
368 status = munmap(reads_area,reads_filesize);
369 if(status != 0)
370 perror("munmap");
371
372 free(current_line);
373 free(seq);
374 free(prb);
375 free(cal_prb);
376 free(chastity);
377 }
378
379 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
380
381 printf("up/down size is %d/%d\n",up_size,down_size);
382
383 int up_idx, down_idx, status;
384 char* upstream_line = malloc(sizeof(char)*512);
385 char* downstream_line = malloc(sizeof(char)*512);
386 int p_start, p_stop;
387
388 int buffer_size= 64;
389
390 int up_chr = 0;
391 int up_pos = 0;
392 char* up_seq = malloc(sizeof(char)*buffer_size);
393 unsigned long up_id = 0;
394 char up_strand = ' ';
395 int up_mismatch = 0;
396 int up_occurrence = 0;
397 int up_sz = 0;
398 int up_cut = 0;
399 char* up_prb = malloc(sizeof(char)*buffer_size);
400 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
401 char* up_chastity = malloc(sizeof(char)*buffer_size);
402
403 int down_chr = 0;
404 int down_pos = 0;
405 char* down_seq = malloc(sizeof(char)*buffer_size);
406 unsigned long down_id = 0;
407 char down_strand = ' ';
408 int down_mismatch = 0;
409 int down_occurrence = 0;
410 int down_sz = 0;
411 int down_cut = 0;
412 char* down_prb = malloc(sizeof(char)*buffer_size);
413 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
414 char* down_chastity = malloc(sizeof(char)*buffer_size);
415
416 int new_chr = 0;
417 char* new_seq = malloc(sizeof(char)*buffer_size);
418 char new_strand = ' ';
419 char* new_prb = malloc(sizeof(char)*buffer_size);
420 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
421 char* new_chastity = malloc(sizeof(char)*buffer_size);
422 char* new_up_seq = malloc(sizeof(char)*read_size);
423 char* new_down_seq = malloc(sizeof(char)*read_size);
424
425 int overlap;
426 int fit;
427
428 char* lineBeginPtr;
429 char* lineEndPtr;
430
431 char* up_used_flag = calloc(up_size,sizeof(char));
432 char* down_used_flag = calloc(down_size,sizeof(char));
433
434 for(up_idx=0;up_idx<up_size;up_idx++) {
435 if( up_used_flag[up_idx] == 1)
436 continue;
437
438 lineBeginPtr = upstream[up_idx];
439 lineEndPtr = lineBeginPtr;
440 while (*(char*)lineEndPtr != '\n') lineEndPtr++;
441 lineEndPtr++;
442 strncpy(upstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
443 upstream_line[lineEndPtr-lineBeginPtr] = '\0';
444
445 status = sscanf(upstream_line,line_format,&up_chr,&up_pos,up_seq,&up_id,
446 &up_strand,&up_mismatch,&up_occurrence,&up_sz,&up_cut,up_prb,up_cal_prb,up_chastity);
447
448 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
449
450 overlap = exon_stop - up_pos;
451
452 for(down_idx=0;down_idx<down_size;down_idx++) {
453 if( down_used_flag[down_idx] == 1)
454 continue;
455
456 lineBeginPtr = downstream[down_idx];
457 lineEndPtr = lineBeginPtr;
458 while (*(char*)lineEndPtr != '\n') lineEndPtr++;
459 lineEndPtr++;
460 strncpy(downstream_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
461 downstream_line[lineEndPtr-lineBeginPtr] = '\0';
462
463 status = sscanf(downstream_line,line_format,
464 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
465 &down_cut,down_prb,down_cal_prb,down_chastity);
466
467 if(up_strand != down_strand)
468 continue;
469
470 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
471
472 printf(line_format,
473 up_chr,up_pos,up_seq,up_id,up_strand,up_mismatch,up_occurrence,up_sz,
474 up_cut,up_prb,up_cal_prb,up_chastity);
475
476 printf(line_format,
477 down_chr,down_pos,down_seq,down_id,down_strand,down_mismatch,down_occurrence,down_sz,
478 down_cut,down_prb,down_cal_prb,down_chastity);
479
480 //new_seq[0] = '\0';
481 //new_prb[0] = '\0';
482 //new_cal_prb[0] = '\0';
483 //new_chastity[0] = '\0';
484 //int splitpos = 0;
485
486 //
487 //fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
488 //if (fit == -1)
489 // continue;
490
491 //if (! (fit < overlap ))
492 // continue;
493
494 //new_chr = up_chr;
495 //new_strand = up_strand;
496 //splitpos = overlap;
497
498 //p_start = up_pos;
499 //p_stop = exon_start+(read_size-overlap);
500
501 //strncat(new_seq,new_up_seq,overlap);
502 //strncat(new_prb,up_prb,overlap);
503 //strncat(new_cal_prb,up_cal_prb,overlap);
504 //strncat(new_chastity,up_chastity,overlap);
505
506 ////strncat(new_seq,new_down_seq+fit,read_size-overlap);
507 ////strncat(new_prb,down_prb+fit,read_size-overlap);
508 ////strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
509 ////strncat(new_chastity,down_chastity+fit,read_size-overlap);
510 //
511 //int offset = exon_start-down_pos;
512 //strncat(new_seq,new_down_seq+offset,read_size-overlap);
513 //strncat(new_prb,down_prb+offset,read_size-overlap);
514 //strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
515 //strncat(new_chastity,down_chastity+offset,read_size-overlap);
516
517 //// The four last integers specify the positions of
518 //// p_start : the position in the dna where the first read starts
519 //// exons_stop : the position in the dna where the first exons ends
520 //// exons_start : the position in the dna where the second exons starts
521 //// p_stop : the position in the dna where the (truncated) second read ends
522 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
523 //read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
524 //read_nr++;
525
526 //combined_reads++;
527 //used_flag[down_idx] = 1;
528 }
529 }
530
531 free(upstream_line);
532 free(downstream_line);
533
534 free(up_used_flag);
535 free(down_used_flag);
536
537 free(new_up_seq);
538 free(new_down_seq);
539
540 free(up_seq);
541 free(up_prb);
542 free(up_cal_prb);
543 free(up_chastity);
544
545 free(down_seq);
546 free(down_prb);
547 free(down_cal_prb);
548 free(down_chastity);
549
550 free(new_seq);
551 free(new_prb);
552 free(new_cal_prb);
553 free(new_chastity);
554 }
555
556 /*
557 int fitting(char* up_prb, char* down_prb) {
558 int w_size = 3;
559
560 double current_mean_up = 0;
561 double current_mean_down = 0;
562
563 int uidx;
564 for(uidx=0;uidx<w_size;uidx++) {
565 current_mean_up += up_prb[uidx]-50;
566 current_mean_down += down_prb[uidx]-50;
567 }
568
569 current_mean_up /= w_size;
570 current_mean_down /= w_size;
571
572 double min_val = fmin(current_mean_up,current_mean_down);
573 double max_val = fmax(current_mean_up,current_mean_down);
574 if ( (min_val / max_val) <= 0.33 )
575 return 1;
576
577 return -1;
578 }
579 */
580
581 int fitting(char* up_prb, char* down_prb) {
582 double epsilon_mean = 30.0;
583 double epsilon_var = 30.0;
584 int w_size = 6;
585
586 double current_mean_up = 0;
587 double current_variance_up = 0;
588 double current_mean_down = 0;
589 double current_variance_down = 0;
590
591 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
592 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
593 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
594 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
595
596 int iidx = -1;
597 int uidx;
598 int didx;
599
600 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
601 didx = uidx+w_size;
602 current_mean_up += up_prb[uidx]-50;
603 current_mean_down += up_prb[didx]-50;
604 }
605
606 current_mean_up /= w_size;
607 current_mean_down /= w_size;
608
609 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
610 didx = uidx+w_size;
611 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
612 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
613 }
614
615 current_variance_up /= (w_size-1);
616 current_variance_down /= (w_size-1);
617
618 for(iidx=w_size;iidx<30;iidx++) {
619 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
620 didx = uidx+w_size;
621 mean_up[iidx-w_size] += down_prb[uidx]-50;
622 mean_down[iidx-w_size] += down_prb[didx]-50;
623
624 }
625 mean_up[iidx-w_size] /= w_size;
626 mean_down[iidx-w_size] /= w_size;
627
628 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
629 didx = uidx+w_size;
630 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
631 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
632 }
633 variance_up[iidx-w_size] /= (w_size-1);
634 variance_down[iidx-w_size] /= (w_size-1);
635
636 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
637 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
638 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
639 // return iidx;
640 }
641
642 int bidx;
643 int bestIdx = -1;
644 double min = 1000.0;
645 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
646 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
647 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
648 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
649 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
650 bestIdx = bidx;
651 }
652 }
653 }
654
655 free(mean_up);
656 free(variance_up);
657 free(mean_down);
658 free(variance_down);
659
660 return bestIdx;
661 }
662
663 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
664 int idx=0;
665 int new_idx = 0;
666
667 while(idx<old_seq_size) {
668 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
669 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
670 idx++;
671 continue;
672 }
673
674 if (old_seq[idx] == '[') {
675 idx += 2;
676 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
677 continue;
678 }
679
680 new_seq[new_idx] = old_seq[idx];
681 idx++;
682 new_idx++;
683 }
684
685 if (new_idx != 36) {
686 printf("Error: Sequence is not of length 36!\n");
687 printf("old seq: %s\n",old_seq);
688 printf("new seq: %s\n",new_seq);
689 exit(EXIT_FAILURE);
690 }
691 }
692
693 /*
694 * TODO:
695 * - Check strand -> done simple (only if equal)
696 * - check for [AC] and similar entries -> done simple (see function
697 * - remove_ambiguities (exchanges [XY] by the second entry)
698 */