+ added read structure for new parser
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "datastructures.h"
17
18 #define _FILE_OFFSET_BITS == 64
19
20 int compare_gene_struct(struct gene* a, struct gene* b) {
21 return a->stop - b->start;
22 }
23
24 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
25
26 void sort_genes(struct gene*** allGenes, int numGenes);
27
28 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
29
30 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
31
32 //int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
33 int fitting(char* up_prb, char* down_prb);
34
35 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
36
37 static char *info = "Usage is:\n./filterReads gff reads output";
38
39 /*
40 * Some constants specifying the exact behavior of the filter
41 *
42 */
43
44 const int read_size = 36;
45
46 const int min_overlap = 6;
47 const int max_overlap = 30;
48
49
50 int combined_reads = 0;
51 int main(int argc, char* argv[]) {
52
53 if(argc != 4) {
54 printf("%s\n",info);
55 exit(EXIT_FAILURE);
56 }
57
58 int status;
59 int filenameSize = 256;
60 char* gff_filename = malloc(sizeof(char)*filenameSize);
61 char* reads_filename = malloc(sizeof(char)*filenameSize);
62 char* output_filename = malloc(sizeof(char)*filenameSize);
63
64 strncpy(gff_filename,argv[1],filenameSize);
65 strncpy(reads_filename,argv[2],filenameSize);
66 strncpy(output_filename,argv[3],filenameSize);
67
68 FILE *gff_fs = fopen(gff_filename,"r");
69 FILE *reads_fs = fopen(reads_filename,"r");
70 FILE *out_fs = fopen(output_filename,"w");
71
72 if(gff_fs == NULL) {
73 printf("Error: Could not open file: %s",gff_filename);
74 exit(EXIT_FAILURE);
75 }
76
77 if(reads_fs == NULL) {
78 printf("Error: Could not open file: %s",reads_filename);
79 exit(EXIT_FAILURE);
80 }
81
82 if(out_fs == NULL) {
83 printf("Error: Could not open file: %s",output_filename);
84 exit(EXIT_FAILURE);
85 }
86
87 struct gene** allGenes;
88 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
89 status = fclose(gff_fs);
90 free(gff_filename);
91 if(status != 0)
92 printf("closing of gff filestream failed!\n");
93
94 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
95
96 // some entries in the gff files are not in a sorted order
97 //printf("Sorting genes...\n");
98 //sort_genes(&allGenes,numGenes);
99 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
100 ///printf("Genes were sorted!\n");
101
102 int gidx, eidx;
103 int exon_cov = 0;
104 for(gidx=0;gidx<numGenes;gidx++) {
105 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
106 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
107 }}
108 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
109
110 process_reads(reads_fs,&allGenes,numGenes,out_fs);
111
112 status = fclose(reads_fs);
113 status = fclose(out_fs);
114 if(status != 0)
115 perror("fclose");
116
117 free(reads_filename);
118 free(output_filename);
119 return 0;
120 }
121
122 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
123 int status;
124
125 int buffer_size= 64;
126 int chr = 0;
127 int pos = 0;
128 char* seq = malloc(sizeof(char)*buffer_size);
129 unsigned long id = 0;
130 char strand = ' ';
131 int mismatch = 0;
132 int occurrence = 0;
133 int size = 0;
134 int cut = 0;
135 char* prb = malloc(sizeof(char)*buffer_size);
136 char* cal_prb = malloc(sizeof(char)*buffer_size);
137 char* chastity = malloc(sizeof(char)*buffer_size);
138
139 int reads_fid = fileno(reads_fs);
140 struct stat reads_stat;
141 if ( fstat(reads_fid,&reads_stat) == -1) {
142 perror("fstat");
143 exit(EXIT_FAILURE);
144 }
145 off_t reads_filesize = reads_stat.st_size;
146 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
147 int numReads = reads_filesize / 178.0;
148
149 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
150 if (reads_area == MAP_FAILED) {
151 perror("mmap");
152 exit(EXIT_FAILURE);
153 }
154 close(reads_fid);
155 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
156
157 void* linePtr = reads_area;
158 char* current_line = malloc(sizeof(char)*256);
159
160 int SIZE = 500;
161 // initialize boundary arrays
162 void** upstream_end = malloc(sizeof(void*)*SIZE);
163 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
164 void** downstream_start = malloc(sizeof(void*)*SIZE);
165 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
166 int ue,uo,ds,dov;
167 ue = uo = ds = dov = 0;
168
169 int skippedLinesCounter = 0;
170
171 int prev_exon_start = -1;
172 int prev_exon_stop = -1;
173 int cur_exon_start = -1;
174
175 current_line = strncpy(current_line,linePtr,256);
176 int gene_idx = 0;
177 int exon_idx = 1;
178 struct gene* currentGene = (*allGenes)[gene_idx];
179 char* gene_id = currentGene->id;
180
181 //char* disamb_seq = malloc(sizeof(char)*read_size);
182
183 int skippedReadCtr = 0;
184 int uselessReadCtr = 0;
185 int exonicReadCtr = 0;
186 int endPrevCtr = 0;
187 int prevOverlapCtr = 0;
188 int currentStartCtr = 0;
189 int currentOverlapCtr = 0;
190 int multioccurReadCtr = 0;
191
192 int readCtr = 0;
193 // start of the parsing loop
194 while(1) {
195 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
196 break;
197
198 gene_id = currentGene->id;
199
200 if (readCtr != 0 && readCtr % 1000000 == 0)
201 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
202
203 //if (gene_idx >= 1833)
204 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
205
206 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
207 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
208 if (status < 12) {
209 skippedLinesCounter++;
210 goto next_read;
211 }
212
213 // if the read is occurring several times elsewhere then get rid of it
214 //if(!(occurrence >= 1 && occurrence <= 25)) {
215 if ( occurrence != 1 ) {
216 multioccurReadCtr++;
217 goto next_read;
218 }
219
220 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
221
222 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
223 gene_idx++;
224 exon_idx = 1;
225 currentGene = (*allGenes)[gene_idx];
226 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
227 ue = uo = ds = dov = 0;
228 continue;
229 }
230
231 if ( pos < currentGene->start ) { // go to next read
232 skippedReadCtr++;
233
234 next_read:
235
236 while (*(char*)linePtr != '\n') linePtr++;
237 linePtr++;
238 readCtr += 1;
239 current_line = strncpy(current_line,linePtr,256);
240 continue;
241 }
242
243 } else { // read IS within gene borders
244
245 exon_label:
246
247 if (exon_idx == currentGene->num_exons) {
248 gene_idx++;
249 exon_idx = 1;
250 currentGene = (*allGenes)[gene_idx];
251 continue;
252 }
253
254 prev_exon_start = currentGene->exon_starts[exon_idx-1];
255 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
256 cur_exon_start = currentGene->exon_starts[exon_idx];
257
258 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
259
260 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
261 if (ue != 0 && dov != 0)
262 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
263
264 if (uo != 0 && ds != 0)
265 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
266
267 exon_idx++;
268 ue = uo = ds = dov = 0;
269 goto exon_label;
270 }
271
272 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
273 exonicReadCtr++;
274
275 // Removed exonic reads
276
277 //remove_ambiguities(seq,strlen(seq),disamb_seq);
278 //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
279
280 goto next_read;
281 }
282
283 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
284 goto next_read;
285
286 // if this position is reached the read is somehow overlapping or
287 // exactly on the exon boundary. now determine where exactly:
288 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
289 endPrevCtr++;
290 upstream_end[ue] = linePtr;
291 ue++;
292 goto next_read;
293 }
294
295 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap) { // read overlaps with previous exon boundary
296 prevOverlapCtr++;
297 upstream_overlap[uo] = linePtr;
298 uo++;
299 goto next_read;
300 }
301
302 if ( pos == cur_exon_start ) { // read starts at current exon start
303 currentStartCtr++;
304 downstream_start[ds] = linePtr;
305 ds++;
306 goto next_read;
307 }
308
309 if ( (cur_exon_start - pos) >= min_overlap && (cur_exon_start - pos) <= max_overlap) { // read overlaps with current exon boundary
310 currentOverlapCtr++;
311 downstream_overlap[dov] = linePtr;
312 dov++;
313 goto next_read;
314 }
315
316 uselessReadCtr++;
317 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
318 }
319 }
320
321 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
322 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
323
324 free(upstream_end);
325 free(upstream_overlap);
326 free(downstream_start);
327 free(downstream_overlap);
328
329 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
330 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
331 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
332 printf("%d reads were totally exonic\n",exonicReadCtr);
333 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
334 printf("%d reads where newly combined from two original reads\n",combined_reads);
335 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
336
337 status = munmap(reads_area,reads_filesize);
338 if(status != 0)
339 perror("munmap");
340
341 //free(current_line);
342 free(seq);
343 free(prb);
344 free(cal_prb);
345 free(chastity);
346 }
347
348 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
349 //printf("up/down size is %d/%d\n",up_size,down_size);
350
351 int up_idx, down_idx, status;
352 char* upstream_line = malloc(sizeof(char)*256);
353 char* downstream_line = malloc(sizeof(char)*256);
354 int p_start, p_stop;
355
356 int buffer_size= 64;
357
358 int up_chr = 0;
359 int up_pos = 0;
360 char* up_seq = malloc(sizeof(char)*buffer_size);
361 int up_id = 0;
362 char up_strand = ' ';
363 int up_mismatch = 0;
364 int up_occurrence = 0;
365 int up_sz = 0;
366 int up_cut = 0;
367 char* up_prb = malloc(sizeof(char)*buffer_size);
368 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
369 char* up_chastity = malloc(sizeof(char)*buffer_size);
370
371 int down_chr = 0;
372 int down_pos = 0;
373 char* down_seq = malloc(sizeof(char)*buffer_size);
374 int down_id = 0;
375 char down_strand = ' ';
376 int down_mismatch = 0;
377 int down_occurrence = 0;
378 int down_sz = 0;
379 int down_cut = 0;
380 char* down_prb = malloc(sizeof(char)*buffer_size);
381 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
382 char* down_chastity = malloc(sizeof(char)*buffer_size);
383
384 int new_chr = 0;
385 char* new_seq = malloc(sizeof(char)*buffer_size);
386 char new_strand = ' ';
387 char* new_prb = malloc(sizeof(char)*buffer_size);
388 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
389 char* new_chastity = malloc(sizeof(char)*buffer_size);
390 char* new_up_seq = malloc(sizeof(char)*read_size);
391 char* new_down_seq = malloc(sizeof(char)*read_size);
392
393 int overlap;
394 int fit;
395
396 char* used_flag = calloc(down_size,sizeof(char));
397
398 for(up_idx=0;up_idx<up_size;up_idx++) {
399 strncpy(upstream_line,upstream[up_idx],256);
400 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
401 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
402 &up_cut,up_prb,up_cal_prb,up_chastity);
403
404 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
405
406 overlap = exon_stop - up_pos;
407
408 for(down_idx=0;down_idx<down_size;down_idx++) {
409 if( used_flag[down_idx] == 1)
410 continue;
411
412 strncpy(downstream_line,downstream[down_idx],256);
413 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
414 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
415 &down_cut,down_prb,down_cal_prb,down_chastity);
416
417 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
418
419 if(up_strand != down_strand)
420 continue;
421
422 new_seq[0] = '\0';
423 new_prb[0] = '\0';
424 new_cal_prb[0] = '\0';
425 new_chastity[0] = '\0';
426 int splitpos = 0;
427
428 fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
429 if (fit == -1)
430 continue;
431
432 if (! (fit < overlap ))
433 continue;
434
435 new_chr = up_chr;
436 new_strand = up_strand;
437 splitpos = overlap;
438
439 p_start = up_pos;
440 p_stop = exon_start+(read_size-overlap);
441
442 strncat(new_seq,new_up_seq,overlap);
443 strncat(new_prb,up_prb,overlap);
444 strncat(new_cal_prb,up_cal_prb,overlap);
445 strncat(new_chastity,up_chastity,overlap);
446
447 //strncat(new_seq,new_down_seq+fit,read_size-overlap);
448 //strncat(new_prb,down_prb+fit,read_size-overlap);
449 //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
450 //strncat(new_chastity,down_chastity+fit,read_size-overlap);
451
452 int offset = exon_start-down_pos;
453 strncat(new_seq,new_down_seq+offset,read_size-overlap);
454 strncat(new_prb,down_prb+offset,read_size-overlap);
455 strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
456 strncat(new_chastity,down_chastity+offset,read_size-overlap);
457
458 // The four last integers specify the positions of
459 // p_start : the position in the dna where the first read starts
460 // exons_stop : the position in the dna where the first exons ends
461 // exons_start : the position in the dna where the second exons starts
462 // p_stop : the position in the dna where the (truncated) second read ends
463 fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
464 new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
465
466 combined_reads++;
467 used_flag[down_idx] = 1;
468 }
469 }
470
471 free(upstream_line);
472 free(downstream_line);
473 free(used_flag);
474
475 free(new_up_seq);
476 free(new_down_seq);
477
478 free(up_seq);
479 free(up_prb);
480 free(up_cal_prb);
481 free(up_chastity);
482
483 free(down_seq);
484 free(down_prb);
485 free(down_cal_prb);
486 free(down_chastity);
487
488 free(new_seq);
489 free(new_prb);
490 free(new_cal_prb);
491 free(new_chastity);
492 }
493
494 /*
495 int fitting(char* up_prb, char* down_prb) {
496 int w_size = 3;
497
498 double current_mean_up = 0;
499 double current_mean_down = 0;
500
501 int uidx;
502 for(uidx=0;uidx<w_size;uidx++) {
503 current_mean_up += up_prb[uidx]-50;
504 current_mean_down += down_prb[uidx]-50;
505 }
506
507 current_mean_up /= w_size;
508 current_mean_down /= w_size;
509
510 double min_val = fmin(current_mean_up,current_mean_down);
511 double max_val = fmax(current_mean_up,current_mean_down);
512 if ( (min_val / max_val) <= 0.33 )
513 return 1;
514
515 return -1;
516 }
517 */
518
519 int fitting(char* up_prb, char* down_prb) {
520 double epsilon_mean = 30.0;
521 double epsilon_var = 30.0;
522 int w_size = 6;
523
524 double current_mean_up = 0;
525 double current_variance_up = 0;
526 double current_mean_down = 0;
527 double current_variance_down = 0;
528
529 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
530 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
531 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
532 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
533
534 int iidx = -1;
535 int uidx;
536 int didx;
537
538 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
539 didx = uidx+w_size;
540 current_mean_up += up_prb[uidx]-50;
541 current_mean_down += up_prb[didx]-50;
542 }
543
544 current_mean_up /= w_size;
545 current_mean_down /= w_size;
546
547 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
548 didx = uidx+w_size;
549 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
550 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
551 }
552
553 current_variance_up /= (w_size-1);
554 current_variance_down /= (w_size-1);
555
556 for(iidx=w_size;iidx<30;iidx++) {
557 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
558 didx = uidx+w_size;
559 mean_up[iidx-w_size] += down_prb[uidx]-50;
560 mean_down[iidx-w_size] += down_prb[didx]-50;
561
562 }
563 mean_up[iidx-w_size] /= w_size;
564 mean_down[iidx-w_size] /= w_size;
565
566 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
567 didx = uidx+w_size;
568 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
569 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
570 }
571 variance_up[iidx-w_size] /= (w_size-1);
572 variance_down[iidx-w_size] /= (w_size-1);
573
574 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
575 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
576 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
577 // return iidx;
578 }
579
580 int bidx;
581 int bestIdx = -1;
582 double min = 1000.0;
583 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
584 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
585 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
586 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
587 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
588 bestIdx = bidx;
589 }
590 }
591 }
592
593 free(mean_up);
594 free(variance_up);
595 free(mean_down);
596 free(variance_down);
597
598 return bestIdx;
599 }
600
601 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
602 int idx=0;
603 int new_idx = 0;
604
605 while(idx<old_seq_size) {
606 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
607 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
608 idx++;
609 continue;
610 }
611
612 if (old_seq[idx] == '[') {
613 idx += 2;
614 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
615 continue;
616 }
617
618 new_seq[new_idx] = old_seq[idx];
619 idx++;
620 new_idx++;
621 }
622
623 if (new_idx != 36) {
624 printf("Error: Sequence is not of length 36!\n");
625 printf("old seq: %s\n",old_seq);
626 printf("new seq: %s\n",new_seq);
627 exit(EXIT_FAILURE);
628 }
629 }
630
631 /*
632 * TODO:
633 * - Check strand -> done simple (only if equal)
634 * - check for [AC] and similar entries -> done simple (see function
635 * - remove_ambiguities (exchanges [XY] by the second entry)
636 */