+ rearranged code
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "datastructures.h"
17
18 #define _FILE_OFFSET_BITS == 64
19
20 int compare_gene_struct(struct gene* a, struct gene* b) {
21 return a->stop - b->start;
22 }
23
24 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
25
26 void sort_genes(struct gene*** allGenes, int numGenes);
27
28 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
29
30 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
31
32 //int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
33 int fitting(char* up_prb, char* down_prb);
34
35 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
36
37 static char *info = "Usage is:\n./filterReads gff reads output";
38
39 /*
40 * Some constants specifying the exact behavior of the filter
41 *
42 */
43
44 const int read_size = 36;
45
46 const int min_overlap = 6;
47 const int max_overlap = 30;
48
49 int read_nr = 1;
50
51 int combined_reads = 0;
52 int main(int argc, char* argv[]) {
53
54 if(argc != 5) {
55 printf("%s\n",info);
56 exit(EXIT_FAILURE);
57 }
58
59 int status;
60 int filenameSize = 256;
61 char* gff_filename = malloc(sizeof(char)*filenameSize);
62 char* reads_filename = malloc(sizeof(char)*filenameSize);
63 char* output_filename = malloc(sizeof(char)*filenameSize);
64
65 strncpy(gff_filename,argv[1],filenameSize);
66 strncpy(reads_filename,argv[2],filenameSize);
67 strncpy(output_filename,argv[3],filenameSize);
68
69 FILE *gff_fs = fopen(gff_filename,"r");
70 FILE *reads_fs = fopen(reads_filename,"r");
71 FILE *out_fs = fopen(output_filename,"w");
72
73 read_nr = atoi(argv[4]);
74 read_nr++;
75
76 if(gff_fs == NULL) {
77 printf("Error: Could not open file: %s",gff_filename);
78 exit(EXIT_FAILURE);
79 }
80
81 if(reads_fs == NULL) {
82 printf("Error: Could not open file: %s",reads_filename);
83 exit(EXIT_FAILURE);
84 }
85
86 if(out_fs == NULL) {
87 printf("Error: Could not open file: %s",output_filename);
88 exit(EXIT_FAILURE);
89 }
90
91 struct gene** allGenes;
92 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
93 status = fclose(gff_fs);
94 free(gff_filename);
95 if(status != 0)
96 printf("closing of gff filestream failed!\n");
97
98 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
99
100 // some entries in the gff files are not in a sorted order
101 //printf("Sorting genes...\n");
102 //sort_genes(&allGenes,numGenes);
103 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
104 ///printf("Genes were sorted!\n");
105
106 int gidx, eidx;
107 int exon_cov = 0;
108 for(gidx=0;gidx<numGenes;gidx++) {
109 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
110 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
111 }}
112 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
113
114 process_reads(reads_fs,&allGenes,numGenes,out_fs);
115
116 status = fclose(reads_fs);
117 status = fclose(out_fs);
118 if(status != 0)
119 perror("fclose");
120
121 free(reads_filename);
122 free(output_filename);
123 return 0;
124 }
125
126 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
127 int status;
128
129 int buffer_size= 64;
130 int chr = 0;
131 int pos = 0;
132 char* seq = malloc(sizeof(char)*buffer_size);
133 unsigned long id = 0;
134 char strand = ' ';
135 int mismatch = 0;
136 int occurrence = 0;
137 int size = 0;
138 int cut = 0;
139 char* prb = malloc(sizeof(char)*buffer_size);
140 char* cal_prb = malloc(sizeof(char)*buffer_size);
141 char* chastity = malloc(sizeof(char)*buffer_size);
142
143 int reads_fid = fileno(reads_fs);
144 struct stat reads_stat;
145 if ( fstat(reads_fid,&reads_stat) == -1) {
146 perror("fstat");
147 exit(EXIT_FAILURE);
148 }
149 off_t reads_filesize = reads_stat.st_size;
150 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
151 int numReads = reads_filesize / 178.0;
152
153 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
154 if (reads_area == MAP_FAILED) {
155 perror("mmap");
156 exit(EXIT_FAILURE);
157 }
158 close(reads_fid);
159 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
160
161 void* linePtr = reads_area;
162 char* current_line = malloc(sizeof(char)*256);
163
164 int SIZE = 500;
165 // initialize boundary arrays
166 void** upstream_end = malloc(sizeof(void*)*SIZE);
167 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
168 void** downstream_start = malloc(sizeof(void*)*SIZE);
169 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
170 int ue,uo,ds,dov;
171 ue = uo = ds = dov = 0;
172
173 int skippedLinesCounter = 0;
174
175 int prev_exon_start = -1;
176 int prev_exon_stop = -1;
177 int cur_exon_start = -1;
178
179 current_line = strncpy(current_line,linePtr,256);
180 int gene_idx = 0;
181 int exon_idx = 1;
182 struct gene* currentGene = (*allGenes)[gene_idx];
183 char* gene_id = currentGene->id;
184
185 //char* disamb_seq = malloc(sizeof(char)*read_size);
186
187 int skippedReadCtr = 0;
188 int uselessReadCtr = 0;
189 int exonicReadCtr = 0;
190 int endPrevCtr = 0;
191 int prevOverlapCtr = 0;
192 int currentStartCtr = 0;
193 int currentOverlapCtr = 0;
194 int multioccurReadCtr = 0;
195
196 int readCtr = 0;
197 // start of the parsing loop
198 while(1) {
199 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
200 break;
201
202 gene_id = currentGene->id;
203
204 if (readCtr != 0 && readCtr % 1000000 == 0)
205 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
206
207 //if (gene_idx >= 1833)
208 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
209
210 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
211 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
212 if (status < 12) {
213 skippedLinesCounter++;
214 goto next_read;
215 }
216
217 // if the read is occurring several times elsewhere then get rid of it
218 //if(!(occurrence >= 1 && occurrence <= 25)) {
219 if ( occurrence != 1 ) {
220 multioccurReadCtr++;
221 goto next_read;
222 }
223
224 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
225
226 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
227 gene_idx++;
228 exon_idx = 1;
229 currentGene = (*allGenes)[gene_idx];
230 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
231 ue = uo = ds = dov = 0;
232 continue;
233 }
234
235 if ( pos < currentGene->start ) { // go to next read
236 skippedReadCtr++;
237
238 next_read:
239
240 while (*(char*)linePtr != '\n') linePtr++;
241 linePtr++;
242 readCtr += 1;
243 current_line = strncpy(current_line,linePtr,256);
244 continue;
245 }
246
247 } else { // read IS within gene borders
248
249 exon_label:
250
251 if (exon_idx == currentGene->num_exons) {
252 gene_idx++;
253 exon_idx = 1;
254 currentGene = (*allGenes)[gene_idx];
255 continue;
256 }
257
258 prev_exon_start = currentGene->exon_starts[exon_idx-1];
259 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
260 cur_exon_start = currentGene->exon_starts[exon_idx];
261
262 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
263
264 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
265 if (ue != 0 && dov != 0)
266 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
267
268 if (uo != 0 && ds != 0)
269 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
270
271 exon_idx++;
272 ue = uo = ds = dov = 0;
273 goto exon_label;
274 }
275
276 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
277 exonicReadCtr++;
278
279 // Removed exonic reads
280
281 //remove_ambiguities(seq,strlen(seq),disamb_seq);
282 //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
283
284 goto next_read;
285 }
286
287 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
288 goto next_read;
289
290 // if this position is reached the read is somehow overlapping or
291 // exactly on the exon boundary. now determine where exactly:
292 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
293 endPrevCtr++;
294 upstream_end[ue] = linePtr;
295 ue++;
296 goto next_read;
297 }
298
299 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) <= max_overlap) { // read overlaps with previous exon boundary
300 prevOverlapCtr++;
301 upstream_overlap[uo] = linePtr;
302 uo++;
303 goto next_read;
304 }
305
306 if ( pos == cur_exon_start ) { // read starts at current exon start
307 currentStartCtr++;
308 downstream_start[ds] = linePtr;
309 ds++;
310 goto next_read;
311 }
312
313 if ( (cur_exon_start - pos) >= min_overlap && (cur_exon_start - pos) <= max_overlap) { // read overlaps with current exon boundary
314 currentOverlapCtr++;
315 downstream_overlap[dov] = linePtr;
316 dov++;
317 goto next_read;
318 }
319
320 uselessReadCtr++;
321 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
322 }
323 }
324
325 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
326 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
327
328 free(upstream_end);
329 free(upstream_overlap);
330 free(downstream_start);
331 free(downstream_overlap);
332
333 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
334 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
335 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
336 printf("%d reads were totally exonic\n",exonicReadCtr);
337 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
338 printf("%d reads where newly combined from two original reads\n",combined_reads);
339 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
340
341 status = munmap(reads_area,reads_filesize);
342 if(status != 0)
343 perror("munmap");
344
345 //free(current_line);
346 free(seq);
347 free(prb);
348 free(cal_prb);
349 free(chastity);
350 }
351
352 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
353 //printf("up/down size is %d/%d\n",up_size,down_size);
354
355 int up_idx, down_idx, status;
356 char* upstream_line = malloc(sizeof(char)*256);
357 char* downstream_line = malloc(sizeof(char)*256);
358 int p_start, p_stop;
359
360 int buffer_size= 64;
361
362 int up_chr = 0;
363 int up_pos = 0;
364 char* up_seq = malloc(sizeof(char)*buffer_size);
365 int up_id = 0;
366 char up_strand = ' ';
367 int up_mismatch = 0;
368 int up_occurrence = 0;
369 int up_sz = 0;
370 int up_cut = 0;
371 char* up_prb = malloc(sizeof(char)*buffer_size);
372 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
373 char* up_chastity = malloc(sizeof(char)*buffer_size);
374
375 int down_chr = 0;
376 int down_pos = 0;
377 char* down_seq = malloc(sizeof(char)*buffer_size);
378 int down_id = 0;
379 char down_strand = ' ';
380 int down_mismatch = 0;
381 int down_occurrence = 0;
382 int down_sz = 0;
383 int down_cut = 0;
384 char* down_prb = malloc(sizeof(char)*buffer_size);
385 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
386 char* down_chastity = malloc(sizeof(char)*buffer_size);
387
388 int new_chr = 0;
389 char* new_seq = malloc(sizeof(char)*buffer_size);
390 char new_strand = ' ';
391 char* new_prb = malloc(sizeof(char)*buffer_size);
392 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
393 char* new_chastity = malloc(sizeof(char)*buffer_size);
394 char* new_up_seq = malloc(sizeof(char)*read_size);
395 char* new_down_seq = malloc(sizeof(char)*read_size);
396
397 int overlap;
398 int fit;
399
400 char* used_flag = calloc(down_size,sizeof(char));
401
402 for(up_idx=0;up_idx<up_size;up_idx++) {
403 strncpy(upstream_line,upstream[up_idx],256);
404 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
405 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
406 &up_cut,up_prb,up_cal_prb,up_chastity);
407
408 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
409
410 overlap = exon_stop - up_pos;
411
412 for(down_idx=0;down_idx<down_size;down_idx++) {
413 if( used_flag[down_idx] == 1)
414 continue;
415
416 strncpy(downstream_line,downstream[down_idx],256);
417 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
418 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
419 &down_cut,down_prb,down_cal_prb,down_chastity);
420
421 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
422
423 if(up_strand != down_strand)
424 continue;
425
426 new_seq[0] = '\0';
427 new_prb[0] = '\0';
428 new_cal_prb[0] = '\0';
429 new_chastity[0] = '\0';
430 int splitpos = 0;
431
432 fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
433 if (fit == -1)
434 continue;
435
436 if (! (fit < overlap ))
437 continue;
438
439 new_chr = up_chr;
440 new_strand = up_strand;
441 splitpos = overlap;
442
443 p_start = up_pos;
444 p_stop = exon_start+(read_size-overlap);
445
446 strncat(new_seq,new_up_seq,overlap);
447 strncat(new_prb,up_prb,overlap);
448 strncat(new_cal_prb,up_cal_prb,overlap);
449 strncat(new_chastity,up_chastity,overlap);
450
451 //strncat(new_seq,new_down_seq+fit,read_size-overlap);
452 //strncat(new_prb,down_prb+fit,read_size-overlap);
453 //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
454 //strncat(new_chastity,down_chastity+fit,read_size-overlap);
455
456 int offset = exon_start-down_pos;
457 strncat(new_seq,new_down_seq+offset,read_size-overlap);
458 strncat(new_prb,down_prb+offset,read_size-overlap);
459 strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
460 strncat(new_chastity,down_chastity+offset,read_size-overlap);
461
462 // The four last integers specify the positions of
463 // p_start : the position in the dna where the first read starts
464 // exons_stop : the position in the dna where the first exons ends
465 // exons_start : the position in the dna where the second exons starts
466 // p_stop : the position in the dna where the (truncated) second read ends
467 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
468 read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
469 read_nr++;
470
471 combined_reads++;
472 used_flag[down_idx] = 1;
473 }
474 }
475
476 free(upstream_line);
477 free(downstream_line);
478 free(used_flag);
479
480 free(new_up_seq);
481 free(new_down_seq);
482
483 free(up_seq);
484 free(up_prb);
485 free(up_cal_prb);
486 free(up_chastity);
487
488 free(down_seq);
489 free(down_prb);
490 free(down_cal_prb);
491 free(down_chastity);
492
493 free(new_seq);
494 free(new_prb);
495 free(new_cal_prb);
496 free(new_chastity);
497 }
498
499 /*
500 int fitting(char* up_prb, char* down_prb) {
501 int w_size = 3;
502
503 double current_mean_up = 0;
504 double current_mean_down = 0;
505
506 int uidx;
507 for(uidx=0;uidx<w_size;uidx++) {
508 current_mean_up += up_prb[uidx]-50;
509 current_mean_down += down_prb[uidx]-50;
510 }
511
512 current_mean_up /= w_size;
513 current_mean_down /= w_size;
514
515 double min_val = fmin(current_mean_up,current_mean_down);
516 double max_val = fmax(current_mean_up,current_mean_down);
517 if ( (min_val / max_val) <= 0.33 )
518 return 1;
519
520 return -1;
521 }
522 */
523
524 int fitting(char* up_prb, char* down_prb) {
525 double epsilon_mean = 30.0;
526 double epsilon_var = 30.0;
527 int w_size = 6;
528
529 double current_mean_up = 0;
530 double current_variance_up = 0;
531 double current_mean_down = 0;
532 double current_variance_down = 0;
533
534 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
535 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
536 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
537 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
538
539 int iidx = -1;
540 int uidx;
541 int didx;
542
543 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
544 didx = uidx+w_size;
545 current_mean_up += up_prb[uidx]-50;
546 current_mean_down += up_prb[didx]-50;
547 }
548
549 current_mean_up /= w_size;
550 current_mean_down /= w_size;
551
552 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
553 didx = uidx+w_size;
554 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
555 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
556 }
557
558 current_variance_up /= (w_size-1);
559 current_variance_down /= (w_size-1);
560
561 for(iidx=w_size;iidx<30;iidx++) {
562 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
563 didx = uidx+w_size;
564 mean_up[iidx-w_size] += down_prb[uidx]-50;
565 mean_down[iidx-w_size] += down_prb[didx]-50;
566
567 }
568 mean_up[iidx-w_size] /= w_size;
569 mean_down[iidx-w_size] /= w_size;
570
571 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
572 didx = uidx+w_size;
573 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
574 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
575 }
576 variance_up[iidx-w_size] /= (w_size-1);
577 variance_down[iidx-w_size] /= (w_size-1);
578
579 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
580 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
581 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
582 // return iidx;
583 }
584
585 int bidx;
586 int bestIdx = -1;
587 double min = 1000.0;
588 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
589 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
590 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
591 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
592 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
593 bestIdx = bidx;
594 }
595 }
596 }
597
598 free(mean_up);
599 free(variance_up);
600 free(mean_down);
601 free(variance_down);
602
603 return bestIdx;
604 }
605
606 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
607 int idx=0;
608 int new_idx = 0;
609
610 while(idx<old_seq_size) {
611 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
612 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
613 idx++;
614 continue;
615 }
616
617 if (old_seq[idx] == '[') {
618 idx += 2;
619 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
620 continue;
621 }
622
623 new_seq[new_idx] = old_seq[idx];
624 idx++;
625 new_idx++;
626 }
627
628 if (new_idx != 36) {
629 printf("Error: Sequence is not of length 36!\n");
630 printf("old seq: %s\n",old_seq);
631 printf("new seq: %s\n",new_seq);
632 exit(EXIT_FAILURE);
633 }
634 }
635
636 /*
637 * TODO:
638 * - Check strand -> done simple (only if equal)
639 * - check for [AC] and similar entries -> done simple (see function
640 * - remove_ambiguities (exchanges [XY] by the second entry)
641 */