+ resolved an issue with the memory mapping in filterReads
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 ////////////////////////////////////////////////////////////
7
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <unistd.h>
14 #include <math.h>
15
16 #include "debug_tools.h"
17 #include "datastructures.h"
18
19 #define _FILE_OFFSET_BITS == 64
20
21 int compare_gene_struct(struct gene* a, struct gene* b) {
22 return a->stop - b->start;
23 }
24
25 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
26
27 void sort_genes(struct gene*** allGenes, int numGenes);
28
29 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
30
31 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
32
33 int fitting(char* up_prb, char* down_prb);
34
35 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
36
37 static char *info = "Usage is:\n./filterReads gff reads output";
38
39 /*
40 * Some constants specifying the exact behavior of the filter
41 *
42 */
43
44 const int read_size = 36;
45
46 const int min_overlap = 6;
47 const int max_overlap = 30;
48
49 int read_nr = 1;
50
51 int combined_reads = 0;
52 int main(int argc, char* argv[]) {
53
54 if(argc != 5) {
55 printf("%s\n",info);
56 exit(EXIT_FAILURE);
57 }
58
59 int status;
60 int filenameSize = 256;
61 char* gff_filename = malloc(sizeof(char)*filenameSize);
62 char* reads_filename = malloc(sizeof(char)*filenameSize);
63 char* output_filename = malloc(sizeof(char)*filenameSize);
64
65 strncpy(gff_filename,argv[1],filenameSize);
66 strncpy(reads_filename,argv[2],filenameSize);
67 strncpy(output_filename,argv[3],filenameSize);
68
69 FILE *gff_fs = fopen(gff_filename,"r");
70 FILE *reads_fs = fopen(reads_filename,"r");
71 FILE *out_fs = fopen(output_filename,"w");
72
73 read_nr = atoi(argv[4]);
74 read_nr++;
75
76 if(gff_fs == NULL) {
77 printf("Error: Could not open file: %s",gff_filename);
78 exit(EXIT_FAILURE);
79 }
80
81 if(reads_fs == NULL) {
82 printf("Error: Could not open file: %s",reads_filename);
83 exit(EXIT_FAILURE);
84 }
85
86 if(out_fs == NULL) {
87 printf("Error: Could not open file: %s",output_filename);
88 exit(EXIT_FAILURE);
89 }
90
91 struct gene** allGenes;
92 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
93 status = fclose(gff_fs);
94 free(gff_filename);
95 if(status != 0)
96 printf("closing of gff filestream failed!\n");
97
98 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
99
100 // check if allGenes is sorted. if not throw away those genes that do not
101 // occur in the sorted order
102 int g_idx;
103 struct gene* currentGene = 0;
104 int nulled_genes=0;
105 int old_gene_stop = -1;
106 for(g_idx=0;g_idx<numGenes;g_idx++) {
107 currentGene = allGenes[g_idx];
108
109 if (! (currentGene->start < currentGene->stop))
110 printf("Invalid positions for gene %s!\n",currentGene->id);
111
112 if (! (old_gene_stop < currentGene->start ) ) {
113 old_gene_stop = currentGene->stop;
114 allGenes[g_idx] = 0;
115 nulled_genes++;
116 continue;
117 }
118 old_gene_stop = currentGene->stop;
119 }
120
121 printf("Found %d unordered genes.\n",nulled_genes);
122 int gidx, eidx;
123 int exon_cov = 0;
124 for(gidx=0;gidx<numGenes;gidx++) {
125 if (allGenes[gidx] == 0)
126 continue;
127
128 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
129 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
130 }}
131 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
132
133 process_reads(reads_fs,&allGenes,numGenes,out_fs);
134
135 status = fclose(reads_fs);
136 status = fclose(out_fs);
137 if(status != 0)
138 perror("fclose");
139
140 free(reads_filename);
141 free(output_filename);
142 return 0;
143 }
144
145 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
146 int status;
147
148 int buffer_size= 64;
149 int chr = 0;
150 int pos = 0;
151 char* seq = malloc(sizeof(char)*buffer_size);
152 unsigned long id = 0;
153 char strand = ' ';
154 int mismatch = 0;
155 int occurrence = 0;
156 int size = 0;
157 int cut = 0;
158 char* prb = malloc(sizeof(char)*buffer_size);
159 char* cal_prb = malloc(sizeof(char)*buffer_size);
160 char* chastity = malloc(sizeof(char)*buffer_size);
161
162 int reads_fid = fileno(reads_fs);
163 struct stat reads_stat;
164 if ( fstat(reads_fid,&reads_stat) == -1) {
165 perror("fstat");
166 exit(EXIT_FAILURE);
167 }
168 off_t reads_filesize = reads_stat.st_size;
169 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
170 int numReads = reads_filesize / 178.0;
171
172 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
173 if (reads_area == MAP_FAILED) {
174 perror("mmap");
175 exit(EXIT_FAILURE);
176 }
177 close(reads_fid);
178 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
179
180 char* lineBeginPtr = (char*) reads_area;
181 char* lineEndPtr = (char*) reads_area;
182 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
183
184 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
185 lineEndPtr++;
186
187 char* current_line = malloc(sizeof(char)*512);
188 memset(current_line,0,512);
189
190 int SIZE = 500;
191 // initialize boundary arrays
192 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
193 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
194 int uov,dov;
195 uov = dov = 0;
196
197 int skippedLinesCounter = 0;
198
199 int prev_exon_start = -1;
200 int prev_exon_stop = -1;
201 int cur_exon_start = -1;
202
203 unsigned long line_size = lineEndPtr - lineBeginPtr;
204 //printf("distance is %lu\n",line_size);
205 strncpy(current_line,lineBeginPtr,line_size);
206 current_line[line_size] = '\0';
207 //printf("%s test",current_line);
208
209 int gene_idx = 0;
210 int exon_idx = 1;
211 struct gene* currentGene = (*allGenes)[gene_idx];
212 char* gene_id = currentGene->id;
213
214 int skippedReadCtr = 0;
215 int uselessReadCtr = 0;
216 int exonicReadCtr = 0;
217 int endPrevCtr = 0;
218 int prevOverlapCtr = 0;
219 int currentStartCtr = 0;
220 int currentOverlapCtr = 0;
221 int multioccurReadCtr = 0;
222
223 int readCtr = 0;
224 // start of the parsing loop
225 while(1) {
226 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
227 break;
228
229 gene_id = currentGene->id;
230
231 if (readCtr != 0 && readCtr % 1000000 == 0)
232 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
233
234 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
235 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
236 if (status < 12) {
237 skippedLinesCounter++;
238 goto next_read;
239 }
240
241 // if the read is occurring several times elsewhere then get rid of it
242 if ( occurrence != 1 ) {
243 multioccurReadCtr++;
244 goto next_read;
245 }
246
247 FA(currentGene != 0);
248
249 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
250
251 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
252 gene_idx++;
253 exon_idx = 1;
254 currentGene = (*allGenes)[gene_idx];
255 while( currentGene == 0 && gene_idx < numGenes) {
256 currentGene = (*allGenes)[gene_idx];
257 gene_idx++;
258 }
259 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
260 uov = dov = 0;
261 continue;
262 }
263
264 if ( pos < currentGene->start ) { // go to next read
265 skippedReadCtr++;
266
267 next_read:
268
269 lineBeginPtr = lineEndPtr;
270 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
271 lineEndPtr++;
272 readCtr += 1;
273 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
274 current_line[lineEndPtr-lineBeginPtr] = '\0';
275 continue;
276 }
277
278 } else { // read IS within gene borders
279
280 exon_label:
281
282 if (exon_idx == currentGene->num_exons) {
283 gene_idx++;
284 exon_idx = 1;
285 currentGene = (*allGenes)[gene_idx];
286 while( currentGene == 0 && gene_idx < numGenes) {
287 currentGene = (*allGenes)[gene_idx];
288 gene_idx++;
289 }
290 continue;
291 }
292
293 prev_exon_start = currentGene->exon_starts[exon_idx-1];
294 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
295 cur_exon_start = currentGene->exon_starts[exon_idx];
296
297 if (cur_exon_start - prev_exon_stop < min_overlap || cur_exon_start < pos ) { // go to next exon
298
299 if (uov != 0 && dov != 0)
300 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
301
302 exon_idx++;
303 uov = dov = 0;
304 goto exon_label;
305 }
306
307 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
308 exonicReadCtr++;
309 goto next_read;
310 }
311
312 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
313 goto next_read;
314
315 // if this position is reached the read is somehow overlapping or
316 // exactly on the exon boundary.
317 if ( (prev_exon_stop - pos) >= min_overlap && (prev_exon_stop - pos) < read_size) { // read overlaps with previous exon boundary
318 //printf("%s\n",current_line);
319 prevOverlapCtr++;
320 upstream_overlap[uov] = lineBeginPtr;
321 uov++;
322 goto next_read;
323 }
324
325 int end_pos = pos+read_size-1;
326 if ( (end_pos - cur_exon_start) >= min_overlap && (end_pos - cur_exon_start) < read_size) { // read overlaps with current exon boundary
327 //printf("%s\n",current_line);
328 currentOverlapCtr++;
329 downstream_overlap[dov] = lineBeginPtr;
330 dov++;
331 goto next_read;
332 }
333
334 uselessReadCtr++;
335 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
336 }
337 }
338
339 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
340
341 free(upstream_overlap);
342 free(downstream_overlap);
343
344 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
345 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
346 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
347 printf("%d reads were totally exonic\n",exonicReadCtr);
348 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
349 printf("%d reads where newly combined from two original reads\n",combined_reads);
350 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
351
352 status = munmap(reads_area,reads_filesize);
353 if(status != 0)
354 perror("munmap");
355
356 //free(current_line);
357 free(seq);
358 free(prb);
359 free(cal_prb);
360 free(chastity);
361 }
362
363 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id, int exon_idx) {
364
365 //printf("up/down size is %d/%d\n",up_size,down_size);
366
367 /*
368 int up_idx, down_idx, status;
369 char* upstream_line = malloc(sizeof(char)*256);
370 char* downstream_line = malloc(sizeof(char)*256);
371 int p_start, p_stop;
372
373 int buffer_size= 64;
374
375 int up_chr = 0;
376 int up_pos = 0;
377 char* up_seq = malloc(sizeof(char)*buffer_size);
378 int up_id = 0;
379 char up_strand = ' ';
380 int up_mismatch = 0;
381 int up_occurrence = 0;
382 int up_sz = 0;
383 int up_cut = 0;
384 char* up_prb = malloc(sizeof(char)*buffer_size);
385 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
386 char* up_chastity = malloc(sizeof(char)*buffer_size);
387
388 int down_chr = 0;
389 int down_pos = 0;
390 char* down_seq = malloc(sizeof(char)*buffer_size);
391 int down_id = 0;
392 char down_strand = ' ';
393 int down_mismatch = 0;
394 int down_occurrence = 0;
395 int down_sz = 0;
396 int down_cut = 0;
397 char* down_prb = malloc(sizeof(char)*buffer_size);
398 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
399 char* down_chastity = malloc(sizeof(char)*buffer_size);
400
401 int new_chr = 0;
402 char* new_seq = malloc(sizeof(char)*buffer_size);
403 char new_strand = ' ';
404 char* new_prb = malloc(sizeof(char)*buffer_size);
405 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
406 char* new_chastity = malloc(sizeof(char)*buffer_size);
407 char* new_up_seq = malloc(sizeof(char)*read_size);
408 char* new_down_seq = malloc(sizeof(char)*read_size);
409
410 int overlap;
411 int fit;
412
413 char* used_flag = calloc(down_size,sizeof(char));
414
415 for(up_idx=0;up_idx<up_size;up_idx++) {
416 strncpy(upstream_line,upstream[up_idx],256);
417 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
418 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
419 &up_cut,up_prb,up_cal_prb,up_chastity);
420
421 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
422
423 overlap = exon_stop - up_pos;
424
425 for(down_idx=0;down_idx<down_size;down_idx++) {
426 if( used_flag[down_idx] == 1)
427 continue;
428
429 strncpy(downstream_line,downstream[down_idx],256);
430 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
431 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
432 &down_cut,down_prb,down_cal_prb,down_chastity);
433
434 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
435
436 if(up_strand != down_strand)
437 continue;
438
439 new_seq[0] = '\0';
440 new_prb[0] = '\0';
441 new_cal_prb[0] = '\0';
442 new_chastity[0] = '\0';
443 int splitpos = 0;
444
445 fit = fitting(up_prb+(36-overlap)-6,down_prb+(exon_start-down_pos)-6);
446 if (fit == -1)
447 continue;
448
449 if (! (fit < overlap ))
450 continue;
451
452 new_chr = up_chr;
453 new_strand = up_strand;
454 splitpos = overlap;
455
456 p_start = up_pos;
457 p_stop = exon_start+(read_size-overlap);
458
459 strncat(new_seq,new_up_seq,overlap);
460 strncat(new_prb,up_prb,overlap);
461 strncat(new_cal_prb,up_cal_prb,overlap);
462 strncat(new_chastity,up_chastity,overlap);
463
464 //strncat(new_seq,new_down_seq+fit,read_size-overlap);
465 //strncat(new_prb,down_prb+fit,read_size-overlap);
466 //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
467 //strncat(new_chastity,down_chastity+fit,read_size-overlap);
468
469 int offset = exon_start-down_pos;
470 strncat(new_seq,new_down_seq+offset,read_size-overlap);
471 strncat(new_prb,down_prb+offset,read_size-overlap);
472 strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
473 strncat(new_chastity,down_chastity+offset,read_size-overlap);
474
475 // The four last integers specify the positions of
476 // p_start : the position in the dna where the first read starts
477 // exons_stop : the position in the dna where the first exons ends
478 // exons_start : the position in the dna where the second exons starts
479 // p_stop : the position in the dna where the (truncated) second read ends
480 fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
481 read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
482 read_nr++;
483
484 combined_reads++;
485 used_flag[down_idx] = 1;
486 }
487 }
488
489 free(upstream_line);
490 free(downstream_line);
491 free(used_flag);
492
493 free(new_up_seq);
494 free(new_down_seq);
495
496 free(up_seq);
497 free(up_prb);
498 free(up_cal_prb);
499 free(up_chastity);
500
501 free(down_seq);
502 free(down_prb);
503 free(down_cal_prb);
504 free(down_chastity);
505
506 free(new_seq);
507 free(new_prb);
508 free(new_cal_prb);
509 free(new_chastity);
510 */
511 }
512
513 /*
514 int fitting(char* up_prb, char* down_prb) {
515 int w_size = 3;
516
517 double current_mean_up = 0;
518 double current_mean_down = 0;
519
520 int uidx;
521 for(uidx=0;uidx<w_size;uidx++) {
522 current_mean_up += up_prb[uidx]-50;
523 current_mean_down += down_prb[uidx]-50;
524 }
525
526 current_mean_up /= w_size;
527 current_mean_down /= w_size;
528
529 double min_val = fmin(current_mean_up,current_mean_down);
530 double max_val = fmax(current_mean_up,current_mean_down);
531 if ( (min_val / max_val) <= 0.33 )
532 return 1;
533
534 return -1;
535 }
536 */
537
538 int fitting(char* up_prb, char* down_prb) {
539 double epsilon_mean = 30.0;
540 double epsilon_var = 30.0;
541 int w_size = 6;
542
543 double current_mean_up = 0;
544 double current_variance_up = 0;
545 double current_mean_down = 0;
546 double current_variance_down = 0;
547
548 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
549 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
550 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
551 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
552
553 int iidx = -1;
554 int uidx;
555 int didx;
556
557 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
558 didx = uidx+w_size;
559 current_mean_up += up_prb[uidx]-50;
560 current_mean_down += up_prb[didx]-50;
561 }
562
563 current_mean_up /= w_size;
564 current_mean_down /= w_size;
565
566 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
567 didx = uidx+w_size;
568 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
569 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
570 }
571
572 current_variance_up /= (w_size-1);
573 current_variance_down /= (w_size-1);
574
575 for(iidx=w_size;iidx<30;iidx++) {
576 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
577 didx = uidx+w_size;
578 mean_up[iidx-w_size] += down_prb[uidx]-50;
579 mean_down[iidx-w_size] += down_prb[didx]-50;
580
581 }
582 mean_up[iidx-w_size] /= w_size;
583 mean_down[iidx-w_size] /= w_size;
584
585 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
586 didx = uidx+w_size;
587 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
588 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
589 }
590 variance_up[iidx-w_size] /= (w_size-1);
591 variance_down[iidx-w_size] /= (w_size-1);
592
593 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
594 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
595 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
596 // return iidx;
597 }
598
599 int bidx;
600 int bestIdx = -1;
601 double min = 1000.0;
602 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
603 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
604 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
605 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
606 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
607 bestIdx = bidx;
608 }
609 }
610 }
611
612 free(mean_up);
613 free(variance_up);
614 free(mean_down);
615 free(variance_down);
616
617 return bestIdx;
618 }
619
620 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
621 int idx=0;
622 int new_idx = 0;
623
624 while(idx<old_seq_size) {
625 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
626 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
627 idx++;
628 continue;
629 }
630
631 if (old_seq[idx] == '[') {
632 idx += 2;
633 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
634 continue;
635 }
636
637 new_seq[new_idx] = old_seq[idx];
638 idx++;
639 new_idx++;
640 }
641
642 if (new_idx != 36) {
643 printf("Error: Sequence is not of length 36!\n");
644 printf("old seq: %s\n",old_seq);
645 printf("new seq: %s\n",new_seq);
646 exit(EXIT_FAILURE);
647 }
648 }
649
650 /*
651 * TODO:
652 * - Check strand -> done simple (only if equal)
653 * - check for [AC] and similar entries -> done simple (see function
654 * - remove_ambiguities (exchanges [XY] by the second entry)
655 */