+ restored old filtering functionality -> reads have to start/stop at exon boundaries
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <math.h>
16
17 #include "common.h"
18 #include "datastructures.h"
19
20 int compare_gene_struct(struct gene* a, struct gene* b) {
21 return a->stop - b->start;
22 }
23
24 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
25
26 void sort_genes(struct gene*** allGenes, int numGenes);
27
28 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes,FILE* out_fs);
29
30 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size, FILE* out_fs,const char* gene_id);
31
32 //int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
33 int fitting(char* up_prb, char* down_prb);
34
35 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq);
36
37 static char *info = "Usage is:\n./filterReads gff reads output";
38
39 const int read_size = 36;
40
41 int combined_reads = 0;
42 int main(int argc, char* argv[]) {
43
44 if(argc != 4) {
45 printf("%s\n",info);
46 exit(EXIT_FAILURE);
47 }
48
49 int status;
50 int filenameSize = 256;
51 char* gff_filename = malloc(sizeof(char)*filenameSize);
52 char* reads_filename = malloc(sizeof(char)*filenameSize);
53 char* output_filename = malloc(sizeof(char)*filenameSize);
54
55 strncpy(gff_filename,argv[1],filenameSize);
56 strncpy(reads_filename,argv[2],filenameSize);
57 strncpy(output_filename,argv[3],filenameSize);
58
59 FILE *gff_fs = fopen(gff_filename,"r");
60 FILE *reads_fs = fopen(reads_filename,"r");
61 FILE *out_fs = fopen(output_filename,"w");
62
63 if(gff_fs == NULL) {
64 printf("Error: Could not open file: %s",gff_filename);
65 exit(EXIT_FAILURE);
66 }
67
68 if(reads_fs == NULL) {
69 printf("Error: Could not open file: %s",reads_filename);
70 exit(EXIT_FAILURE);
71 }
72
73 if(out_fs == NULL) {
74 printf("Error: Could not open file: %s",output_filename);
75 exit(EXIT_FAILURE);
76 }
77
78 struct gene** allGenes;
79 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
80 status = fclose(gff_fs);
81 free(gff_filename);
82 if(status != 0)
83 printf("closing of gff filestream failed!\n");
84
85 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
86
87 // some entries in the gff files are not in a sorted order
88 //printf("Sorting genes...\n");
89 //sort_genes(&allGenes,numGenes);
90 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
91 ///printf("Genes were sorted!\n");
92
93 int gidx, eidx;
94 int exon_cov = 0;
95 for(gidx=0;gidx<numGenes;gidx++) {
96 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
97 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
98 }}
99 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
100
101 process_reads(reads_fs,&allGenes,numGenes,out_fs);
102
103 status = fclose(reads_fs);
104 status = fclose(out_fs);
105 if(status != 0)
106 perror("fclose");
107
108 free(reads_filename);
109 free(output_filename);
110 return 0;
111 }
112
113 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs) {
114 int status;
115
116 int buffer_size= 64;
117 int chr = 0;
118 int pos = 0;
119 char* seq = malloc(sizeof(char)*buffer_size);
120 unsigned long id = 0;
121 char strand = ' ';
122 int mismatch = 0;
123 int occurrence = 0;
124 int size = 0;
125 int cut = 0;
126 char* prb = malloc(sizeof(char)*buffer_size);
127 char* cal_prb = malloc(sizeof(char)*buffer_size);
128 char* chastity = malloc(sizeof(char)*buffer_size);
129
130 int reads_fid = fileno(reads_fs);
131 struct stat reads_stat;
132 if ( fstat(reads_fid,&reads_stat) == -1) {
133 perror("fstat");
134 exit(EXIT_FAILURE);
135 }
136 off_t reads_filesize = reads_stat.st_size;
137 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
138 int numReads = reads_filesize / 178.0;
139
140 void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
141 if (reads_area == MAP_FAILED) {
142 perror("mmap");
143 exit(EXIT_FAILURE);
144 }
145 close(reads_fid);
146 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
147
148 void* linePtr = reads_area;
149 char* current_line = malloc(sizeof(char)*256);
150
151 int SIZE = 500;
152 // initialize boundary arrays
153 void** upstream_end = malloc(sizeof(void*)*SIZE);
154 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
155 void** downstream_start = malloc(sizeof(void*)*SIZE);
156 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
157 int ue,uo,ds,dov;
158 ue = uo = ds = dov = 0;
159
160 int skippedLinesCounter = 0;
161
162 int prev_exon_start = -1;
163 int prev_exon_stop = -1;
164 int cur_exon_start = -1;
165
166 current_line = strncpy(current_line,linePtr,256);
167 int gene_idx = 0;
168 int exon_idx = 1;
169 struct gene* currentGene = (*allGenes)[gene_idx];
170 char* gene_id = currentGene->id;
171
172 char* disamb_seq = malloc(sizeof(char)*read_size);
173
174 int skippedReadCtr = 0;
175 int uselessReadCtr = 0;
176 int exonicReadCtr = 0;
177 int endPrevCtr = 0;
178 int prevOverlapCtr = 0;
179 int currentStartCtr = 0;
180 int currentOverlapCtr = 0;
181 int multioccurReadCtr = 0;
182
183 int readCtr = 0;
184 // start of the parsing loop
185 while(1) {
186 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
187 break;
188
189 if (readCtr != 0 && readCtr % 1000000 == 0)
190 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
191
192 //if (gene_idx >= 1833)
193 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
194
195 status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
196 &chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
197 if (status < 12) {
198 skippedLinesCounter++;
199 goto next_read;
200 }
201
202 // if the read is occurring several times elsewhere then get rid of it
203 //if(!(occurrence >= 1 && occurrence <= 25)) {
204 if ( occurrence != 1 ) {
205 multioccurReadCtr++;
206 goto next_read;
207 }
208
209 if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
210
211 if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
212 gene_idx++;
213 exon_idx = 1;
214 currentGene = (*allGenes)[gene_idx];
215 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
216 gene_id = currentGene->id;
217 ue = uo = ds = dov = 0;
218 continue;
219 }
220
221 if ( pos < currentGene->start ) { // go to next read
222 skippedReadCtr++;
223
224 next_read:
225
226 while (*(char*)linePtr != '\n') linePtr++;
227 linePtr++;
228 readCtr += 1;
229 current_line = strncpy(current_line,linePtr,256);
230 continue;
231 }
232
233 } else { // read IS within gene borders
234
235 exon_label:
236
237 if (exon_idx == currentGene->num_exons) {
238 gene_idx++;
239 exon_idx = 1;
240 currentGene = (*allGenes)[gene_idx];
241 continue;
242 }
243
244 prev_exon_start = currentGene->exon_starts[exon_idx-1];
245 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
246 cur_exon_start = currentGene->exon_starts[exon_idx];
247
248 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
249
250 if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
251 exon_idx++;
252
253 if (ue != 0 && dov != 0)
254 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
255
256 if (uo != 0 && ds != 0)
257 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
258
259 ue = uo = ds = dov = 0;
260 goto exon_label;
261 }
262
263 if ( prev_exon_start < pos && (pos+read_size) < prev_exon_stop ) { // read is inside previous exon
264 exonicReadCtr++;
265
266 // Removed exonic reads
267
268 //remove_ambiguities(seq,strlen(seq),disamb_seq);
269 //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
270
271 goto next_read;
272 }
273
274 if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
275 goto next_read;
276
277 // if this position is reached the read is somehow overlapping or
278 // exactly on the exon boundary. now determine where exactly:
279 if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
280 endPrevCtr++;
281 upstream_end[ue] = linePtr;
282 ue++;
283 goto next_read;
284 }
285
286 if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
287 prevOverlapCtr++;
288 upstream_overlap[uo] = linePtr;
289 uo++;
290 goto next_read;
291 }
292
293 if ( pos == cur_exon_start ) { // read starts at current exon start
294 currentStartCtr++;
295 downstream_start[ds] = linePtr;
296 ds++;
297 goto next_read;
298 }
299
300 if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
301 currentOverlapCtr++;
302 downstream_overlap[dov] = linePtr;
303 dov++;
304 goto next_read;
305 }
306
307 uselessReadCtr++;
308 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
309 }
310 }
311
312 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id);
313 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id);
314
315 free(upstream_end);
316 free(upstream_overlap);
317 free(downstream_start);
318 free(downstream_overlap);
319
320 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
321 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
322 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
323 printf("%d reads were totally exonic\n",exonicReadCtr);
324 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr,currentOverlapCtr);
325 printf("%d reads where newly combined from two original reads\n",combined_reads);
326 printf("Total used reads: %d\n",exonicReadCtr+endPrevCtr+prevOverlapCtr+currentStartCtr+currentOverlapCtr);
327
328 status = munmap(reads_area,reads_filesize);
329 if(status != 0)
330 perror("munmap");
331
332 //free(current_line);
333 free(seq);
334 free(prb);
335 free(cal_prb);
336 free(chastity);
337 }
338
339 /*
340 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
341 //printf("up/down size is %d/%d\n",up_size,down_size);
342
343 int up_idx, down_idx, status;
344 char* upstream_line = malloc(sizeof(char)*256);
345 char* downstream_line = malloc(sizeof(char)*256);
346
347 int buffer_size= 64;
348
349 int up_chr = 0;
350 int up_pos = 0;
351 char* up_seq = malloc(sizeof(char)*buffer_size);
352 int up_id = 0;
353 char up_strand = ' ';
354 int up_mismatch = 0;
355 int up_occurrence = 0;
356 int up_sz = 0;
357 int up_cut = 0;
358 char* up_prb = malloc(sizeof(char)*buffer_size);
359 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
360 char* up_chastity = malloc(sizeof(char)*buffer_size);
361
362 int down_chr = 0;
363 int down_pos = 0;
364 char* down_seq = malloc(sizeof(char)*buffer_size);
365 int down_id = 0;
366 char down_strand = ' ';
367 int down_mismatch = 0;
368 int down_occurrence = 0;
369 int down_sz = 0;
370 int down_cut = 0;
371 char* down_prb = malloc(sizeof(char)*buffer_size);
372 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
373 char* down_chastity = malloc(sizeof(char)*buffer_size);
374
375 int new_chr = 0;
376 char* new_seq = malloc(sizeof(char)*buffer_size);
377 char new_strand = ' ';
378 char* new_prb = malloc(sizeof(char)*buffer_size);
379 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
380 char* new_chastity = malloc(sizeof(char)*buffer_size);
381 char* new_up_seq = malloc(sizeof(char)*read_size);
382 char* new_down_seq = malloc(sizeof(char)*read_size);
383
384 up_idx=0;
385 down_idx=0;
386 while(1) {
387 if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
388 break;
389
390 strncpy(upstream_line,upstream[up_idx],256);
391 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
392 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
393 &up_cut,up_prb,up_cal_prb,up_chastity);
394
395 strncpy(downstream_line,downstream[down_idx],256);
396 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
397 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
398 &down_cut,down_prb,down_cal_prb,down_chastity);
399
400 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
401 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
402
403 new_seq[0] = '\0';
404 new_prb[0] = '\0';
405 new_cal_prb[0] = '\0';
406 new_chastity[0] = '\0';
407
408 int fit;
409 int w_size = 6;
410 int overlap = 0;
411 if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
412 overlap = exon_start - down_pos;
413
414 //fit = fitting(up_prb+(read_size-w_size),up_prb+read_size,down_prb+overlap,down_prb+overlap+w_size);
415 //if (fit != 1)
416 // goto end;
417
418 new_chr = up_chr;
419 new_strand = up_strand;
420
421 strncat(new_seq,new_up_seq+(read_size-overlap),overlap);
422 strncat(new_prb,up_prb+(read_size-overlap),overlap);
423 strncat(new_cal_prb,up_cal_prb+(read_size-overlap),overlap);
424 strncat(new_chastity,up_chastity+(read_size-overlap),overlap);
425
426 strncat(new_seq,new_down_seq+overlap,read_size-overlap);
427 strncat(new_prb,down_prb+overlap,read_size-overlap);
428 strncat(new_cal_prb,down_cal_prb+overlap,read_size-overlap);
429 strncat(new_chastity,down_chastity+overlap,read_size-overlap);
430
431 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
432
433 } // merge with read which is upstream overlapping
434
435 if (down_pos == exon_start) {
436 overlap = up_pos+read_size - exon_stop;
437 //printf("overlap is %d\n",overlap);
438 //printf("pos are: %d %d\n",up_pos,down_pos);
439
440 fit = fitting(up_prb+read_size-overlap-w_size,up_prb+read_size-overlap,down_prb,down_prb+w_size);
441 if (fit == -1)
442
443 // goto end;
444
445 new_chr = up_chr;
446 new_strand = up_strand;
447
448 strncat(new_seq,new_up_seq,(read_size-overlap));
449 strncat(new_prb,up_prb,(read_size-overlap));
450 strncat(new_cal_prb,up_cal_prb,(read_size-overlap));
451 strncat(new_chastity,up_chastity,(read_size-overlap));
452
453 strncat(new_seq,new_down_seq,overlap);
454 strncat(new_prb,down_prb,overlap);
455 strncat(new_cal_prb,down_cal_prb,overlap);
456 strncat(new_chastity,down_chastity,overlap);
457
458 //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
459 }
460
461 if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
462 printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
463
464 fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
465 new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
466
467 end:
468
469 up_idx++;
470 down_idx++;
471 }
472
473 free(upstream_line);
474 free(downstream_line);
475
476 free(new_up_seq);
477 free(new_down_seq);
478
479 free(up_prb);
480 free(up_cal_prb);
481 free(up_chastity);
482
483 free(down_prb);
484 free(down_cal_prb);
485 free(down_chastity);
486
487 free(new_prb);
488 free(new_cal_prb);
489 free(new_chastity);
490
491 }
492 */
493
494 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs,const char* gene_id) {
495 //printf("up/down size is %d/%d\n",up_size,down_size);
496
497 int up_idx, down_idx, status;
498 char* upstream_line = malloc(sizeof(char)*256);
499 char* downstream_line = malloc(sizeof(char)*256);
500
501 int buffer_size= 64;
502
503 int up_chr = 0;
504 int up_pos = 0;
505 char* up_seq = malloc(sizeof(char)*buffer_size);
506 int up_id = 0;
507 char up_strand = ' ';
508 int up_mismatch = 0;
509 int up_occurrence = 0;
510 int up_sz = 0;
511 int up_cut = 0;
512 char* up_prb = malloc(sizeof(char)*buffer_size);
513 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
514 char* up_chastity = malloc(sizeof(char)*buffer_size);
515
516 int down_chr = 0;
517 int down_pos = 0;
518 char* down_seq = malloc(sizeof(char)*buffer_size);
519 int down_id = 0;
520 char down_strand = ' ';
521 int down_mismatch = 0;
522 int down_occurrence = 0;
523 int down_sz = 0;
524 int down_cut = 0;
525 char* down_prb = malloc(sizeof(char)*buffer_size);
526 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
527 char* down_chastity = malloc(sizeof(char)*buffer_size);
528
529 int new_chr = 0;
530 char* new_seq = malloc(sizeof(char)*buffer_size);
531 char new_strand = ' ';
532 char* new_prb = malloc(sizeof(char)*buffer_size);
533 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
534 char* new_chastity = malloc(sizeof(char)*buffer_size);
535 char* new_up_seq = malloc(sizeof(char)*read_size);
536 char* new_down_seq = malloc(sizeof(char)*read_size);
537
538 int overlap;
539 int fit;
540
541 char* used_flag = calloc(down_size,sizeof(char));
542
543 for(up_idx=0;up_idx<up_size;up_idx++) {
544 strncpy(upstream_line,upstream[up_idx],256);
545 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
546 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_occurrence,&up_sz,
547 &up_cut,up_prb,up_cal_prb,up_chastity);
548
549 remove_ambiguities(up_seq,strlen(up_seq),new_up_seq);
550
551 overlap = exon_stop - up_pos;
552
553 for(down_idx=0;down_idx<down_size;down_idx++) {
554 if( used_flag[down_idx] == 1)
555 continue;
556
557 strncpy(downstream_line,downstream[down_idx],256);
558 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
559 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_occurrence,&down_sz,
560 &down_cut,down_prb,down_cal_prb,down_chastity);
561
562 remove_ambiguities(down_seq,strlen(down_seq),new_down_seq);
563
564 new_seq[0] = '\0';
565 new_prb[0] = '\0';
566 new_cal_prb[0] = '\0';
567 new_chastity[0] = '\0';
568 int splitpos = 0;
569
570 fit = fitting(up_prb+(36-overlap),down_prb);
571 if (fit == -1)
572 continue;
573
574 if (! (fit < overlap ))
575 continue;
576
577 new_chr = up_chr;
578 new_strand = up_strand;
579
580 strncat(new_seq,new_up_seq,overlap);
581 strncat(new_prb,up_prb,overlap);
582 strncat(new_cal_prb,up_cal_prb,overlap);
583 strncat(new_chastity,up_chastity,overlap);
584
585 strncat(new_seq,new_down_seq+fit,read_size-overlap);
586 strncat(new_prb,down_prb+fit,read_size-overlap);
587 strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
588 strncat(new_chastity,down_chastity+fit,read_size-overlap);
589
590 fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
591 new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id);
592
593 combined_reads++;
594 used_flag[down_idx] = 1;
595 }
596 }
597
598 free(upstream_line);
599 free(downstream_line);
600
601 free(new_up_seq);
602 free(new_down_seq);
603
604 free(up_seq);
605 free(up_prb);
606 free(up_cal_prb);
607 free(up_chastity);
608
609 free(down_seq);
610 free(down_prb);
611 free(down_cal_prb);
612 free(down_chastity);
613
614 free(new_seq);
615 free(new_prb);
616 free(new_cal_prb);
617 free(new_chastity);
618 }
619
620
621
622
623 /*
624 int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
625 double epsilon_mean = 15.0;
626 double epsilon_var = 10.0;
627 double mean_up = 0;
628 double variance_up = 0;
629 double mean_down = 0;
630 double variance_down = 0;
631
632 char *up_ptr = up_prb;
633 char *down_ptr = down_prb;
634
635 int w_size = 0;
636 while(up_ptr != up_prb_end) {
637 mean_up += (*up_ptr)-50;
638 mean_down += (*down_ptr)-50;
639 w_size++;
640 up_ptr++;
641 down_ptr++;
642 }
643 mean_up /= w_size;
644 mean_down /= w_size;
645
646
647 up_ptr = up_prb;
648 down_ptr = down_prb;
649 w_size = 0;
650 while(up_ptr != up_prb_end) {
651 variance_up += pow((*up_prb)-50 - mean_up,2);
652 variance_down += pow((*down_prb)-50 - mean_down,2);
653 w_size++;
654 up_ptr++;
655 down_ptr++;
656 }
657 variance_up /= (w_size-1);
658 variance_down /= (w_size-1);
659
660 //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
661
662 if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
663 return 1;
664
665 return 0;
666 }
667 */
668
669 int fitting(char* up_prb, char* down_prb) {
670 double epsilon_mean = 30.0;
671 double epsilon_var = 30.0;
672 int w_size = 6;
673
674 double current_mean_up = 0;
675 double current_variance_up = 0;
676 double current_mean_down = 0;
677 double current_variance_down = 0;
678
679 double* mean_up = malloc(sizeof(double)*read_size-2*w_size);
680 double* variance_up = malloc(sizeof(double)*read_size-2*w_size);
681 double* mean_down = malloc(sizeof(double)*read_size-2*w_size);
682 double* variance_down = malloc(sizeof(double)*read_size-2*w_size);
683
684 int iidx = -1;
685 int uidx;
686 int didx;
687
688 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
689 didx = uidx+w_size;
690 current_mean_up += up_prb[uidx]-50;
691 current_mean_down += up_prb[didx]-50;
692
693 }
694 current_mean_up /= w_size;
695 current_mean_down /= w_size;
696
697 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
698 didx = uidx+w_size;
699 current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
700 current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
701
702 }
703 current_variance_up /= (w_size-1);
704 current_variance_down /= (w_size-1);
705
706 for(iidx=w_size;iidx<30;iidx++) {
707 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
708 didx = uidx+w_size;
709 mean_up[iidx-w_size] += down_prb[uidx]-50;
710 mean_down[iidx-w_size] += down_prb[didx]-50;
711
712 }
713 mean_up[iidx-w_size] /= w_size;
714 mean_down[iidx-w_size] /= w_size;
715
716 for(uidx=iidx-w_size;uidx<iidx;uidx++) {
717 didx = uidx+w_size;
718 variance_up[iidx-w_size] += pow(down_prb[uidx]-50 - mean_up[iidx-w_size],2);
719 variance_down[iidx-w_size] += pow(down_prb[didx]-50 - mean_down[iidx-w_size],2);
720 }
721 variance_up[iidx-w_size] /= (w_size-1);
722 variance_down[iidx-w_size] /= (w_size-1);
723
724 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
725 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
726 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
727 // return iidx;
728 }
729
730 int bidx;
731 int bestIdx = -1;
732 double min = 1000.0;
733 for(bidx=0;bidx<read_size-2*w_size;bidx++) {
734 if ( abs(current_mean_up - mean_up[bidx]) < epsilon_mean && abs(current_variance_up - variance_up[bidx]) < epsilon_var
735 && abs(current_mean_down - mean_down[bidx]) < epsilon_mean && abs(current_variance_down - variance_down[bidx]) < epsilon_var ) {
736 if ( abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx])) {
737 min = abs(current_mean_up - mean_up[bidx]) + abs(current_variance_up - variance_up[bidx]) + abs(current_mean_down - mean_down[bidx]) + abs(current_variance_down - variance_down[bidx]);
738 bestIdx = bidx;
739 }
740 }
741 }
742
743 free(mean_up);
744 free(variance_up);
745 free(mean_down);
746 free(variance_down);
747
748 return bestIdx;
749 }
750
751 void remove_ambiguities(char * old_seq, int old_seq_size, char* new_seq) {
752 int idx=0;
753 int new_idx = 0;
754
755 while(idx<old_seq_size) {
756 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
757 if (old_seq[idx] == ']' || old_seq[idx] == '-' ) {
758 idx++;
759 continue;
760 }
761
762 if (old_seq[idx] == '[') {
763 idx += 2;
764 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
765 continue;
766 }
767
768 new_seq[new_idx] = old_seq[idx];
769 idx++;
770 new_idx++;
771 }
772
773 if (new_idx != 36) {
774 printf("Error: Sequence is not of length 36!\n");
775 printf("old seq: %s\n",old_seq);
776 printf("new seq: %s\n",new_seq);
777 exit(EXIT_FAILURE);
778 }
779 }
780
781 /*
782 * TODO:
783 * - Check strand -> done simple (only if equal)
784 * - check for [AC] and similar entries -> done simple (see function
785 * - remove_ambiguities (exchanges [XY] by the second entry)
786 */