1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
6 ////////////////////////////////////////////////////////////
16 #include "datastructures.h"
18 #define _FILE_OFFSET_BITS == 64
20 int compare_gene_struct(struct gene
* a
, struct gene
* b
) {
21 return a
->stop
- b
->start
;
24 int parse_gff(char* filename
,FILE* fid
,struct gene
*** allGenes
);
26 void sort_genes(struct gene
*** allGenes
, int numGenes
);
28 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
,FILE* out_fs
);
30 void combine_info(int exon_stop
, int exon_start
, void** upstream
, int up_size
, void** downstream
, int down_size
, FILE* out_fs
,const char* gene_id
, int exon_idx
);
32 //int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end);
33 int fitting(char* up_prb
, char* down_prb
);
35 void remove_ambiguities(char * old_seq
, int old_seq_size
, char* new_seq
);
37 static char *info
= "Usage is:\n./filterReads gff reads output";
40 * Some constants specifying the exact behavior of the filter
44 const int read_size
= 36;
46 const int min_overlap
= 6;
47 const int max_overlap
= 30;
51 int combined_reads
= 0;
52 int main(int argc
, char* argv
[]) {
60 int filenameSize
= 256;
61 char* gff_filename
= malloc(sizeof(char)*filenameSize
);
62 char* reads_filename
= malloc(sizeof(char)*filenameSize
);
63 char* output_filename
= malloc(sizeof(char)*filenameSize
);
65 strncpy(gff_filename
,argv
[1],filenameSize
);
66 strncpy(reads_filename
,argv
[2],filenameSize
);
67 strncpy(output_filename
,argv
[3],filenameSize
);
69 FILE *gff_fs
= fopen(gff_filename
,"r");
70 FILE *reads_fs
= fopen(reads_filename
,"r");
71 FILE *out_fs
= fopen(output_filename
,"w");
74 printf("Error: Could not open file: %s",gff_filename
);
78 if(reads_fs
== NULL
) {
79 printf("Error: Could not open file: %s",reads_filename
);
84 printf("Error: Could not open file: %s",output_filename
);
88 struct gene
** allGenes
;
89 int numGenes
= parse_gff(gff_filename
,gff_fs
,&allGenes
);
90 status
= fclose(gff_fs
);
93 printf("closing of gff filestream failed!\n");
95 printf("Successfully parsed gff file! Found %d genes.\n",numGenes
);
97 // some entries in the gff files are not in a sorted order
98 //printf("Sorting genes...\n");
99 //sort_genes(&allGenes,numGenes);
100 //qsort(allGenes,numGenes,sizeof(struct gene*),compare_gene_struct);
101 ///printf("Genes were sorted!\n");
105 for(gidx
=0;gidx
<numGenes
;gidx
++) {
106 for(eidx
=0;eidx
<allGenes
[gidx
]->num_exons
;eidx
++) {
107 exon_cov
+= allGenes
[gidx
]->exon_stops
[eidx
] - allGenes
[gidx
]->exon_starts
[eidx
];
109 printf("Exon coverage is %f\n",(double)exon_cov
/30432563);
111 process_reads(reads_fs
,&allGenes
,numGenes
,out_fs
);
113 status
= fclose(reads_fs
);
114 status
= fclose(out_fs
);
118 free(reads_filename
);
119 free(output_filename
);
123 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
, FILE* out_fs
) {
129 char* seq
= malloc(sizeof(char)*buffer_size
);
130 unsigned long id
= 0;
136 char* prb
= malloc(sizeof(char)*buffer_size
);
137 char* cal_prb
= malloc(sizeof(char)*buffer_size
);
138 char* chastity
= malloc(sizeof(char)*buffer_size
);
140 int reads_fid
= fileno(reads_fs
);
141 struct stat reads_stat
;
142 if ( fstat(reads_fid
,&reads_stat
) == -1) {
146 off_t reads_filesize
= reads_stat
.st_size
;
147 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize
);
148 int numReads
= reads_filesize
/ 178.0;
150 void *reads_area
= mmap (NULL
,reads_filesize
,PROT_READ
|PROT_WRITE
,MAP_PRIVATE
,reads_fid
,0);
151 if (reads_area
== MAP_FAILED
) {
156 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize
);
158 void* linePtr
= reads_area
;
159 char* current_line
= malloc(sizeof(char)*256);
162 // initialize boundary arrays
163 void** upstream_end
= malloc(sizeof(void*)*SIZE
);
164 void** upstream_overlap
= malloc(sizeof(void*)*SIZE
);
165 void** downstream_start
= malloc(sizeof(void*)*SIZE
);
166 void** downstream_overlap
= malloc(sizeof(void*)*SIZE
);
168 ue
= uo
= ds
= dov
= 0;
170 int skippedLinesCounter
= 0;
172 int prev_exon_start
= -1;
173 int prev_exon_stop
= -1;
174 int cur_exon_start
= -1;
176 current_line
= strncpy(current_line
,linePtr
,256);
179 struct gene
* currentGene
= (*allGenes
)[gene_idx
];
180 char* gene_id
= currentGene
->id
;
182 //char* disamb_seq = malloc(sizeof(char)*read_size);
184 int skippedReadCtr
= 0;
185 int uselessReadCtr
= 0;
186 int exonicReadCtr
= 0;
188 int prevOverlapCtr
= 0;
189 int currentStartCtr
= 0;
190 int currentOverlapCtr
= 0;
191 int multioccurReadCtr
= 0;
194 // start of the parsing loop
196 if (gene_idx
== numGenes
|| strcmp(current_line
,"") == 0)
199 gene_id
= currentGene
->id
;
201 if (readCtr
!= 0 && readCtr
% 1000000 == 0)
202 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx
,numGenes
,readCtr
,numReads
);
204 //if (gene_idx >= 1833)
205 // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
207 status
= sscanf(current_line
,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
208 &chr
,&pos
,seq
,&id
,&strand
,&mismatch
,&occurrence
,&size
,&cut
,prb
,cal_prb
,chastity
);
210 skippedLinesCounter
++;
214 // if the read is occurring several times elsewhere then get rid of it
215 //if(!(occurrence >= 1 && occurrence <= 25)) {
216 if ( occurrence
!= 1 ) {
221 if (!(currentGene
->start
<= pos
&& (pos
+ read_size
-1) <= currentGene
->stop
)) { // read is not within gene borders
223 if ( currentGene
->stop
< (pos
+ read_size
-1) ) { // go to next gene
226 currentGene
= (*allGenes
)[gene_idx
];
227 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
228 ue
= uo
= ds
= dov
= 0;
232 if ( pos
< currentGene
->start
) { // go to next read
237 while (*(char*)linePtr
!= '\n') linePtr
++;
240 current_line
= strncpy(current_line
,linePtr
,256);
244 } else { // read IS within gene borders
248 if (exon_idx
== currentGene
->num_exons
) {
251 currentGene
= (*allGenes
)[gene_idx
];
255 prev_exon_start
= currentGene
->exon_starts
[exon_idx
-1];
256 prev_exon_stop
= currentGene
->exon_stops
[exon_idx
-1];
257 cur_exon_start
= currentGene
->exon_starts
[exon_idx
];
259 //printf("exon %d %d inton til %d pos %d\n",prev_exon_start,prev_exon_stop,cur_exon_start,pos);
261 if (cur_exon_start
- prev_exon_stop
< min_overlap
|| cur_exon_start
< pos
) { // go to next exon
262 if (ue
!= 0 && dov
!= 0)
263 combine_info(prev_exon_stop
,cur_exon_start
,upstream_end
,ue
,downstream_overlap
,dov
,out_fs
,gene_id
,exon_idx
);
265 if (uo
!= 0 && ds
!= 0)
266 combine_info(prev_exon_stop
,cur_exon_start
,upstream_overlap
,uo
,downstream_start
,ds
,out_fs
,gene_id
,exon_idx
);
269 ue
= uo
= ds
= dov
= 0;
273 if ( prev_exon_start
< pos
&& (pos
+read_size
) < prev_exon_stop
) { // read is inside previous exon
276 // Removed exonic reads
278 //remove_ambiguities(seq,strlen(seq),disamb_seq);
279 //fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",chr,strand,disamb_seq,read_size,prb,cal_prb,chastity);
284 if ( pos
+ (read_size
-1) < prev_exon_stop
) // go to next read
287 // if this position is reached the read is somehow overlapping or
288 // exactly on the exon boundary. now determine where exactly:
289 if (pos
+ (read_size
-1) == prev_exon_stop
) { // read ends at previous exon end
291 upstream_end
[ue
] = linePtr
;
296 if ( (prev_exon_stop
- pos
) >= min_overlap
&& (prev_exon_stop
- pos
) <= max_overlap
) { // read overlaps with previous exon boundary
298 upstream_overlap
[uo
] = linePtr
;
303 if ( pos
== cur_exon_start
) { // read starts at current exon start
305 downstream_start
[ds
] = linePtr
;
310 if ( (cur_exon_start
- pos
) >= min_overlap
&& (cur_exon_start
- pos
) <= max_overlap
) { // read overlaps with current exon boundary
312 downstream_overlap
[dov
] = linePtr
;
318 goto next_read
; // read was not useful i.e. not overlapping/starting at exon boundaries
322 combine_info(prev_exon_stop
,cur_exon_start
,upstream_end
,ue
,downstream_overlap
,dov
,out_fs
,gene_id
,exon_idx
);
323 combine_info(prev_exon_stop
,cur_exon_start
,upstream_overlap
,uo
,downstream_start
,ds
,out_fs
,gene_id
,exon_idx
);
326 free(upstream_overlap
);
327 free(downstream_start
);
328 free(downstream_overlap
);
330 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr
,skippedLinesCounter
);
331 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr
,skippedReadCtr
,multioccurReadCtr
);
332 printf("%d reads were useless in total\n",uselessReadCtr
+skippedReadCtr
+multioccurReadCtr
);
333 printf("%d reads were totally exonic\n",exonicReadCtr
);
334 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",prevOverlapCtr
,currentOverlapCtr
);
335 printf("%d reads where newly combined from two original reads\n",combined_reads
);
336 printf("Total used reads: %d\n",exonicReadCtr
+endPrevCtr
+prevOverlapCtr
+currentStartCtr
+currentOverlapCtr
);
338 status
= munmap(reads_area
,reads_filesize
);
342 //free(current_line);
349 void combine_info(int exon_stop
, int exon_start
, void** upstream
, int up_size
, void** downstream
, int down_size
,FILE* out_fs
,const char* gene_id
, int exon_idx
) {
350 //printf("up/down size is %d/%d\n",up_size,down_size);
352 int up_idx
, down_idx
, status
;
353 char* upstream_line
= malloc(sizeof(char)*256);
354 char* downstream_line
= malloc(sizeof(char)*256);
361 char* up_seq
= malloc(sizeof(char)*buffer_size
);
363 char up_strand
= ' ';
365 int up_occurrence
= 0;
368 char* up_prb
= malloc(sizeof(char)*buffer_size
);
369 char* up_cal_prb
= malloc(sizeof(char)*buffer_size
);
370 char* up_chastity
= malloc(sizeof(char)*buffer_size
);
374 char* down_seq
= malloc(sizeof(char)*buffer_size
);
376 char down_strand
= ' ';
377 int down_mismatch
= 0;
378 int down_occurrence
= 0;
381 char* down_prb
= malloc(sizeof(char)*buffer_size
);
382 char* down_cal_prb
= malloc(sizeof(char)*buffer_size
);
383 char* down_chastity
= malloc(sizeof(char)*buffer_size
);
386 char* new_seq
= malloc(sizeof(char)*buffer_size
);
387 char new_strand
= ' ';
388 char* new_prb
= malloc(sizeof(char)*buffer_size
);
389 char* new_cal_prb
= malloc(sizeof(char)*buffer_size
);
390 char* new_chastity
= malloc(sizeof(char)*buffer_size
);
391 char* new_up_seq
= malloc(sizeof(char)*read_size
);
392 char* new_down_seq
= malloc(sizeof(char)*read_size
);
397 char* used_flag
= calloc(down_size
,sizeof(char));
399 for(up_idx
=0;up_idx
<up_size
;up_idx
++) {
400 strncpy(upstream_line
,upstream
[up_idx
],256);
401 status
= sscanf(upstream_line
,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
402 &up_chr
,&up_pos
,up_seq
,&up_id
,&up_strand
,&up_mismatch
,&up_occurrence
,&up_sz
,
403 &up_cut
,up_prb
,up_cal_prb
,up_chastity
);
405 remove_ambiguities(up_seq
,strlen(up_seq
),new_up_seq
);
407 overlap
= exon_stop
- up_pos
;
409 for(down_idx
=0;down_idx
<down_size
;down_idx
++) {
410 if( used_flag
[down_idx
] == 1)
413 strncpy(downstream_line
,downstream
[down_idx
],256);
414 status
= sscanf(downstream_line
,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
415 &down_chr
,&down_pos
,down_seq
,&down_id
,&down_strand
,&down_mismatch
,&down_occurrence
,&down_sz
,
416 &down_cut
,down_prb
,down_cal_prb
,down_chastity
);
418 remove_ambiguities(down_seq
,strlen(down_seq
),new_down_seq
);
420 if(up_strand
!= down_strand
)
425 new_cal_prb
[0] = '\0';
426 new_chastity
[0] = '\0';
429 fit
= fitting(up_prb
+(36-overlap
)-6,down_prb
+(exon_start
-down_pos
)-6);
433 if (! (fit
< overlap
))
437 new_strand
= up_strand
;
441 p_stop
= exon_start
+(read_size
-overlap
);
443 strncat(new_seq
,new_up_seq
,overlap
);
444 strncat(new_prb
,up_prb
,overlap
);
445 strncat(new_cal_prb
,up_cal_prb
,overlap
);
446 strncat(new_chastity
,up_chastity
,overlap
);
448 //strncat(new_seq,new_down_seq+fit,read_size-overlap);
449 //strncat(new_prb,down_prb+fit,read_size-overlap);
450 //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
451 //strncat(new_chastity,down_chastity+fit,read_size-overlap);
453 int offset
= exon_start
-down_pos
;
454 strncat(new_seq
,new_down_seq
+offset
,read_size
-overlap
);
455 strncat(new_prb
,down_prb
+offset
,read_size
-overlap
);
456 strncat(new_cal_prb
,down_cal_prb
+offset
,read_size
-overlap
);
457 strncat(new_chastity
,down_chastity
+offset
,read_size
-overlap
);
459 // The four last integers specify the positions of
460 // p_start : the position in the dna where the first read starts
461 // exons_stop : the position in the dna where the first exons ends
462 // exons_start : the position in the dna where the second exons starts
463 // p_stop : the position in the dna where the (truncated) second read ends
464 fprintf(out_fs
,"%08d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
465 read_nr
,new_chr
,new_strand
,new_seq
,splitpos
,read_size
,new_prb
,new_cal_prb
,new_chastity
,gene_id
,p_start
,exon_stop
,exon_start
,p_stop
);
469 used_flag
[down_idx
] = 1;
474 free(downstream_line
);
497 int fitting(char* up_prb, char* down_prb) {
500 double current_mean_up = 0;
501 double current_mean_down = 0;
504 for(uidx=0;uidx<w_size;uidx++) {
505 current_mean_up += up_prb[uidx]-50;
506 current_mean_down += down_prb[uidx]-50;
509 current_mean_up /= w_size;
510 current_mean_down /= w_size;
512 double min_val = fmin(current_mean_up,current_mean_down);
513 double max_val = fmax(current_mean_up,current_mean_down);
514 if ( (min_val / max_val) <= 0.33 )
521 int fitting(char* up_prb
, char* down_prb
) {
522 double epsilon_mean
= 30.0;
523 double epsilon_var
= 30.0;
526 double current_mean_up
= 0;
527 double current_variance_up
= 0;
528 double current_mean_down
= 0;
529 double current_variance_down
= 0;
531 double* mean_up
= malloc(sizeof(double)*read_size
-2*w_size
);
532 double* variance_up
= malloc(sizeof(double)*read_size
-2*w_size
);
533 double* mean_down
= malloc(sizeof(double)*read_size
-2*w_size
);
534 double* variance_down
= malloc(sizeof(double)*read_size
-2*w_size
);
540 for(uidx
=iidx
-w_size
;uidx
<iidx
;uidx
++) {
542 current_mean_up
+= up_prb
[uidx
]-50;
543 current_mean_down
+= up_prb
[didx
]-50;
546 current_mean_up
/= w_size
;
547 current_mean_down
/= w_size
;
549 for(uidx
=iidx
-w_size
;uidx
<iidx
;uidx
++) {
551 current_variance_up
+= pow(up_prb
[uidx
]-50 - current_mean_up
,2);
552 current_variance_down
+= pow(up_prb
[didx
]-50 - current_mean_up
,2);
555 current_variance_up
/= (w_size
-1);
556 current_variance_down
/= (w_size
-1);
558 for(iidx
=w_size
;iidx
<30;iidx
++) {
559 for(uidx
=iidx
-w_size
;uidx
<iidx
;uidx
++) {
561 mean_up
[iidx
-w_size
] += down_prb
[uidx
]-50;
562 mean_down
[iidx
-w_size
] += down_prb
[didx
]-50;
565 mean_up
[iidx
-w_size
] /= w_size
;
566 mean_down
[iidx
-w_size
] /= w_size
;
568 for(uidx
=iidx
-w_size
;uidx
<iidx
;uidx
++) {
570 variance_up
[iidx
-w_size
] += pow(down_prb
[uidx
]-50 - mean_up
[iidx
-w_size
],2);
571 variance_down
[iidx
-w_size
] += pow(down_prb
[didx
]-50 - mean_down
[iidx
-w_size
],2);
573 variance_up
[iidx
-w_size
] /= (w_size
-1);
574 variance_down
[iidx
-w_size
] /= (w_size
-1);
576 //printf("means: %f %f %f %f, variances: %f %f %f %f\n",current_mean_up,current_mean_down,mean_up,mean_down,current_variance_up,current_variance_down,variance_up,variance_down);
577 //if ( abs(current_mean_up - mean_up) < epsilon_mean && abs(current_variance_up - variance_up) < epsilon_var
578 //&& abs(current_mean_down - mean_down) < epsilon_mean && abs(current_variance_down - variance_down) < epsilon_var )
585 for(bidx
=0;bidx
<read_size
-2*w_size
;bidx
++) {
586 if ( abs(current_mean_up
- mean_up
[bidx
]) < epsilon_mean
&& abs(current_variance_up
- variance_up
[bidx
]) < epsilon_var
587 && abs(current_mean_down
- mean_down
[bidx
]) < epsilon_mean
&& abs(current_variance_down
- variance_down
[bidx
]) < epsilon_var
) {
588 if ( abs(current_mean_up
- mean_up
[bidx
]) + abs(current_variance_up
- variance_up
[bidx
]) + abs(current_mean_down
- mean_down
[bidx
]) + abs(current_variance_down
- variance_down
[bidx
])) {
589 min
= abs(current_mean_up
- mean_up
[bidx
]) + abs(current_variance_up
- variance_up
[bidx
]) + abs(current_mean_down
- mean_down
[bidx
]) + abs(current_variance_down
- variance_down
[bidx
]);
603 void remove_ambiguities(char * old_seq
, int old_seq_size
, char* new_seq
) {
607 while(idx
<old_seq_size
) {
608 //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
609 if (old_seq
[idx
] == ']' || old_seq
[idx
] == '-' ) {
614 if (old_seq
[idx
] == '[') {
616 //printf("%c %c\n",old_seq[idx-2],old_seq[idx]);
620 new_seq
[new_idx
] = old_seq
[idx
];
626 printf("Error: Sequence is not of length 36!\n");
627 printf("old seq: %s\n",old_seq
);
628 printf("new seq: %s\n",new_seq
);
635 * - Check strand -> done simple (only if equal)
636 * - check for [AC] and similar entries -> done simple (see function
637 * - remove_ambiguities (exchanges [XY] by the second entry)