1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
7 ////////////////////////////////////////////////////////////
18 #include "datastructures.h"
20 int parse_gff(char* filename
,FILE* fid
,struct gene
*** allGenes
);
22 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
,FILE* out_fs
);
24 void combine_info(int exon_stop
, int exon_start
, void** upstream
, int up_size
, void** downstream
, int down_size
, FILE* out_fs
);
26 int fitting(char* up_prb
, char* up_prb_end
, char* down_prb
, char* down_prb_end
);
28 void remove_ambiguities(char * old_seq
, int old_seq_size
, char* new_seq
);
30 static char *info
= "Usage is:\n./filterReads gff reads output";
32 const int read_size
= 36;
34 int main(int argc
, char* argv
[]) {
42 int filenameSize
= 256;
43 char *gff_filename
= malloc(sizeof(char)*filenameSize
);
44 char *reads_filename
= malloc(sizeof(char)*filenameSize
);
45 char *output_filename
= malloc(sizeof(char)*filenameSize
);
47 strncpy(gff_filename
,argv
[1],filenameSize
);
48 strncpy(reads_filename
,argv
[2],filenameSize
);
49 strncpy(output_filename
,argv
[3],filenameSize
);
51 FILE *gff_fs
= fopen(gff_filename
,"r");
52 FILE *reads_fs
= fopen(reads_filename
,"r");
53 FILE *out_fs
= fopen(output_filename
,"w");
56 printf("Error: Could not open file: %s",gff_filename
);
60 if(reads_fs
== NULL
) {
61 printf("Error: Could not open file: %s",reads_filename
);
66 printf("Error: Could not open file: %s",output_filename
);
70 struct gene
** allGenes
;
71 int numGenes
= parse_gff(gff_filename
,gff_fs
,&allGenes
);
72 status
= fclose(gff_fs
);
74 printf("closing of gff filestream failed!\n");
76 printf("Successfully parsed gff file! Found %d genes.\n",numGenes
);
78 process_reads(reads_fs
,&allGenes
,numGenes
,out_fs
);
80 status
= fclose(reads_fs
);
81 status
+= fclose(out_fs
);
83 printf("closing of filestreams failed!\n");
91 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
, FILE* out_fs
) {
97 char* seq
= malloc(sizeof(char)*buffer_size
);
104 char* prb
= malloc(sizeof(char)*buffer_size
);
105 char* cal_prb
= malloc(sizeof(char)*buffer_size
);
106 char* chastity
= malloc(sizeof(char)*buffer_size
);
108 int reads_fid
= fileno(reads_fs
);
109 struct stat reads_stat
;
110 if ( fstat(reads_fid
,&reads_stat
) == -1) {
114 off_t reads_filesize
= reads_stat
.st_size
;
116 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize
);
118 void *reads_area
= mmap (NULL
,reads_filesize
,PROT_READ
|PROT_WRITE
,MAP_PRIVATE
,reads_fid
,0);
119 if (reads_area
== MAP_FAILED
) {
125 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize
);
127 void* linePtr
= reads_area
;
128 char* current_line
= malloc(sizeof(char)*256);
131 // initialize boundary arrays
132 void** upstream_end
= malloc(sizeof(void*)*SIZE
);
133 void** upstream_overlap
= malloc(sizeof(void*)*SIZE
);
134 void** downstream_start
= malloc(sizeof(void*)*SIZE
);
135 void** downstream_overlap
= malloc(sizeof(void*)*SIZE
);
137 ue
= uo
= ds
= dov
= 0;
139 int skippedLinesCounter
= 0;
141 int prev_exon_stop
= -1;
142 int cur_exon_start
= -1;
144 current_line
= strncpy(current_line
,linePtr
,256);
147 struct gene
* currentGene
= (*allGenes
)[gene_idx
];
150 // start of the parsing loop
152 if (gene_idx
== numGenes
|| strcmp(current_line
,"") == 0)
155 if (readCtr
!= 0 && readCtr
% 100000 == 0)
156 printf("Processed %d reads.\n",readCtr
);
158 //if (readCtr == 10000000)
159 // exit(EXIT_FAILURE);
161 status
= sscanf(current_line
,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
162 &chr
,&pos
,seq
,&id
,&strand
,&mismatch
,&occurrence
,&size
,&cut
,prb
,cal_prb
,chastity
);
164 skippedLinesCounter
++;
167 //printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
169 // if the read is occurring several times elsewhere then get rid of it
170 if(occurrence
!= 1) {
171 while (*(char*)linePtr
!= '\n') linePtr
++;
174 current_line
= strncpy(current_line
,linePtr
,256);
178 if (!(currentGene
->start
<= pos
&& (pos
+ read_size
-1) <= currentGene
->stop
)) { // read is not within gene borders
180 if ( currentGene
->stop
< (pos
+ read_size
-1) ) { // go to next gene
183 currentGene
= (*allGenes
)[gene_idx
];
187 if ( pos
< currentGene
->start
) { // go to next read
190 while (*(char*)linePtr
!= '\n') linePtr
++;
193 current_line
= strncpy(current_line
,linePtr
,256);
197 } else { // read IS within gene borders
200 prev_exon_stop
= currentGene
->exon_stops
[exon_idx
-1];
201 cur_exon_start
= currentGene
->exon_starts
[exon_idx
];
202 //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
203 //printf("exon_idx %d\n",exon_idx);
205 if (exon_idx
== currentGene
->num_exons
) {
208 currentGene
= (*allGenes
)[gene_idx
];
212 if (cur_exon_start
- prev_exon_stop
< 6 || cur_exon_start
< pos
) { // go to next exon
214 combine_info(prev_exon_stop
,cur_exon_start
,upstream_end
,ue
,downstream_overlap
,dov
,out_fs
);
215 combine_info(prev_exon_stop
,cur_exon_start
,upstream_overlap
,uo
,downstream_start
,ds
,out_fs
);
216 ue
= uo
= ds
= dov
= 0;
220 if ( pos
< prev_exon_stop
- read_size
+ 1 ) { // go to next read
224 // if this position is reached the read is somehow overlapping or
225 // exactly on the exon boundary. now determine where exactly:
227 if (pos
+ (read_size
-1) == prev_exon_stop
) { // read ends at previous exon end
228 upstream_end
[ue
] = linePtr
;
233 if (prev_exon_stop
- pos
>= 6 && prev_exon_stop
- pos
<= 30) { // read overlaps with previous exon boundary
234 upstream_overlap
[uo
] = linePtr
;
239 if (pos
== cur_exon_start
) { // read starts at current exon start
240 downstream_start
[ds
] = linePtr
;
245 if (cur_exon_start
- pos
>= 6 && cur_exon_start
- pos
<= 30 ) { // read overlaps with current exon boundary
246 downstream_overlap
[dov
] = linePtr
;
255 combine_info(prev_exon_stop
,cur_exon_start
,upstream_end
,ue
,downstream_overlap
,dov
,out_fs
);
256 combine_info(prev_exon_stop
,cur_exon_start
,upstream_overlap
,uo
,downstream_start
,ds
,out_fs
);
259 free(upstream_overlap
);
260 free(downstream_start
);
261 free(downstream_overlap
);
263 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr
,skippedLinesCounter
);
265 status
= munmap(reads_area
,reads_filesize
);
267 printf("munmap failed!\n");
275 void combine_info(int exon_stop
, int exon_start
, void** upstream
, int up_size
, void** downstream
, int down_size
,FILE* out_fs
) {
276 //printf("up_/down_size %d %d\n",up_size,down_size);
278 if (up_size
== 0 || down_size
== 0 || exon_stop
== -1)
281 int up_idx
, down_idx
, status
;
282 char* upstream_line
= malloc(sizeof(char)*256);
283 char* downstream_line
= malloc(sizeof(char)*256);
289 char* up_seq
= malloc(sizeof(char)*buffer_size
);
291 char up_strand
= ' ';
293 int up_occurrence
= 0;
296 char* up_prb
= malloc(sizeof(char)*buffer_size
);
297 char* up_cal_prb
= malloc(sizeof(char)*buffer_size
);
298 char* up_chastity
= malloc(sizeof(char)*buffer_size
);
302 char* down_seq
= malloc(sizeof(char)*buffer_size
);
304 char down_strand
= ' ';
305 int down_mismatch
= 0;
306 int down_occurrence
= 0;
309 char* down_prb
= malloc(sizeof(char)*buffer_size
);
310 char* down_cal_prb
= malloc(sizeof(char)*buffer_size
);
311 char* down_chastity
= malloc(sizeof(char)*buffer_size
);
314 char* new_seq
= malloc(sizeof(char)*buffer_size
);
315 char new_strand
= ' ';
316 char* new_prb
= malloc(sizeof(char)*buffer_size
);
317 char* new_cal_prb
= malloc(sizeof(char)*buffer_size
);
318 char* new_chastity
= malloc(sizeof(char)*buffer_size
);
323 //printf("up_/down_idx %d %d\n",up_idx,down_idx);
325 if (up_idx
== up_size
|| down_idx
== down_size
)
328 if ( up_strand
!= down_strand
)
331 strncpy(upstream_line
,upstream
[up_idx
],256);
332 status
= sscanf(upstream_line
,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
333 &up_chr
,&up_pos
,up_seq
,&up_id
,&up_strand
,&up_mismatch
,&up_occurrence
,&up_sz
,
334 &up_cut
,up_prb
,up_cal_prb
,up_chastity
);
336 strncpy(downstream_line
,downstream
[down_idx
],256);
337 status
= sscanf(downstream_line
,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
338 &down_chr
,&down_pos
,down_seq
,&down_id
,&down_strand
,&down_mismatch
,&down_occurrence
,&down_sz
,
339 &down_cut
,down_prb
,down_cal_prb
,down_chastity
);
341 char* new_up_seq
= malloc(sizeof(char)*read_size
);
342 char* new_down_seq
= malloc(sizeof(char)*read_size
);
344 remove_ambiguities(up_seq
,strlen(up_seq
),new_up_seq
);
345 remove_ambiguities(down_seq
,strlen(down_seq
),new_down_seq
);
349 new_cal_prb
[0] = '\0';
350 new_chastity
[0] = '\0';
354 if (up_pos
+35 == exon_stop
) { // merge with read which is downstream overlapping
355 int overlap
= exon_start
- down_pos
;
356 //printf("overlap is %d\n",overlap);
357 //printf("pos are: %d %d\n",up_pos,down_pos);
359 fit
= fitting(up_prb
+(36-w_size
),up_prb
+36,down_prb
+overlap
,down_prb
+overlap
+w_size
);
364 new_strand
= up_strand
;
366 strncat(new_seq
,new_up_seq
+(36-overlap
),overlap
);
367 strncat(new_prb
,up_prb
+(36-overlap
),overlap
);
368 strncat(new_cal_prb
,up_cal_prb
+(36-overlap
),overlap
);
369 strncat(new_chastity
,up_chastity
+(36-overlap
),overlap
);
371 strncat(new_seq
,new_down_seq
+overlap
,36-overlap
);
372 strncat(new_prb
,down_prb
+overlap
,36-overlap
);
373 strncat(new_cal_prb
,down_cal_prb
+overlap
,36-overlap
);
374 strncat(new_chastity
,down_chastity
+overlap
,36-overlap
);
376 } // merge with read which is upstream overlapping
378 if (down_pos
== exon_start
) {
379 int overlap
= up_pos
+36 - exon_stop
;
380 //printf("overlap is %d\n",overlap);
381 //printf("pos are: %d %d\n",up_pos,down_pos);
382 fit
= fitting(up_prb
+36-overlap
-w_size
,up_prb
+36-overlap
,down_prb
,down_prb
+w_size
);
387 new_strand
= up_strand
;
389 strncat(new_seq
,new_up_seq
,(36-overlap
));
390 strncat(new_prb
,up_prb
,(36-overlap
));
391 strncat(new_cal_prb
,up_cal_prb
,(36-overlap
));
392 strncat(new_chastity
,up_chastity
,(36-overlap
));
394 strncat(new_seq
,new_down_seq
,overlap
);
395 strncat(new_prb
,down_prb
,overlap
);
396 strncat(new_cal_prb
,down_cal_prb
,overlap
);
397 strncat(new_chastity
,down_chastity
,overlap
);
400 //printf("New entry!\n");
401 fprintf(out_fs
,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
402 new_chr
,new_strand
,new_seq
,read_size
,new_prb
,new_cal_prb
,new_chastity
);
414 int fitting(char* up_prb
, char* up_prb_end
, char* down_prb
, char* down_prb_end
) {
415 double epsilon_mean
= 10.0;
416 double epsilon_var
= 10.0;
418 double variance_up
= 0;
419 double mean_down
= 0;
420 double variance_down
= 0;
422 char *up_ptr
= up_prb
;
423 char *down_ptr
= down_prb
;
426 while(up_ptr
!= up_prb_end
) {
427 mean_up
+= (*up_ptr
)-50;
428 mean_down
+= (*down_ptr
)-50;
436 //printf("means: %f %f\n",mean_up,mean_down);
441 while(up_ptr
!= up_prb_end
) {
442 variance_up
+= pow((*up_prb
)-50 - mean_up
,2);
443 variance_down
+= pow((*down_prb
)-50 - mean_down
,2);
448 variance_up
/= (w_size
-1);
449 variance_down
/= (w_size
-1);
451 //printf("variances: %f %f\n",variance_up,variance_down);
453 if ( abs(mean_up
- mean_down
) < epsilon_mean
&& abs(variance_up
- variance_down
) < epsilon_var
)
459 void remove_ambiguities(char * old_seq
, int old_seq_size
, char* new_seq
) {
460 //printf("old seq: %s\n",old_seq);
461 //printf("new seq: %s\n",new_seq);
465 while(idx
<old_seq_size
) {
466 if (old_seq
[idx
] == '[') {
467 new_seq
[new_idx
] = old_seq
[++idx
];
473 new_seq
[new_idx
] = old_seq
[idx
++];
476 //printf("old seq: %s\n",old_seq);
477 //printf("new seq: %s\n",new_seq);
482 * - Check strand -> done simple (only if equal)
483 * - check for [AC] and similar entries -> done simple (see function
484 * remove_ambiguities (exchanges [XY] by the first entry)