7 #include "datastructures.h"
11 * In the current gff files from the TAIR repository
12 * the exon boundaries are defined as follows:
18 * whereas exon stops at:
/*
 * get_regerror: produce a heap-allocated, human-readable message for a
 * POSIX regex error code.
 *
 *   errcode   error code to describe (in this file it is the value
 *             returned by regexec)
 *   compiled  the regex_t the error relates to; forwarded to regerror()
 *
 * regerror() is called twice: once with a NULL buffer of size 0 purely to
 * learn how many bytes the message needs, and once more to fill a buffer
 * of exactly that size.
 *
 * NOTE(review): the return statement and closing brace are not visible
 * here; presumably `buffer` is returned and the caller must free() it --
 * confirm against the complete function.
 */
26 char* get_regerror (int errcode
, regex_t
*compiled
) {
// Sizing call: with a zero-sized buffer regerror() only reports the
// number of bytes required for the message.
27 size_t length
= regerror (errcode
, compiled
, NULL
, 0);
// NOTE(review): the malloc result is handed to regerror() below without
// a NULL check.
28 char *buffer
= malloc (length
);
// Second call writes the message into buffer; its return value (the
// size again) is deliberately discarded.
29 (void) regerror (errcode
, compiled
, buffer
, length
);
/*
 * get_id: run the compiled pattern `rx` against `desc` and copy a
 * sub-range of the first match into a newly malloc'd buffer.
 *
 *   rx    compiled regular expression, received by value
 *   desc  text to search
 *
 * The copied range starts 3 characters after the start of the match and
 * stops 1 character before its end. parse_gff compiles the pattern
 * "ID=[^;]*;" and passes it here, so for that caller the offsets strip
 * the leading "ID=" and the trailing ';'.
 *
 * NOTE(review): several lines of this function are not visible here: the
 * declaration of `nmatch`, whatever test surrounds the use of `regerr`
 * and `mes`, any store of a terminating '\0' into `id`, and the return
 * statement. Presumably `id` is returned and owned by the caller --
 * confirm against the complete function.
 * NOTE(review): `rx` is a regex_t passed by value and its address is
 * then given to regexec(); taking a `const regex_t *` instead would avoid
 * copying the compiled-pattern structure.
 */
33 char* get_id(regex_t rx
, const char* desc
) {
// Storage for nmatch match records; the malloc result is not checked
// before regexec() writes into it.
35 regmatch_t
* all_matches
= malloc(sizeof(regmatch_t
)*nmatch
);
// REG_NOTBOL tells regexec() that the first character of desc is not the
// beginning of a line.
36 int regerr
= regexec(&rx
, desc
, nmatch
, all_matches
, REG_NOTBOL
);
// Text form of regerr. The condition guarding this line is not visible
// here -- presumably it is only reached when regerr is non-zero.
39 char* mes
= get_regerror(regerr
,&rx
);
44 //printf("%s\n",desc);
// all_matches[0] spans the whole match; rm_so / rm_eo are byte offsets
// into desc. Skip the first 3 matched characters...
46 int start
= all_matches
[0].rm_so
+3;
// ...and leave out the last matched character.
47 int end
= all_matches
[0].rm_eo
-1;
48 assert( start
<= end
);
50 int id_size
= end
- start
;
// One extra byte beyond the copied characters (room for a terminator).
51 char* id
= malloc(sizeof(char)*(id_size
+1));
// Copies exactly id_size characters. strncpy() adds no '\0' when the
// source has at least id_size characters, so termination must come from
// a separate store (not visible here -- confirm).
53 strncpy(id
,desc
+start
,id_size
);
55 //printf("desc is %s, id is %s\n",desc,id);
/*
 * parse_gff: read tab-separated GFF records from `fid` with fscanf() and
 * build an array of struct gene pointers in *allGenes.
 *
 *   filename  path given to freopen() to reopen `fid` between the two
 *             scans of the file
 *   fid       input stream the records are read from
 *   allGenes  out-parameter; *allGenes is set to a malloc'd array of
 *             `struct gene *` with numGenes slots
 *
 * Visible outline:
 *   1. allocate scratch buffers and compile the pattern "ID=[^;]*;";
 *   2. first scan: read records and test for type "gene";
 *   3. freopen() the file, allocate *allGenes;
 *   4. second scan: on "gene" allocate a gene and fill start, stop,
 *      strand and id; on "exon" call add_exon(); "pseudogene" is also
 *      recognised.
 *
 * NOTE(review): the loop headers, most braces, the declarations of rx,
 * status, start, stop, idx and numGenes, the bodies of several `if`
 * tests, any cleanup of the scratch buffers, and the return statement are
 * not visible here, so the meaning of the int return value cannot be
 * stated -- confirm against the complete function.
 */
62 int parse_gff(char *filename
, FILE* fid
,struct gene
*** allGenes
) {
// Scratch buffers for the %s fields of one record, buffer_size bytes
// each. None of the malloc results is checked in the visible lines.
64 int buffer_size
= 256;
65 char* chr
= malloc(sizeof(char)*buffer_size
);
66 char* blah
= malloc(sizeof(char)*buffer_size
);
67 char* id
= malloc(sizeof(char)*buffer_size
);
68 char* desc
= malloc(sizeof(char)*buffer_size
);
// 4-byte buffers that receive the three single-character (%c) fields.
71 char* xy
= malloc(sizeof(char)*4);
72 char* strand
= malloc(sizeof(char)*4);
73 char* xz
= malloc(sizeof(char)*4);
// Pattern later handed to get_id() through rx: the literal "ID=", any
// run of characters other than ';', then a ';'. cflags 0 selects POSIX
// basic regular expression syntax.
76 const char* pattern
= "ID=[^;]*;";
77 if ( regcomp(&rx
, pattern
, 0) != 0) {
82 // do one pass through the gff line to determine the number of
// Record layout read here: three strings, two ints, three single
// characters, one string.
// NOTE(review): the %s conversions carry no field width, so a field of
// buffer_size or more characters writes past its buffer.
// NOTE(review): %c stores one character and no terminator, yet strand is
// later compared with strcmp(); nothing in the visible lines sets
// strand[1] to '\0' -- confirm.
87 status
= fscanf(fid
,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr
,blah
,id
,&start
,&stop
,xy
,strand
,xz
,desc
);
// True when at least the first five fields (through stop) were converted
// and the third field is "gene". The statement it guards is not visible
// here -- presumably it counts genes into numGenes.
91 if ( status
>= 5 && strcmp(id
,"gene")==0)
// Reopen the file by name on the same stream for the second scan.
// NOTE(review): freopen()'s return value is discarded, so a failed
// reopen is not detected here.
95 freopen(filename
,"r",fid
);
// Array of numGenes `struct gene *` slots; used without a NULL check in
// the visible lines.
98 (*allGenes
) = malloc(sizeof(struct gene
*)*numGenes
);
99 (*allGenes
)[idx
] = NULL
;
101 int skippedLinesCounter
= 0;
// Second-scan read; same record layout as the fscanf() call of the first
// scan, and the same width / terminator caveats apply.
103 status
= fscanf(fid
,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr
,blah
,id
,&start
,&stop
,xy
,strand
,xz
,desc
);
// The condition under which a line counts as skipped is not visible here.
108 skippedLinesCounter
++;
// --- "gene" record: start a new gene entry ---
112 if (strcmp(id
,"gene")==0) {
// Body of this test is not visible here; presumably it moves idx past a
// previously filled slot -- confirm.
113 if ( (*allGenes
)[idx
] !=NULL
)
116 (*allGenes
)[idx
] = gene_alloc();
117 (*allGenes
)[idx
]->start
= start
;
118 (*allGenes
)[idx
]->stop
= stop
;
120 //printf("strand: %s %d\n",strand,strcmp(strand,"+"));
// Strand encoding stored on the gene: "+" is recorded as 'D' and "-" as
// 'P'; 'z' is a sentinel that the assert below rejects. The else/brace
// lines linking these statements are not visible here.
122 if (strcmp(strand
,"+") == 0) {
123 (*allGenes
)[idx
]->strand
= 'D';
125 if (strcmp(strand
,"-") == 0)
126 (*allGenes
)[idx
]->strand
= 'P';
128 (*allGenes
)[idx
]->strand
= 'z';
130 assert( (*allGenes
)[idx
]->strand
!= 'z' );
// Gene id is taken from the last field of the record via the compiled
// "ID=...;" pattern; the gene stores the pointer get_id() returns.
132 (*allGenes
)[idx
]->id
= get_id(rx
,desc
);
133 //printf("gene start/stop: %d/%d\n",start,stop);
// --- "exon" record: attach the interval to the current gene ---
137 if (strcmp(id
,"exon")==0) {
138 add_exon((*allGenes
)[idx
],start
,stop
);
139 //printf("exon start/stop: %d/%d\n",start-1,stop);
// --- "pseudogene" record; the action taken when the current slot is
// non-NULL is not visible here ---
143 if (strcmp(id
,"pseudogene")==0) {
144 if ( (*allGenes
)[idx
] !=NULL
)
// Context and body of this final non-NULL test are not visible here.
149 if ( (*allGenes
)[idx
] !=NULL
)
152 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
153 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
154 //printf("Skipped %d lines.\n",skippedLinesCounter);