+ minor changes in the paths
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <regex.h>
5 #include <assert.h>
6
7 #include "datastructures.h"
8
9 /*
10 * Note:
11 * In the current gff files from the TAIR repository
12 * the exon boundaries are defined as follows:
13 *
14 * exon starts at:
15 *
16 * agxy
17 * ^
18 * whereas exon stops at:
19 *
20 * xygt
21 * ^
22 *
23 *
24 */
25
/*
 * Build the human-readable message for a regcomp()/regexec() error code.
 *
 * errcode  - non-zero code returned by regcomp() or regexec()
 * compiled - the regex_t the failing call operated on
 *
 * Returns a newly malloc'ed, NUL-terminated string that the caller must
 * free(), or NULL if the allocation fails.
 */
char* get_regerror (int errcode, regex_t *compiled) {
    /* With a zero-sized buffer regerror() only reports the size needed
     * (terminating NUL included), so the message can be sized exactly. */
    size_t length = regerror (errcode, compiled, NULL, 0);
    char *buffer = malloc (length);

    /* Never hand a NULL buffer with a non-zero size to regerror(). */
    if (buffer == NULL)
        return NULL;

    (void) regerror (errcode, compiled, buffer, length);
    return buffer;
}
32
/*
 * Extract the value of the first "ID=<value>;" attribute found in desc.
 *
 * rx   - compiled expression whose match spans the whole "ID=<value>;"
 *        text (the 3-character prefix and the 1-character terminator are
 *        stripped from the match)
 * desc - NUL-terminated attribute string to search
 *
 * Returns a newly malloc'ed, NUL-terminated copy of <value>; the caller
 * owns it and must free() it. If desc does not match, or memory cannot
 * be allocated, a message is written to stderr and the process exits
 * with EXIT_FAILURE.
 */
char* get_id(regex_t rx, const char* desc) {
    regmatch_t match;
    int regerr = regexec(&rx, desc, 1, &match, REG_NOTBOL);

    if (regerr != 0) {
        /* regexec() does not set errno, so report its own message via
         * regerror() instead of perror(). */
        char msg[128];
        (void) regerror(regerr, &rx, msg, sizeof msg);
        fprintf(stderr, "get_id: %s: \"%s\"\n", msg, desc);
        exit(EXIT_FAILURE);
    }

    /* Skip the leading "ID=" and drop the trailing ';'. */
    regoff_t start = match.rm_so + 3;
    regoff_t end = match.rm_eo - 1;
    assert(start <= end);

    size_t id_size = (size_t)(end - start);
    char *id = malloc(id_size + 1);
    if (id == NULL) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    memcpy(id, desc + start, id_size);
    id[id_size] = '\0';

    return id;
}
61
62 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
63
64 int buffer_size = 256;
65 char* chr = malloc(sizeof(char)*buffer_size);
66 char* blah = malloc(sizeof(char)*buffer_size);
67 char* id = malloc(sizeof(char)*buffer_size);
68 char* desc = malloc(sizeof(char)*buffer_size);
69 int start = 0;
70 int stop = 0;
71 char* xy = malloc(sizeof(char)*4);
72 char* strand = malloc(sizeof(char)*4);
73 char* xz = malloc(sizeof(char)*4);
74
75 regex_t rx;
76 const char* pattern = "ID=[^;]*;";
77 if ( regcomp(&rx, pattern, 0) != 0) {
78 perror("regcomp");
79 exit(EXIT_FAILURE);
80 }
81
82 // do one pass through the gff line to determine the number of
83 // genes
84 int numGenes = 0;
85 int status = 0;
86 while(1) {
87 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
88 if(status == EOF)
89 break;
90
91 if ( status >= 5 && strcmp(id,"gene")==0)
92 numGenes++;
93
94 }
95 freopen(filename,"r",fid);
96
97 int idx = 0;
98 (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
99 (*allGenes)[idx] = NULL;
100
101 int skippedLinesCounter = 0;
102 while(1) {
103 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
104 if(status == EOF)
105 break;
106
107 if (status < 7) {
108 skippedLinesCounter++;
109 continue;
110 }
111
112 if (strcmp(id,"gene")==0) {
113 if ( (*allGenes)[idx] !=NULL )
114 idx++;
115
116 (*allGenes)[idx] = gene_alloc();
117 (*allGenes)[idx]->start = start;
118 (*allGenes)[idx]->stop = stop;
119
120 //printf("strand: %s %d\n",strand,strcmp(strand,"+"));
121
122 if (strcmp(strand,"+") == 0) {
123 (*allGenes)[idx]->strand = 'D';
124 } else {
125 if (strcmp(strand,"-") == 0)
126 (*allGenes)[idx]->strand = 'P';
127 else
128 (*allGenes)[idx]->strand = 'z';
129 }
130 assert( (*allGenes)[idx]->strand != 'z' );
131
132 (*allGenes)[idx]->id = get_id(rx,desc);
133 //printf("gene start/stop: %d/%d\n",start,stop);
134 continue;
135 }
136
137 if (strcmp(id,"exon")==0) {
138 add_exon((*allGenes)[idx],start,stop);
139 //printf("exon start/stop: %d/%d\n",start-1,stop);
140 continue;
141 }
142
143 if (strcmp(id,"pseudogene")==0) {
144 if ( (*allGenes)[idx] !=NULL )
145 idx++;
146 }
147 }
148
149 if ( (*allGenes)[idx] !=NULL )
150 idx++;
151
152 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
153 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
154 //printf("Skipped %d lines.\n",skippedLinesCounter);
155
156 regfree(&rx);
157 free(chr);
158 free(blah);
159 free(id);
160 free(desc);
161 free(xy);
162 free(strand);
163 free(xz);
164
165 return numGenes;
166 }