79fe130d7d78f3113dc4c3415ce6800be7296617
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <regex.h>
5 #include <assert.h>
6
7 #include "datastructures.h"
8
/*
 * Build a human-readable message for a POSIX regex error code.
 *
 * errcode:  the non-zero value returned by regcomp() or regexec().
 * compiled: the regex_t the failing call operated on.
 *
 * Returns a freshly malloc'd, NUL-terminated message which the caller must
 * free(), or NULL when the buffer cannot be allocated.
 */
char* get_regerror (int errcode, regex_t *compiled) {
    /* With a zero-sized buffer regerror() only reports the size required
     * (terminating NUL included). */
    size_t length = regerror (errcode, compiled, NULL, 0);
    if (length == 0)
        length = 1;     /* never request a zero-byte allocation */

    char *buffer = malloc (length);
    if (buffer == NULL)
        return NULL;

    buffer[0] = '\0';
    (void) regerror (errcode, compiled, buffer, length);
    return buffer;
}
15
/*
 * Extract the value of the "ID" attribute from a GFF attribute string.
 *
 * rx:   compiled regular expression whose overall match has the form
 *       "ID=<value>;" (parse_gff compiles "ID=[^;]*;" for this purpose).
 * desc: NUL-terminated attribute string to search.
 *
 * Returns a freshly malloc'd, NUL-terminated copy of <value>; the caller
 * owns it and must free() it.  If the expression does not match, or memory
 * runs out, a message is written to stderr and the process exits with
 * EXIT_FAILURE.
 */
char* get_id(regex_t rx, const char* desc) {
    /* Only the overall match is needed, so a single stack slot suffices
     * and nothing has to be freed afterwards. */
    regmatch_t match;
    int regerr = regexec(&rx, desc, 1, &match, REG_NOTBOL);

    if ( regerr != 0 ) {
        /* regexec() does not set errno, so format the regex error itself
         * instead of going through perror(). */
        char mes[256];
        (void) regerror(regerr, &rx, mes, sizeof mes);
        fprintf(stderr, "get_id: %s (in \"%s\")\n", mes, desc);
        exit(EXIT_FAILURE);
    }

    /* The match spans "ID=<value>;": skip the three characters of "ID="
     * and stop before the trailing ';'. */
    size_t start = (size_t) match.rm_so + 3;
    size_t end = (size_t) match.rm_eo - 1;
    assert( start <= end );

    size_t id_size = end - start;
    char* id = malloc(id_size + 1);
    if ( id == NULL ) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    memcpy(id, desc + start, id_size);
    id[id_size] = '\0';

    return id;
}
40
41 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
42
43 int buffer_size = 256;
44 char* chr = malloc(sizeof(char)*buffer_size);
45 char* blah = malloc(sizeof(char)*buffer_size);
46 char* id = malloc(sizeof(char)*buffer_size);
47 char* desc = malloc(sizeof(char)*buffer_size);
48 int start = 0;
49 int stop = 0;
50 char* xy = malloc(sizeof(char)*4);
51 char* strand = malloc(sizeof(char)*4);
52 char* xz = malloc(sizeof(char)*4);
53
54 regex_t rx;
55 const char* pattern = "ID=[^;]*;";
56 if ( regcomp(&rx, pattern, 0) != 0) {
57 perror("regcomp");
58 exit(EXIT_FAILURE);
59 }
60
61 // do one pass through the gff line to determine the number of
62 // genes
63 int numGenes = 0;
64 int status = 0;
65 while(1) {
66 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
67 if(status == EOF)
68 break;
69
70 if ( status >= 5 && strcmp(id,"gene")==0)
71 numGenes++;
72
73 }
74 freopen(filename,"r",fid);
75
76 int idx = 0;
77 (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
78 (*allGenes)[idx] = NULL;
79
80 int skippedLinesCounter = 0;
81 while(1) {
82 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
83 if(status == EOF)
84 break;
85
86 if (status < 7) {
87 skippedLinesCounter++;
88 continue;
89 }
90
91 if (strcmp(id,"gene")==0) {
92 if ( (*allGenes)[idx] !=NULL )
93 idx++;
94
95 (*allGenes)[idx] = gene_alloc();
96 (*allGenes)[idx]->start = start;
97 (*allGenes)[idx]->stop = stop;
98 (*allGenes)[idx]->strand = (*strand);
99 (*allGenes)[idx]->id = get_id(rx,desc);
100 //printf("gene start/stop: %d/%d\n",start,stop);
101 continue;
102 }
103
104 if (strcmp(id,"exon")==0) {
105 add_exon((*allGenes)[idx],start,stop);
106 //printf("exon start/stop: %d/%d\n",start,stop);
107 continue;
108 }
109
110 if (strcmp(id,"pseudogene")==0) {
111 if ( (*allGenes)[idx] !=NULL )
112 idx++;
113 }
114 }
115
116 if ( (*allGenes)[idx] !=NULL )
117 idx++;
118
119 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
120 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
121 //printf("Skipped %d lines.\n",skippedLinesCounter);
122
123 regfree(&rx);
124 free(chr);
125 free(blah);
126 free(id);
127 free(desc);
128 free(xy);
129 free(strand);
130 free(xz);
131
132 return numGenes;
133 }
134