+ small changes
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <regex.h>
5 #include <assert.h>
6
7 #include "datastructures.h"
8
/*
 * Render a regcomp()/regexec() error code as a human-readable message.
 *
 * errcode   error code returned by regcomp() or regexec()
 * compiled  the regex_t that was passed to the failing call
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(), or NULL if the buffer could not be allocated.
 */
char *get_regerror(int errcode, regex_t *compiled)
{
    /* A zero-sized query reports the buffer size needed, NUL included. */
    size_t length = regerror(errcode, compiled, NULL, 0);
    char *buffer = malloc(length);

    if (buffer == NULL) {
        return NULL;
    }

    (void) regerror(errcode, compiled, buffer, length);
    return buffer;
}
15
/*
 * Extract an identifier from `desc` using the compiled regex `rx`.
 *
 * The first match of `rx` in `desc` is located, then its first three
 * characters and its last character are stripped. With the pattern
 * "ID=[^;]*;" this removes the leading "ID=" and the trailing ';'.
 *
 * rx    compiled regular expression (must not have been compiled with
 *       REG_NOSUB, since the match offsets are required)
 * desc  NUL-terminated string to search
 *
 * Returns a newly allocated, NUL-terminated copy of the identifier; the
 * caller owns it and must free() it. The identifier may be empty.
 *
 * Terminates the process with EXIT_FAILURE if `rx` does not match, if the
 * match is too short to contain the prefix and terminator, or if memory
 * cannot be allocated.
 */
char *get_id(regex_t rx, const char *desc)
{
    enum { PREFIX_LEN = 3, SUFFIX_LEN = 1 };
    regmatch_t match;

    int regerr = regexec(&rx, desc, 1, &match, REG_NOTBOL);
    if (regerr != 0) {
        /* Fatal path: format into a fixed buffer so nothing is leaked. */
        char message[256];
        (void) regerror(regerr, &rx, message, sizeof message);
        fprintf(stderr, "get_id: %s\n", message);
        exit(EXIT_FAILURE);
    }

    regoff_t start = match.rm_so + PREFIX_LEN;
    regoff_t end = match.rm_eo - SUFFIX_LEN;
    if (start > end) {
        /* Checked explicitly so the guard survives NDEBUG builds. */
        fprintf(stderr, "get_id: match too short to contain an id\n");
        exit(EXIT_FAILURE);
    }

    size_t id_size = (size_t)(end - start);
    char *id = malloc(id_size + 1);
    if (id == NULL) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    /* [start, end) lies inside the match, so it contains no NUL byte. */
    memcpy(id, desc + start, id_size);
    id[id_size] = '\0';

    return id;
}
40
41 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
42
43 int buffer_size = 256;
44 char* chr = malloc(sizeof(char)*buffer_size);
45 char* blah = malloc(sizeof(char)*buffer_size);
46 char* id = malloc(sizeof(char)*buffer_size);
47 char* desc = malloc(sizeof(char)*buffer_size);
48 int start = 0;
49 int stop = 0;
50 char* xy = malloc(sizeof(char)*4);
51 char* strand = malloc(sizeof(char)*4);
52 char* xz = malloc(sizeof(char)*4);
53
54 regex_t rx;
55 const char* pattern = "ID=[^;]*;";
56 if ( regcomp(&rx, pattern, 0) != 0) {
57 perror("regcomp");
58 exit(EXIT_FAILURE);
59 }
60
61
62 // do one pass through the gff line to determine the number of
63 // genes
64 int numGenes = 0;
65 int status = 0;
66 while(1) {
67 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
68 if(status == EOF)
69 break;
70
71 if ( status >= 5 && strcmp(id,"gene")==0)
72 numGenes++;
73
74 }
75 freopen(filename,"r",fid);
76
77 int idx = 0;
78 (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
79 (*allGenes)[idx] = NULL;
80
81 int skippedLinesCounter = 0;
82 while(1) {
83 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
84 if(status == EOF)
85 break;
86
87 if (status < 7) {
88 skippedLinesCounter++;
89 continue;
90 }
91
92 if (strcmp(id,"gene")==0) {
93 if ( (*allGenes)[idx] !=NULL )
94 idx++;
95
96 (*allGenes)[idx] = gene_alloc();
97 (*allGenes)[idx]->start = start;
98 (*allGenes)[idx]->stop = stop;
99 (*allGenes)[idx]->strand = (*strand);
100 (*allGenes)[idx]->id = get_id(rx,desc);
101 //printf("gene start/stop: %d/%d\n",start,stop);
102 continue;
103 }
104
105 if (strcmp(id,"exon")==0) {
106 add_exon((*allGenes)[idx],start,stop);
107 //printf("exon start/stop: %d/%d\n",start,stop);
108 continue;
109 }
110
111 if (strcmp(id,"pseudogene")==0) {
112 if ( (*allGenes)[idx] !=NULL )
113 idx++;
114 }
115 }
116
117 if ( (*allGenes)[idx] !=NULL )
118 idx++;
119
120 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
121 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
122 //printf("Skipped %d lines.\n",skippedLinesCounter);
123
124 regfree(&rx);
125 free(chr);
126 free(blah);
127 free(id);
128 free(desc);
129 free(xy);
130 free(strand);
131 free(xz);
132
133 return numGenes;
134 }
135