+ wrote faster parser / file processing tools in C
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #include "datastructures.h"
6
7 void parse_gff(char *filename, FILE* fid,struct gene** allGenes) {
8
9 int buffer_size = 256;
10 char* chr = malloc(sizeof(char)*buffer_size);
11 char* blah = malloc(sizeof(char)*buffer_size);
12 char* id = malloc(sizeof(char)*buffer_size);
13 char* desc = malloc(sizeof(char)*buffer_size);
14 int start = 0;
15 int stop = 0;
16 char* xy = malloc(sizeof(char)*4);
17 char* strand = malloc(sizeof(char)*4);
18 char* xz = malloc(sizeof(char)*4);
19
20 // do one pass through the gff line to determine the number of
21 // genes
22 int numGenes = 0;
23
24 int status = 0;
25 while(1) {
26 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
27 if(status == EOF)
28 break;
29
30 if ( status > 5 && strcmp(id,"gene")==0)
31 numGenes++;
32 }
33 freopen(filename,"r",fid);
34
35 printf("Found %d genes!\n",numGenes);
36
37 allGenes = malloc(sizeof(struct gene)*numGenes);
38 struct gene *currentGene = gene_alloc();
39
40 int skippedLinesCounter = 0;
41 int idx = 0;
42 while(1) {
43 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
44 if(status == EOF)
45 break;
46
47 if (status < 7) {
48 skippedLinesCounter++;
49 continue;
50 }
51
52 if (strcmp(id,"gene")==0) {
53 if ( currentGene->start != -1)
54 allGenes[idx] = currentGene;
55 idx++;
56
57 currentGene = gene_alloc();
58 currentGene->start = start;
59 currentGene->stop = stop;
60 currentGene->strand = (*strand);
61 //printf("gene start/stop: %d/%d\n",start,stop);
62 continue;
63 }
64
65 if (strcmp(id,"exon")==0) {
66 add_exon(currentGene,start,stop);
67 //printf("exon start/stop: %d/%d\n",start,stop);
68 continue;
69 }
70
71 if (strcmp(id,"pseudogene")==0) {
72 if ( currentGene->start != -1)
73 allGenes[idx] = currentGene;
74 idx++;
75 }
76 }
77
78 if ( currentGene->start != -1)
79 allGenes[idx] = currentGene;
80 idx++;
81
82 free(chr);
83 free(blah);
84 free(id);
85 free(desc);
86 free(xy);
87 free(strand);
88 free(xz);
89 }