+ found bug which caused ~10 reads of overlap "0"
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #include "common.h"
6 #include "datastructures.h"
7
8 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
9
10 int buffer_size = 256;
11 char* chr = malloc(sizeof(char)*buffer_size);
12 char* blah = malloc(sizeof(char)*buffer_size);
13 char* id = malloc(sizeof(char)*buffer_size);
14 char* desc = malloc(sizeof(char)*buffer_size);
15 int start = 0;
16 int stop = 0;
17 char* xy = malloc(sizeof(char)*4);
18 char* strand = malloc(sizeof(char)*4);
19 char* xz = malloc(sizeof(char)*4);
20
21 // do one pass through the gff line to determine the number of
22 // genes
23 int numGenes = 0;
24 int status = 0;
25 while(1) {
26 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
27 if(status == EOF)
28 break;
29
30 if ( status >= 5 && strcmp(id,"gene")==0)
31 numGenes++;
32
33 }
34 freopen(filename,"r",fid);
35
36 int idx = 0;
37 (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
38 (*allGenes)[idx] = NULL;
39
40 int skippedLinesCounter = 0;
41 while(1) {
42 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
43 if(status == EOF)
44 break;
45
46 if (status < 7) {
47 skippedLinesCounter++;
48 continue;
49 }
50
51 if (strcmp(id,"gene")==0) {
52 if ( (*allGenes)[idx] !=NULL )
53 idx++;
54
55 (*allGenes)[idx] = gene_alloc();
56 (*allGenes)[idx]->start = start;
57 (*allGenes)[idx]->stop = stop;
58 (*allGenes)[idx]->strand = (*strand);
59 //printf("gene start/stop: %d/%d\n",start,stop);
60 continue;
61 }
62
63 if (strcmp(id,"exon")==0) {
64 add_exon((*allGenes)[idx],start,stop);
65 //printf("exon start/stop: %d/%d\n",start,stop);
66 continue;
67 }
68
69 if (strcmp(id,"pseudogene")==0) {
70 if ( (*allGenes)[idx] !=NULL )
71 idx++;
72 }
73 }
74
75 if ( (*allGenes)[idx] !=NULL )
76 idx++;
77
78 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
79 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
80 //printf("Skipped %d lines.\n",skippedLinesCounter);
81
82 free(chr);
83 free(blah);
84 free(id);
85 free(desc);
86 free(xy);
87 free(strand);
88 free(xz);
89
90 return numGenes;
91 }
92