+ added processing function for reads
[qpalma.git] / tools / data_tools / parser.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #include "datastructures.h"
6
7 int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
8
9 int buffer_size = 256;
10 char* chr = malloc(sizeof(char)*buffer_size);
11 char* blah = malloc(sizeof(char)*buffer_size);
12 char* id = malloc(sizeof(char)*buffer_size);
13 char* desc = malloc(sizeof(char)*buffer_size);
14 int start = 0;
15 int stop = 0;
16 char* xy = malloc(sizeof(char)*4);
17 char* strand = malloc(sizeof(char)*4);
18 char* xz = malloc(sizeof(char)*4);
19
20 // do one pass through the gff line to determine the number of
21 // genes
22 int numGenes = 0;
23
24 int status = 0;
25 while(1) {
26 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
27 if(status == EOF)
28 break;
29
30 if ( status > 5 && strcmp(id,"gene")==0)
31 numGenes++;
32 }
33 freopen(filename,"r",fid);
34
35 //printf("Found %d genes!\n",numGenes);
36
37 int idx = 0;
38 (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
39 (*allGenes)[idx] = NULL;
40
41 int skippedLinesCounter = 0;
42 while(1) {
43 status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
44 if(status == EOF)
45 break;
46
47 if (status < 7) {
48 skippedLinesCounter++;
49 continue;
50 }
51
52 if (strcmp(id,"gene")==0) {
53 if ( (*allGenes)[idx] !=NULL )
54 idx++;
55
56 (*allGenes)[idx] = gene_alloc();
57 (*allGenes)[idx]->start = start;
58 (*allGenes)[idx]->stop = stop;
59 (*allGenes)[idx]->strand = (*strand);
60 //printf("gene start/stop: %d/%d\n",start,stop);
61 continue;
62 }
63
64 if (strcmp(id,"exon")==0) {
65 add_exon((*allGenes)[idx],start,stop);
66 //printf("exon start/stop: %d/%d\n",start,stop);
67 continue;
68 }
69
70 if (strcmp(id,"pseudogene")==0) {
71 if ( (*allGenes)[idx] !=NULL )
72 idx++;
73 }
74 }
75
76 if ( (*allGenes)[idx] !=NULL )
77 idx++;
78
79 //printf("allGenes[0] is %d\n",(*allGenes)[0]);
80 //printf("allGenes[1] is %d\n",(*allGenes)[1]);
81 //printf("Skipped %d lines.\n",skippedLinesCounter);
82
83 free(chr);
84 free(blah);
85 free(id);
86 free(desc);
87 free(xy);
88 free(strand);
89 free(xz);
90
91 return numGenes;
92 }
93