+ added processing function for reads
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "datastructures.h"
15
16 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
17
18 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
19
20 static char *info = "Usage is:\n./filterReads gff reads";
21
22 int main(int argc, char* argv[]) {
23
24 if(argc != 3) {
25 printf("%s\n",info);
26 exit(EXIT_FAILURE);
27 }
28
29 //static size_t page_size;
30 // page_size = (size_t) sysconf (_SC_PAGESIZE);
31 int status;
32 int filenameSize = 256;
33 char *gff_filename = malloc(sizeof(char)*filenameSize);
34 char *reads_filename = malloc(sizeof(char)*filenameSize);
35
36 strncpy(gff_filename,argv[1],filenameSize);
37 strncpy(reads_filename,argv[2],filenameSize);
38
39 FILE *gff_fs = fopen(gff_filename,"r");
40 FILE *reads_fs = fopen(reads_filename,"r");
41
42 if(gff_fs == NULL) {
43 printf("Error: Could not open file: %s",gff_filename);
44 exit(EXIT_FAILURE);
45 }
46
47 if(reads_fs == NULL) {
48 printf("Error: Could not open file: %s",reads_filename);
49 exit(EXIT_FAILURE);
50 }
51
52 struct gene** allGenes;
53 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
54 status = fclose(gff_fs);
55 if(status != 0)
56 printf("closing of gff filestream failed!\n");
57
58 process_reads(reads_fs,&allGenes,numGenes);
59
60 status += fclose(reads_fs);
61 if(status != 0)
62 printf("closing of filestreams failed!\n");
63
64 //free(allGenes);
65 free(gff_filename);
66 free(reads_filename);
67 return 0;
68 }
69
70
71 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
72 int status;
73
74 int buffer_size= 64;
75 int chr = 0;
76 int pos = 0;
77 char* seq = malloc(sizeof(char)*buffer_size);
78 int id = 0;
79 char strand = ' ';
80 int mismatch = 0;
81 int repetition = 0;
82 int size = 0;
83 int cut = 0;
84 char* prb = malloc(sizeof(char)*buffer_size);
85 char* cal_prb = malloc(sizeof(char)*buffer_size);
86 char* chastity = malloc(sizeof(char)*buffer_size);
87
88 int reads_fid = fileno(reads_fs);
89 size_t mmapAreaSize = 20000;
90
91 void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
92 if((long int) reads_area == -1)
93 printf("mmapping failed!\n");
94
95 void* linePtr = reads_area;
96
97 char *test_string = malloc(sizeof(char)*10);
98 strncpy(test_string,(char*)reads_area,10);
99
100 status = munmap(reads_area,mmapAreaSize);
101 if(status != 0)
102 printf("munmap failed!\n");
103
104 void** upstream_end;
105 void** upstream_overlap;
106
107 void** downstream_start;
108 void** downstream_overlap;
109
110 int ue,uo,ds,dov;
111
112 int SIZE = 50;
113
114 int skippedLinesCounter = 0;
115
116 printf("Entering loop...\n");
117
118 int gene_idx;
119 for (gene_idx = 0; gene_idx < numGenes;) {
120 struct gene* currentGene = (*allGenes)[gene_idx];
121
122 int exon_idx;
123 for(exon_idx=1;exon_idx<currentGene->num_exons-1;exon_idx++) {
124 int prev_exon_stop = currentGene->exon_stops[exon_idx-1];
125 int cur_exon_start = currentGene->exon_starts[exon_idx];
126
127 if (cur_exon_start - prev_exon_stop < 6)
128 continue;
129
130 // initialize boundary arrays
131 upstream_end = malloc(sizeof(void*)*SIZE);
132 upstream_overlap = malloc(sizeof(void*)*SIZE);
133 downstream_start = malloc(sizeof(void*)*SIZE);
134 downstream_overlap= malloc(sizeof(void*)*SIZE);
135 ue = 0;
136 uo = 0;
137 ds = 0;
138 dov = 0;
139
140 printf("Fetching reads line...\n");
141
142 label:
143
144 status = sscanf((char*)linePtr,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
145 &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
146 if (status < 12) {
147 skippedLinesCounter++;
148 continue;
149 }
150
151 printf("read line!");
152
153 if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
154 printf("gene boundaries: %d %d, pos: %d\n",currentGene->start,currentGene->stop,pos);
155
156 // upstream ending
157 if (pos+35 == prev_exon_stop) {
158 upstream_end[ue] = linePtr;
159 ue++;
160 }
161
162 // upstream overlapping
163 if (prev_exon_stop - pos <= 30) {
164 upstream_overlap[uo] = linePtr;
165 uo++;
166 }
167
168 // downstream starting
169 if (pos == cur_exon_start) {
170 downstream_start[ds] = linePtr;
171 ds++;
172 }
173
174 // downstream overlapping
175 if (cur_exon_start - pos >= 6) {
176 downstream_overlap[dov] = linePtr;
177 dov++;
178 }
179
180 while (*(char*)linePtr != '\n') {
181 printf("linePtr is %d",linePtr);
182 linePtr++;
183 }
184
185 linePtr++;
186 goto label;
187
188 free(upstream_end);
189 free(upstream_overlap);
190 free(downstream_start);
191 free(downstream_overlap);
192 }
193
194 while (*(char*)linePtr != '\n') {
195 printf("linePtr is %d",linePtr);
196 linePtr++;
197 }
198
199 linePtr++;
200 goto label;
201
202 }
203
204 gene_idx++;
205 }
206
207 printf("skipped %d lines.\n",skippedLinesCounter);
208
209 free(seq);
210 free(prb);
211 free(cal_prb);
212 free(chastity);
213 }
214
215 // int skippedLinesCounter = 0;
216 // while(1) {
217 // status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
218 // if(status == EOF)
219 // break;
220 //
221 // if (status < 12) {
222 // skippedLinesCounter++;
223 // continue;
224 // }
225 //
226 // label:
227 //
228 // if (pos < currentGene->start)
229 // continue;
230 //
231 // if (currentGene->stop < pos) {
232 // gene_idx++;
233 // currentGene = (*allGenes)[gene_idx];
234 // goto label;
235 // }
236 //