939845410623896094e6f8defc8b4e570dc920a8
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 ////////////////////////////////////////////////////////////
8
9 #include <sys/mman.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "datastructures.h"
15
16 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
17
18 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes);
19
20 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size);
21
22 static char *info = "Usage is:\n./filterReads gff reads";
23
24 int main(int argc, char* argv[]) {
25
26 if(argc != 3) {
27 printf("%s\n",info);
28 exit(EXIT_FAILURE);
29 }
30
31 //static size_t page_size;
32 // page_size = (size_t) sysconf (_SC_PAGESIZE);
33 int status;
34 int filenameSize = 256;
35 char *gff_filename = malloc(sizeof(char)*filenameSize);
36 char *reads_filename = malloc(sizeof(char)*filenameSize);
37
38 strncpy(gff_filename,argv[1],filenameSize);
39 strncpy(reads_filename,argv[2],filenameSize);
40
41 FILE *gff_fs = fopen(gff_filename,"r");
42 FILE *reads_fs = fopen(reads_filename,"r");
43
44 if(gff_fs == NULL) {
45 printf("Error: Could not open file: %s",gff_filename);
46 exit(EXIT_FAILURE);
47 }
48
49 if(reads_fs == NULL) {
50 printf("Error: Could not open file: %s",reads_filename);
51 exit(EXIT_FAILURE);
52 }
53
54 struct gene** allGenes;
55 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
56 status = fclose(gff_fs);
57 if(status != 0)
58 printf("closing of gff filestream failed!\n");
59
60 process_reads(reads_fs,&allGenes,numGenes);
61
62 status += fclose(reads_fs);
63 if(status != 0)
64 printf("closing of filestreams failed!\n");
65
66 //free(allGenes);
67 free(gff_filename);
68 free(reads_filename);
69 return 0;
70 }
71
72
73 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes) {
74 int status;
75
76 int buffer_size= 64;
77 int chr = 0;
78 int pos = 0;
79 char* seq = malloc(sizeof(char)*buffer_size);
80 int id = 0;
81 char strand = ' ';
82 int mismatch = 0;
83 int repetition = 0;
84 int size = 0;
85 int cut = 0;
86 char* prb = malloc(sizeof(char)*buffer_size);
87 char* cal_prb = malloc(sizeof(char)*buffer_size);
88 char* chastity = malloc(sizeof(char)*buffer_size);
89
90 int reads_fid = fileno(reads_fs);
91 size_t mmapAreaSize = 20000;
92
93 void *reads_area = mmap (NULL,mmapAreaSize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
94 if((long int) reads_area == -1)
95 printf("mmapping failed!\n");
96
97 void* linePtr = reads_area;
98 char* current_line = malloc(sizeof(char)*256);
99
100 int SIZE = 500;
101 // initialize boundary arrays
102 void** upstream_end = malloc(sizeof(void*)*SIZE);
103 void** upstream_overlap = malloc(sizeof(void*)*SIZE);
104 void** downstream_start = malloc(sizeof(void*)*SIZE);
105 void** downstream_overlap= malloc(sizeof(void*)*SIZE);
106 int ue,uo,ds,dov;
107 ue = uo = ds = dov = 0;
108
109 int skippedLinesCounter = 0;
110
111 int prev_exon_stop = -1;
112 int cur_exon_start = -1;
113
114 int gene_idx;
115 for (gene_idx = 0; gene_idx < numGenes;) {
116 struct gene* currentGene = (*allGenes)[gene_idx];
117
118 int exon_idx;
119 for(exon_idx=1;exon_idx<currentGene->num_exons;exon_idx++) {
120
121 combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov);
122 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds);
123
124 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
125 cur_exon_start = currentGene->exon_starts[exon_idx];
126
127 if (cur_exon_start - prev_exon_stop < 6)
128 continue;
129
130 ue = uo = ds = dov = 0;
131
132 label:
133
134 current_line = strncpy(current_line,linePtr,256);
135 if (strcmp(current_line,"") == 0)
136 goto end;
137
138 status = sscanf(current_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
139 &chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
140 if (status < 12) {
141 //printf("skipped line is %s\n",current_line);
142 skippedLinesCounter++;
143 }
144
145 //printf("%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
146 //chr,pos,seq,id,strand,mismatch,repetition,size,cut,prb,cal_prb,chastity);
147 //printf("read position: %d gene start/stop: %d-%d\n ",pos,currentGene->start,currentGene->stop);
148
149 if (currentGene->start <= pos && pos+35 <= currentGene->stop) {
150 //printf("gene boundaries: %d %d, pos: %d linePtr %d\n",currentGene->start,currentGene->stop,pos,linePtr);
151 // upstream ending
152 if (pos+35 == prev_exon_stop) {
153 upstream_end[ue] = linePtr;
154 ue++;
155 } else {
156
157 // upstream overlapping
158 if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) {
159 upstream_overlap[uo] = linePtr;
160 uo++;
161 }
162 }
163
164 // downstream starting
165 if (pos == cur_exon_start) {
166 downstream_start[ds] = linePtr;
167 ds++;
168 } else {
169
170 // downstream overlapping
171 if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) {
172 downstream_overlap[dov] = linePtr;
173 dov++;
174 }
175 }
176
177 while (*(char*)linePtr != '\n') linePtr++;
178 linePtr++;
179 goto label;
180
181 } else {
182
183 if (currentGene->stop < pos) {
184 //printf("read is too far downstream\n");
185 break;
186 }
187
188 //printf("read is too far upstream\n");
189 while (*(char*)linePtr != '\n') linePtr++;
190 linePtr++;
191 goto label;
192 }
193 }
194
195 gene_idx++;
196 }
197
198 end:
199
200 free(upstream_end);
201 free(upstream_overlap);
202 free(downstream_start);
203 free(downstream_overlap);
204
205 printf("skipped %d lines.\n",skippedLinesCounter);
206
207 status = munmap(reads_area,mmapAreaSize);
208 if(status != 0)
209 printf("munmap failed!\n");
210
211 free(seq);
212 free(prb);
213 free(cal_prb);
214 free(chastity);
215 }
216
217 void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size) {
218 //printf("up_/down_size %d %d\n",up_size,down_size);
219
220 if (up_size == 0 || down_size == 0 || exon_stop == -1)
221 return;
222
223 int up_idx, down_idx, status;
224 char* upstream_line = malloc(sizeof(char)*256);
225 char* downstream_line = malloc(sizeof(char)*256);
226
227 int buffer_size= 64;
228
229 int up_chr = 0;
230 int up_pos = 0;
231 char* up_seq = malloc(sizeof(char)*buffer_size);
232 int up_id = 0;
233 char up_strand = ' ';
234 int up_mismatch = 0;
235 int up_repetition = 0;
236 int up_sz = 0;
237 int up_cut = 0;
238 char* up_prb = malloc(sizeof(char)*buffer_size);
239 char* up_cal_prb = malloc(sizeof(char)*buffer_size);
240 char* up_chastity = malloc(sizeof(char)*buffer_size);
241
242 int down_chr = 0;
243 int down_pos = 0;
244 char* down_seq = malloc(sizeof(char)*buffer_size);
245 int down_id = 0;
246 char down_strand = ' ';
247 int down_mismatch = 0;
248 int down_repetition = 0;
249 int down_sz = 0;
250 int down_cut = 0;
251 char* down_prb = malloc(sizeof(char)*buffer_size);
252 char* down_cal_prb = malloc(sizeof(char)*buffer_size);
253 char* down_chastity = malloc(sizeof(char)*buffer_size);
254
255 int new_chr = 0;
256 int new_pos = 0;
257 char* new_seq = malloc(sizeof(char)*buffer_size);
258 int new_id = 0;
259 char new_strand = ' ';
260 char* new_prb = malloc(sizeof(char)*buffer_size);
261 char* new_cal_prb = malloc(sizeof(char)*buffer_size);
262 char* new_chastity = malloc(sizeof(char)*buffer_size);
263
264 up_idx=0;
265 down_idx=0;
266 while(1) {
267 //printf("up_/down_idx %d %d\n",up_idx,down_idx);
268
269 if (up_idx == up_size || down_idx == down_size)
270 break;
271
272 strncpy(upstream_line,upstream[up_idx],256);
273 status = sscanf(upstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
274 &up_chr,&up_pos,up_seq,&up_id,&up_strand,&up_mismatch,&up_repetition,&up_sz,
275 &up_cut,up_prb,up_cal_prb,up_chastity);
276
277 strncpy(downstream_line,downstream[down_idx],256);
278 status = sscanf(downstream_line,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
279 &down_chr,&down_pos,down_seq,&down_id,&down_strand,&down_mismatch,&down_repetition,&down_sz,
280 &down_cut,down_prb,down_cal_prb,down_chastity);
281
282 new_prb[0] = '\0';
283 new_cal_prb[0] = '\0';
284 new_chastity[0] = '\0';
285
286 // is downstream overlapping
287 if (up_pos+35 == exon_stop) {
288 int overlap = exon_start - down_pos;
289 //printf("overlap is %d\n",overlap);
290 //printf("pos are: %d %d\n",up_pos,down_pos);
291
292 strncat(new_prb,up_prb+(36-overlap),overlap);
293 strncat(new_cal_prb,up_cal_prb+(36-overlap),overlap);
294 strncat(new_chastity,up_chastity+(36-overlap),overlap);
295
296 strncat(new_prb,down_prb+overlap,36-overlap);
297 strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
298 strncat(new_chastity,down_chastity+overlap,36-overlap);
299
300 //printf("up_prb: %s\n",up_prb);
301 //printf("down_prb: %s\n",down_prb);
302 //printf("new_prb: %s\n",new_prb);
303
304 // is upstream overlapping
305 } else {
306
307 }
308
309 up_idx++;
310 down_idx++;
311 }
312 }
313
314 /*
315 * TODO
316 * - Check strand
317 * - check repetition
318 *
319 */