132f3dc2b31c6be408c91376b41067c6c55ea9df
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 // Notes:
8 //
9 // - Both the read indices and the gff gene indices are one-based
10 //
11 //
12 ////////////////////////////////////////////////////////////////////////////////
13
14 #include <sys/mman.h>
15 #include <sys/stat.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <assert.h>
19 #include <string.h>
20 #include <unistd.h>
21 #include <math.h>
22
23 #include "debug_tools.h"
24 #include "join.h"
25 #include "datastructures.h"
26
27 #define _FILE_OFFSET_BITS == 64
28
29 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
30
31 const int read_size = 36;
32
33 const int min_overlap = 1;
34 const int max_overlap = 35;
35
36 unsigned long read_nr = 1;
37 unsigned long unfiltered_read_nr = 1;
38
39 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
40 void sort_genes(struct gene*** allGenes, int numGenes);
41 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
42 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
43 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
44 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
45 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
46
47 static char *info = "Usage is:\n./filterReads gff reads output";
48
49 /*
50 * Some constants specifying the exact behavior of the filter
51 *
52 */
53
54 //#define _FDEBUG_ 0
55 //#define DEBUG_READ 38603
56
57
58 //
59 /*
60 * TODO:
61 * - Check strand -> done simple (only if equal)
62 * - check for [AC] and similar entries -> done simple (see function
63 */
64
65 #ifdef _FDEBUG_
66 if(read_nr == DEBUG_READ) {
67 printf("read nr %lu",read_nr);
68 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
69 printf("u/d range: %d %d\n",up_range,down_range);
70 printf("u/d size: %d %d\n",u_size,d_size);
71 printf("u/d off: %d %d\n",u_off,d_off);
72 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
73
74 printf("******************\n");
75
76 printf("%s\n",up_read->seq);
77 //printf("%s\n",new_up_seq);
78
79 printf("******************\n");
80
81 printf("%s\n",down_read->seq);
82 //printf("%s\n",new_down_seq);
83
84 printf("******************\n");
85 printf("%s\n",new_seq);
86 printf("%s\n",new_prb);
87 printf("%s\n",new_cal_prb);
88 printf("%s\n",new_chastity);
89 }
90 #endif // _FDEBUG_
91
92 int combined_reads = 0;
93
94 /*
95 *
96 *
97 */
98
99 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
100 int status;
101
102 int buffer_size= 64;
103 int chr = 0;
104 int pos = 0;
105 char* seq = malloc(sizeof(char)*buffer_size);
106 unsigned long id = 0;
107 char strand = ' ';
108 int mismatch = 0;
109 int occurrence = 0;
110 int size = 0;
111 int cut = 0;
112 char* prb = malloc(sizeof(char)*buffer_size);
113 char* cal_prb = malloc(sizeof(char)*buffer_size);
114 char* chastity = malloc(sizeof(char)*buffer_size);
115
116 int reads_fid = fileno(reads_fs);
117 struct stat reads_stat;
118 if ( fstat(reads_fid,&reads_stat) == -1) {
119 perror("fstat");
120 exit(EXIT_FAILURE);
121 }
122 off_t reads_filesize = reads_stat.st_size;
123 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
124 int numReads = reads_filesize / 178.0;
125
126 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
127 if (reads_area == MAP_FAILED) {
128 perror("mmap");
129 exit(EXIT_FAILURE);
130 }
131 close(reads_fid);
132 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
133
134 char* lineBeginPtr = (char*) reads_area;
135 char* lineEndPtr = (char*) reads_area;
136 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
137
138 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
139 lineEndPtr++;
140
141 char* current_line = malloc(sizeof(char)*512);
142 memset(current_line,0,512);
143
144 int SIZE = 5000;
145 // initialize boundary arrays
146 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
147 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
148 int uov,dov;
149 uov = dov = 0;
150
151 int skippedLinesCounter = 0;
152
153 int prev_exon_start = -1;
154 int prev_exon_stop = -1;
155 int cur_exon_start = -1;
156
157 unsigned long line_size = lineEndPtr - lineBeginPtr;
158 //printf("distance is %lu\n",line_size);
159 strncpy(current_line,lineBeginPtr,line_size);
160 current_line[line_size] = '\0';
161 //printf("%s test",current_line);
162
163 int gene_idx = 0;
164 int exon_idx = 1;
165 struct gene* currentGene = (*allGenes)[gene_idx];
166 char* gene_id = currentGene->id;
167
168 int skippedReadCtr = 0;
169 int uselessReadCtr = 0;
170 int exonicReadCtr = 0;
171
172 int currentOverlapCtr = 0;
173 int previousOverlapCtr = 0;
174
175 int multioccurReadCtr = 0;
176
177 Read* currentRead;
178 int up_idx, down_idx;
179
180 int readCtr = 0;
181 int wrong_strand_ctr = 0;
182 int read_within_gene_ctr = 0;
183 int read_outside_gene_ctr = 0;
184 int read_start, read_stop;
185 // start of the parsing loop
186
187 while(1) {
188 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
189 break;
190
191 gene_id = currentGene->id;
192
193 //if (readCtr != 0 && readCtr % 1000000 == 0) {
194 // printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
195 // printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
196 // printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
197 // printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
198 // printf("\t%d useless reads\n",uselessReadCtr);
199 // printf("\t%d skipped reads\n",skippedReadCtr);
200 // printf("\t%d multioccurring\n",multioccurReadCtr);
201 // printf("\t%d wrong_strand\n",wrong_strand_ctr);
202 // printf("%d within gene\n",read_within_gene_ctr);
203 // printf("%d outside gene\n",read_outside_gene_ctr);
204 // printf("%d reads were totally exonic\n",exonicReadCtr);
205 // printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
206 // printf("%d reads where newly combined from two original reads\n",combined_reads);
207 // printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
208 //}
209
210 // pos of the reads is one-based
211 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
212 if (status < 12) {
213 skippedLinesCounter++;
214 goto next_read;
215 }
216
217 // if the read is occurring several times elsewhere then get rid of it
218 if ( occurrence != 1 ) {
219 multioccurReadCtr++;
220 goto next_read;
221 }
222
223 // define read start and stop positions
224 read_start = pos;
225 read_stop = pos + read_size-1;
226
227 FA(strlen(seq) >= read_size);
228
229 FA(currentGene != 0);
230
231 if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
232 read_within_gene_ctr++;
233
234 exon_label:
235
236 if (exon_idx == currentGene->num_exons) {
237 gene_idx++;
238 exon_idx = 1;
239 currentGene = (*allGenes)[gene_idx];
240 while( currentGene == 0 && gene_idx < numGenes) {
241 currentGene = (*allGenes)[gene_idx];
242 gene_idx++;
243 }
244 continue;
245 }
246
247 prev_exon_start = currentGene->exon_starts[exon_idx-1];
248 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
249 cur_exon_start = currentGene->exon_starts[exon_idx];
250
251 //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
252
253 if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
254
255 if (uov != 0 && dov != 0)
256 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
257
258 for(up_idx=0;up_idx<uov;up_idx++) {
259 free_read(upstream_overlap[up_idx]);
260 free(upstream_overlap[up_idx]);
261 }
262
263 for(down_idx=0;down_idx<dov;down_idx++) {
264 free_read(downstream_overlap[down_idx]);
265 free(downstream_overlap[down_idx]);
266 }
267
268 uov = dov = 0;
269
270 exon_idx++;
271 goto exon_label;
272 }
273
274 if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
275
276 // output of unused i.e. unspliced reads
277 fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
278 unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
279 unfiltered_read_nr++;
280
281 exonicReadCtr++;
282 goto next_read;
283 }
284
285 if ( read_stop < prev_exon_stop ) { // go to next read
286 //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
287 //if( exon_idx > 1) {
288 // printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
289 // printf("---\n");
290 //}
291
292 uselessReadCtr++;
293 goto next_read;
294 }
295
296 // if this position is reached the read is somehow overlapping or
297 // exactly on the exon boundary.
298 if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
299 //printf("%s\n",current_line);
300 previousOverlapCtr++;
301 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
302 assert (uov < SIZE);
303 upstream_overlap[uov] = currentRead;
304 uov++;
305 goto next_read;
306 }
307
308 if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
309 //printf("%s\n",current_line);
310 currentOverlapCtr++;
311 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
312 assert (dov < SIZE);
313 downstream_overlap[dov] = currentRead;
314 dov++;
315 goto next_read;
316 }
317
318 uselessReadCtr++;
319 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
320
321 } else { // read is not within gene borders
322 read_outside_gene_ctr++;
323
324 if (uov != 0 && dov != 0)
325 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
326
327 for(up_idx=0;up_idx<uov;up_idx++) {
328 free_read(upstream_overlap[up_idx]);
329 free(upstream_overlap[up_idx]);
330 }
331
332 for(down_idx=0;down_idx<dov;down_idx++) {
333 free_read(downstream_overlap[down_idx]);
334 free(downstream_overlap[down_idx]);
335 }
336
337 uov = dov = 0;
338
339 if ( currentGene->stop < read_stop ) { // go to next gene
340 gene_idx++;
341 exon_idx = 1;
342 currentGene = (*allGenes)[gene_idx];
343 while( currentGene == 0 && gene_idx < numGenes) {
344 currentGene = (*allGenes)[gene_idx];
345 gene_idx++;
346 }
347 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
348 continue;
349 }
350
351 if ( read_start < currentGene->start ) { // go to next read
352 skippedReadCtr++;
353
354 next_read:
355
356 lineBeginPtr = lineEndPtr;
357 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
358 lineEndPtr++;
359 readCtr += 1;
360 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
361 current_line[lineEndPtr-lineBeginPtr] = '\0';
362 continue;
363 }
364 }
365 }
366
367 //} // end of for all strands
368
369 if (uov != 0 && dov != 0)
370 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
371
372 for(up_idx=0;up_idx<uov;up_idx++) {
373 free_read(upstream_overlap[up_idx]);
374 free(upstream_overlap[up_idx]);
375 }
376
377 for(down_idx=0;down_idx<dov;down_idx++) {
378 free_read(downstream_overlap[down_idx]);
379 free(downstream_overlap[down_idx]);
380 }
381
382 uov = dov = 0;
383
384 free(upstream_overlap);
385 free(downstream_overlap);
386
387 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
388 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
389 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
390 printf("\t%d useless reads\n",uselessReadCtr);
391 printf("\t%d skipped reads\n",skippedReadCtr);
392 printf("\t%d multioccurring\n",multioccurReadCtr);
393 printf("\t%d wrong_strand\n",wrong_strand_ctr);
394 printf("%d reads were totally exonic\n",exonicReadCtr);
395 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
396 printf("%d reads where newly combined from two original reads\n",combined_reads);
397 printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
398
399 status = munmap(reads_area,reads_filesize);
400 if(status != 0)
401 perror("munmap");
402
403 free(current_line);
404 free(seq);
405 free(prb);
406 free(cal_prb);
407 free(chastity);
408 }
409
410
411 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
412 int up_idx, down_idx, success;
413
414 char* up_used_flag = calloc(up_size,sizeof(char));
415 char* down_used_flag = calloc(down_size,sizeof(char));
416
417 Read* currentUpRead;
418 Read* currentDownRead;
419
420 for(up_idx=0;up_idx<up_size;up_idx++) {
421 if( up_used_flag[up_idx] == 1)
422 continue;
423
424 currentUpRead = upstream[up_idx];
425
426 for(down_idx=0;down_idx<down_size;down_idx++) {
427
428 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
429 continue;
430
431 currentDownRead = downstream[down_idx];
432
433 if(currentUpRead->strand != currentDownRead->strand)
434 continue;
435
436 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
437
438 if(success == 1) {
439 up_used_flag[up_idx] = 1;
440 down_used_flag[down_idx] = 1;
441 }
442 }
443 }
444
445 free(up_used_flag);
446 free(down_used_flag);
447 }
448
449 /*
450 * Now we join the candidate reads wherever possible according to the following
451 * scheme:
452 *
453 * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
454 * p1 e1 e2 p2
455 *
456 *
457 *
458 *
459 */
460
461 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
462 // range of possible sequence length on exon side
463 int up_read_start = up_read->pos;
464 //int up_read_stop = up_read->pos+read_size-1;
465
466 int down_read_start = down_read->pos;
467 int down_read_stop = down_read->pos+read_size-1;
468
469 int up_range = exon_stop - up_read_start + 1;
470 int down_range = down_read_stop - exon_start + 1;
471 int retval;
472
473 int u_size, d_size;
474 u_size = d_size = -1;
475
476 if(up_range+down_range < read_size)
477 return 0;
478
479 if (read_nr % 2 != 0) {
480 d_size = down_range;
481 u_size = read_size - d_size;
482 } else {
483 u_size = up_range;
484 d_size = read_size - u_size;
485 }
486
487 if( u_size > up_range || d_size > down_range)
488 return 0;
489
490 const int p_start = exon_stop - u_size + 1;
491 const int p_stop = exon_start + d_size - 1;
492
493 int u_off = p_start - up_read_start;
494 int d_off = exon_start - down_read_start;
495
496 FA(u_off >= 0);
497 FA(d_off >= 0);
498
499 FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
500 FA( u_size + d_size == read_size );
501
502 // seems reasonable up to here
503
504 int buf_size = 4*read_size;
505 char* new_seq = malloc(sizeof(char)*buf_size);
506 memset(new_seq,'z',sizeof(char)*buf_size);
507
508 Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
509
510 int cut_pos = jinfo.first;
511 int additional_pos = jinfo.second;
512
513 buf_size = read_size+1+additional_pos;
514
515 char* new_prb = malloc(sizeof(char)*buf_size);
516 char* new_cal_prb = malloc(sizeof(char)*buf_size);
517 char* new_chastity = malloc(sizeof(char)*buf_size);
518
519 if( jinfo.first == -1 ) {
520 retval = 0;
521 goto free;
522 }
523
524 if( additional_pos > (down_range - d_size) ) {
525 retval = 0;
526 goto free;
527 }
528
529 strncpy(new_prb, up_read->prb+u_off, u_size);
530 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
531 new_prb[read_size+additional_pos] = '\0';
532
533 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
534 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
535 new_cal_prb[read_size+additional_pos] = '\0';
536
537 strncpy(new_chastity, up_read->chastity+u_off, u_size);
538 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
539 new_chastity[read_size+additional_pos] = '\0';
540
541
542 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
543
544 if(status != 1) {
545 retval = 0;
546 goto free;
547 }
548
549 retval = status;
550
551 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
552 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
553
554 //fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
555 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
556
557 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
558 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
559
560 read_nr++;
561 combined_reads++;
562
563 free:
564 free(new_seq);
565 free(new_prb);
566 free(new_cal_prb);
567 free(new_chastity);
568
569 return retval;
570 }
571
572
573 void print_read(Read* cRead) {
574 printf(line_format,
575 cRead->chr, cRead->pos, cRead->seq, cRead->id,
576 cRead->strand, cRead->mismatch, cRead->occurrence,
577 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
578 cRead->chastity);
579 }
580
581
582 void open_file(const char* filename, const char* mode, FILE** fs) {
583 *fs = fopen(filename,mode);
584 if(*fs == NULL) {
585 printf("Error: Could not open file: %s",filename);
586 exit(EXIT_FAILURE);
587 }
588 }
589
590 /*
591 * The program expects 7 arguments, namely:
592 *
593 * - The strand to be used
594 * - The filename of the gff file
595 * - The filename of the reads file
596 * - The name of the spliced reads ouput file
597 * - The name of the unspliced reads ouput file
598 * - The offset for counting new spliced reads
599 * - The offset for counting new unspliced reads
600 *
601 */
602
603 int main(int argc, char* argv[]) {
604
605 if(argc != 7) {
606 printf("%s\n",info);
607 exit(EXIT_FAILURE);
608 }
609
610 int status;
611 int filenameSize = 256;
612 char* gff_filename = malloc(sizeof(char)*filenameSize);
613
614 strncpy(gff_filename,argv[1],filenameSize);
615
616 FILE *gff_fs;
617 FILE *reads_fs;
618 FILE *out_fs;
619 FILE *unfiltered_out_fs;
620
621 // open file streams for all needed input/output files
622 open_file(argv[1],"r",&gff_fs);
623 open_file(argv[2],"r",&reads_fs);
624 open_file(argv[3],"w",&out_fs);
625 open_file(argv[4],"w",&unfiltered_out_fs);
626
627 read_nr = strtoul(argv[5],NULL,10);
628 read_nr++;
629
630 unfiltered_read_nr = strtoul(argv[6],NULL,10);
631 unfiltered_read_nr++;
632
633 // allocate and load all genes and then close the gff file stream
634 struct gene** allGenes;
635 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
636 status = fclose(gff_fs);
637 free(gff_filename);
638 if(status != 0)
639 printf("closing of gff filestream failed!\n");
640
641 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
642
643 // check if allGenes is sorted. if not throw away those genes that do not
644 // occur in the sorted order
645 int g_idx;
646 struct gene* currentGene = 0;
647 int nulled_genes=0;
648 int old_gene_stop = -1;
649 for(g_idx=0;g_idx<numGenes;g_idx++) {
650 currentGene = allGenes[g_idx];
651
652 if (! (currentGene->start < currentGene->stop))
653 printf("Invalid positions for gene %s!\n",currentGene->id);
654
655 if (! (old_gene_stop < currentGene->start ) ) {
656 printf("Gene %s is overlapping\n",currentGene->id);
657 old_gene_stop = currentGene->stop;
658 allGenes[g_idx] = 0;
659 nulled_genes++;
660 continue;
661 }
662 old_gene_stop = currentGene->stop;
663 }
664
665 printf("Found %d unordered genes.\n",nulled_genes);
666 int gidx, eidx;
667 int exon_cov = 0;
668 for(gidx=0;gidx<numGenes;gidx++) {
669 if (allGenes[gidx] == 0)
670 continue;
671
672 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
673 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
674 }}
675 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
676
677 // now that we loaded all neccessary data we start to process the reads
678 process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
679
680 // free all allocated ressources
681 for(g_idx=0;g_idx<numGenes;g_idx++) {
682 if(allGenes[g_idx] != 0) {
683 free_gene(allGenes[g_idx]);
684 free(allGenes[g_idx]);
685 }
686 }
687 free(allGenes);
688
689 status = fclose(reads_fs);
690 status = fclose(out_fs);
691 if(status != 0)
692 perror("fclose");
693
694 return 0;
695 }