+ some optimization
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 // Notes:
8 //
9 // - Both the read indices and the gff gene indices are one-based
10 //
11 //
12 ////////////////////////////////////////////////////////////////////////////////
13
14 #include <sys/mman.h>
15 #include <sys/stat.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <assert.h>
19 #include <string.h>
20 #include <unistd.h>
21 #include <math.h>
22
23 #include "debug_tools.h"
24 #include "join.h"
25 #include "datastructures.h"
26
27 #define _FILE_OFFSET_BITS == 64
28
29 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
30
31 const int read_size = 36;
32
33 const int min_overlap = 1;
34 const int max_overlap = 35;
35
36 unsigned long read_nr = 1;
37 unsigned long unfiltered_read_nr = 1;
38
39 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
40 void sort_genes(struct gene*** allGenes, int numGenes);
41 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
42 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
43 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
44 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
45 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
46
47 static char *info = "Usage is:\n./filterReads gff reads output";
48
49 /*
50 * Some constants specifying the exact behavior of the filter
51 *
52 */
53
54 //#define _FDEBUG_ 0
55 //#define DEBUG_READ 38603
56
57
58 //
59 /*
60 * TODO:
61 * - Check strand -> done simple (only if equal)
62 * - check for [AC] and similar entries -> done simple (see function
63 */
64
65 #ifdef _FDEBUG_
66 if(read_nr == DEBUG_READ) {
67 printf("read nr %lu",read_nr);
68 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
69 printf("u/d range: %d %d\n",up_range,down_range);
70 printf("u/d size: %d %d\n",u_size,d_size);
71 printf("u/d off: %d %d\n",u_off,d_off);
72 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
73
74 printf("******************\n");
75
76 printf("%s\n",up_read->seq);
77 //printf("%s\n",new_up_seq);
78
79 printf("******************\n");
80
81 printf("%s\n",down_read->seq);
82 //printf("%s\n",new_down_seq);
83
84 printf("******************\n");
85 printf("%s\n",new_seq);
86 printf("%s\n",new_prb);
87 printf("%s\n",new_cal_prb);
88 printf("%s\n",new_chastity);
89 }
90 #endif // _FDEBUG_
91
92 int combined_reads = 0;
93
94 char current_strand;
95
96 /*
97 *
98 *
99 */
100
101 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
102 int status;
103
104 int buffer_size= 64;
105 int chr = 0;
106 int pos = 0;
107 char* seq = malloc(sizeof(char)*buffer_size);
108 unsigned long id = 0;
109 char strand = ' ';
110 int mismatch = 0;
111 int occurrence = 0;
112 int size = 0;
113 int cut = 0;
114 char* prb = malloc(sizeof(char)*buffer_size);
115 char* cal_prb = malloc(sizeof(char)*buffer_size);
116 char* chastity = malloc(sizeof(char)*buffer_size);
117
118 int reads_fid = fileno(reads_fs);
119 struct stat reads_stat;
120 if ( fstat(reads_fid,&reads_stat) == -1) {
121 perror("fstat");
122 exit(EXIT_FAILURE);
123 }
124 off_t reads_filesize = reads_stat.st_size;
125 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
126 int numReads = reads_filesize / 178.0;
127
128 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
129 if (reads_area == MAP_FAILED) {
130 perror("mmap");
131 exit(EXIT_FAILURE);
132 }
133 close(reads_fid);
134 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
135
136 char* lineBeginPtr = (char*) reads_area;
137 char* lineEndPtr = (char*) reads_area;
138 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
139
140 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
141 lineEndPtr++;
142
143 char* current_line = malloc(sizeof(char)*512);
144 memset(current_line,0,512);
145
146 int SIZE = 5000;
147 // initialize boundary arrays
148 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
149 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
150 int uov,dov;
151 uov = dov = 0;
152
153 int skippedLinesCounter = 0;
154
155 int prev_exon_start = -1;
156 int prev_exon_stop = -1;
157 int cur_exon_start = -1;
158
159 unsigned long line_size = lineEndPtr - lineBeginPtr;
160 //printf("distance is %lu\n",line_size);
161 strncpy(current_line,lineBeginPtr,line_size);
162 current_line[line_size] = '\0';
163 //printf("%s test",current_line);
164
165 int gene_idx = 0;
166 int exon_idx = 1;
167 struct gene* currentGene = (*allGenes)[gene_idx];
168 char* gene_id = currentGene->id;
169
170 int skippedReadCtr = 0;
171 int uselessReadCtr = 0;
172 int exonicReadCtr = 0;
173
174 int currentOverlapCtr = 0;
175 int previousOverlapCtr = 0;
176
177 int multioccurReadCtr = 0;
178
179 Read* currentRead;
180 int up_idx, down_idx;
181
182 int readCtr = 0;
183 int wrong_strand_ctr = 0;
184 int read_within_gene_ctr = 0;
185 int read_outside_gene_ctr = 0;
186 int read_start, read_stop;
187 // start of the parsing loop
188
189 //static char *all_strands = "+-";
190 //static char *all_strands = "DP";
191 //int strand_idx;
192 //for(strand_idx=0;strand_idx<2;strand_idx++) {
193 // char current_strand = all_strands[strand_idx];
194
195 while(1) {
196 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
197 break;
198
199 gene_id = currentGene->id;
200
201 if (readCtr != 0 && readCtr % 1000000 == 0) {
202 printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
203 printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
204 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
205 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
206 printf("\t%d useless reads\n",uselessReadCtr);
207 printf("\t%d skipped reads\n",skippedReadCtr);
208 printf("\t%d multioccurring\n",multioccurReadCtr);
209 printf("\t%d wrong_strand\n",wrong_strand_ctr);
210 printf("%d within gene\n",read_within_gene_ctr);
211 printf("%d outside gene\n",read_outside_gene_ctr);
212 printf("%d reads were totally exonic\n",exonicReadCtr);
213 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
214 printf("%d reads where newly combined from two original reads\n",combined_reads);
215 printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
216 }
217
218 // pos of the reads is one-based
219 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
220 if (status < 12) {
221 skippedLinesCounter++;
222 goto next_read;
223 }
224
225 // if the read is occurring several times elsewhere then get rid of it
226 if ( occurrence != 1 ) {
227 multioccurReadCtr++;
228 goto next_read;
229 }
230
231 // define read start and stop positions
232 read_start = pos;
233 read_stop = pos + read_size-1;
234
235 if( strand != current_strand ) {
236 wrong_strand_ctr++;
237 goto next_read;
238 }
239
240 FA(strlen(seq) >= read_size);
241
242 FA(currentGene != 0);
243
244 if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
245 read_within_gene_ctr++;
246
247 exon_label:
248
249 if (exon_idx == currentGene->num_exons) {
250 gene_idx++;
251 exon_idx = 1;
252 currentGene = (*allGenes)[gene_idx];
253 while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
254 currentGene = (*allGenes)[gene_idx];
255 gene_idx++;
256 }
257 continue;
258 }
259
260 prev_exon_start = currentGene->exon_starts[exon_idx-1];
261 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
262 cur_exon_start = currentGene->exon_starts[exon_idx];
263
264 //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
265
266 if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
267
268 if (uov != 0 && dov != 0)
269 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
270
271 for(up_idx=0;up_idx<uov;up_idx++) {
272 free_read(upstream_overlap[up_idx]);
273 free(upstream_overlap[up_idx]);
274 }
275
276 for(down_idx=0;down_idx<dov;down_idx++) {
277 free_read(downstream_overlap[down_idx]);
278 free(downstream_overlap[down_idx]);
279 }
280
281 uov = dov = 0;
282
283 exon_idx++;
284 goto exon_label;
285 }
286
287 if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
288
289 // output of unused i.e. unspliced reads
290 fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
291 unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
292 unfiltered_read_nr++;
293
294 exonicReadCtr++;
295 goto next_read;
296 }
297
298 if ( read_stop < prev_exon_stop ) { // go to next read
299 //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
300 //if( exon_idx > 1) {
301 // printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
302 // printf("---\n");
303 //}
304
305 uselessReadCtr++;
306 goto next_read;
307 }
308
309 // if this position is reached the read is somehow overlapping or
310 // exactly on the exon boundary.
311 if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
312 //printf("%s\n",current_line);
313 previousOverlapCtr++;
314 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
315 assert (uov < SIZE);
316 upstream_overlap[uov] = currentRead;
317 uov++;
318 goto next_read;
319 }
320
321 if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
322 //printf("%s\n",current_line);
323 currentOverlapCtr++;
324 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
325 assert (dov < SIZE);
326 downstream_overlap[dov] = currentRead;
327 dov++;
328 goto next_read;
329 }
330
331 uselessReadCtr++;
332 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
333
334 } else { // read is not within gene borders
335 read_outside_gene_ctr++;
336
337 if (uov != 0 && dov != 0)
338 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
339
340 for(up_idx=0;up_idx<uov;up_idx++) {
341 free_read(upstream_overlap[up_idx]);
342 free(upstream_overlap[up_idx]);
343 }
344
345 for(down_idx=0;down_idx<dov;down_idx++) {
346 free_read(downstream_overlap[down_idx]);
347 free(downstream_overlap[down_idx]);
348 }
349
350 uov = dov = 0;
351
352 if ( currentGene->stop < read_stop ) { // go to next gene
353 gene_idx++;
354 exon_idx = 1;
355 currentGene = (*allGenes)[gene_idx];
356 while( currentGene == 0 && gene_idx < numGenes) {
357 currentGene = (*allGenes)[gene_idx];
358 gene_idx++;
359 }
360 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
361 continue;
362 }
363
364 if ( read_start < currentGene->start ) { // go to next read
365 skippedReadCtr++;
366
367 next_read:
368
369 lineBeginPtr = lineEndPtr;
370 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
371 lineEndPtr++;
372 readCtr += 1;
373 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
374 current_line[lineEndPtr-lineBeginPtr] = '\0';
375 continue;
376 }
377 }
378 }
379
380 //} // end of for all strands
381
382 if (uov != 0 && dov != 0)
383 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
384
385 for(up_idx=0;up_idx<uov;up_idx++) {
386 free_read(upstream_overlap[up_idx]);
387 free(upstream_overlap[up_idx]);
388 }
389
390 for(down_idx=0;down_idx<dov;down_idx++) {
391 free_read(downstream_overlap[down_idx]);
392 free(downstream_overlap[down_idx]);
393 }
394
395 uov = dov = 0;
396
397 free(upstream_overlap);
398 free(downstream_overlap);
399
400 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
401 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
402 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
403 printf("\t%d useless reads\n",uselessReadCtr);
404 printf("\t%d skipped reads\n",skippedReadCtr);
405 printf("\t%d multioccurring\n",multioccurReadCtr);
406 printf("\t%d wrong_strand\n",wrong_strand_ctr);
407 printf("%d reads were totally exonic\n",exonicReadCtr);
408 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
409 printf("%d reads where newly combined from two original reads\n",combined_reads);
410 printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
411
412 status = munmap(reads_area,reads_filesize);
413 if(status != 0)
414 perror("munmap");
415
416 free(current_line);
417 free(seq);
418 free(prb);
419 free(cal_prb);
420 free(chastity);
421 }
422
423
424 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
425 int up_idx, down_idx, success;
426
427 char* up_used_flag = calloc(up_size,sizeof(char));
428 char* down_used_flag = calloc(down_size,sizeof(char));
429
430 Read* currentUpRead;
431 Read* currentDownRead;
432
433 for(up_idx=0;up_idx<up_size;up_idx++) {
434 if( up_used_flag[up_idx] == 1)
435 continue;
436
437 currentUpRead = upstream[up_idx];
438
439 for(down_idx=0;down_idx<down_size;down_idx++) {
440
441 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
442 continue;
443
444 currentDownRead = downstream[down_idx];
445
446 if(currentUpRead->strand != currentDownRead->strand)
447 continue;
448
449 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
450
451 if(success == 1) {
452 up_used_flag[up_idx] = 1;
453 down_used_flag[down_idx] = 1;
454 }
455 }
456 }
457
458 free(up_used_flag);
459 free(down_used_flag);
460 }
461
462 /*
463 * Now we join the candidate reads wherever possible according to the following
464 * scheme:
465 *
466 * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
467 * p1 e1 e2 p2
468 *
469 *
470 *
471 *
472 */
473
474 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
475 // range of possible sequence length on exon side
476 int up_read_start = up_read->pos;
477 //int up_read_stop = up_read->pos+read_size-1;
478
479 int down_read_start = down_read->pos;
480 int down_read_stop = down_read->pos+read_size-1;
481
482 int up_range = exon_stop - up_read_start + 1;
483 int down_range = down_read_stop - exon_start + 1;
484 int retval;
485
486 int u_size, d_size;
487 u_size = d_size = -1;
488
489 if(up_range+down_range < read_size)
490 return 0;
491
492 if (read_nr % 2 != 0) {
493 d_size = down_range;
494 u_size = read_size - d_size;
495 } else {
496 u_size = up_range;
497 d_size = read_size - u_size;
498 }
499
500 if( u_size > up_range || d_size > down_range)
501 return 0;
502
503 const int p_start = exon_stop - u_size + 1;
504 const int p_stop = exon_start + d_size - 1;
505
506 int u_off = p_start - up_read_start;
507 int d_off = exon_start - down_read_start;
508
509 FA(u_off >= 0);
510 FA(d_off >= 0);
511
512 FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
513 FA( u_size + d_size == read_size );
514
515 // seems reasonable up to here
516
517 int buf_size = 4*read_size;
518 char* new_seq = malloc(sizeof(char)*buf_size);
519 memset(new_seq,'z',sizeof(char)*buf_size);
520
521 Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
522
523 int cut_pos = jinfo.first;
524 int additional_pos = jinfo.second;
525
526 buf_size = read_size+1+additional_pos;
527
528 char* new_prb = malloc(sizeof(char)*buf_size);
529 char* new_cal_prb = malloc(sizeof(char)*buf_size);
530 char* new_chastity = malloc(sizeof(char)*buf_size);
531
532 if( jinfo.first == -1 ) {
533 retval = 0;
534 goto free;
535 }
536
537 if( additional_pos > (down_range - d_size) ) {
538 retval = 0;
539 goto free;
540 }
541
542 strncpy(new_prb, up_read->prb+u_off, u_size);
543 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
544 new_prb[read_size+additional_pos] = '\0';
545
546 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
547 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
548 new_cal_prb[read_size+additional_pos] = '\0';
549
550 strncpy(new_chastity, up_read->chastity+u_off, u_size);
551 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
552 new_chastity[read_size+additional_pos] = '\0';
553
554
555 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
556
557 if(status != 1) {
558 retval = 0;
559 goto free;
560 }
561
562 retval = status;
563
564 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
565 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
566
567 //fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
568 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
569
570 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
571 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
572
573 read_nr++;
574 combined_reads++;
575
576 free:
577 free(new_seq);
578 free(new_prb);
579 free(new_cal_prb);
580 free(new_chastity);
581
582 return retval;
583 }
584
585
586 void print_read(Read* cRead) {
587 printf(line_format,
588 cRead->chr, cRead->pos, cRead->seq, cRead->id,
589 cRead->strand, cRead->mismatch, cRead->occurrence,
590 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
591 cRead->chastity);
592 }
593
594
595 void open_file(const char* filename, const char* mode, FILE** fs) {
596 *fs = fopen(filename,mode);
597 if(*fs == NULL) {
598 printf("Error: Could not open file: %s",filename);
599 exit(EXIT_FAILURE);
600 }
601 }
602
603 /*
604 * The program expects 7 arguments, namely:
605 *
606 * - The strand to be used
607 * - The filename of the gff file
608 * - The filename of the reads file
609 * - The name of the spliced reads ouput file
610 * - The name of the unspliced reads ouput file
611 * - The offset for counting new spliced reads
612 * - The offset for counting new unspliced reads
613 *
614 */
615
616 int main(int argc, char* argv[]) {
617
618 if(argc != 8) {
619 printf("%s\n",info);
620 exit(EXIT_FAILURE);
621 }
622
623 int status;
624 int filenameSize = 256;
625 char* gff_filename = malloc(sizeof(char)*filenameSize);
626
627 current_strand = argv[1][0];
628 printf("current strand is %c\n",current_strand);
629 strncpy(gff_filename,argv[2],filenameSize);
630
631 FILE *gff_fs;
632 FILE *reads_fs;
633 FILE *out_fs;
634 FILE *unfiltered_out_fs;
635
636 // open file streams for all needed input/output files
637 open_file(argv[2],"r",&gff_fs);
638 open_file(argv[3],"r",&reads_fs);
639 open_file(argv[4],"w",&out_fs);
640 open_file(argv[5],"w",&unfiltered_out_fs);
641
642 read_nr = strtoul(argv[6],NULL,10);
643 read_nr++;
644
645 unfiltered_read_nr = strtoul(argv[7],NULL,10);
646 unfiltered_read_nr++;
647
648 // allocate and load all genes and then close the gff file stream
649 struct gene** allGenes;
650 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
651 status = fclose(gff_fs);
652 free(gff_filename);
653 if(status != 0)
654 printf("closing of gff filestream failed!\n");
655
656 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
657
658 // check if allGenes is sorted. if not throw away those genes that do not
659 // occur in the sorted order
660 int g_idx;
661 struct gene* currentGene = 0;
662 int nulled_genes=0;
663 int old_gene_stop = -1;
664 for(g_idx=0;g_idx<numGenes;g_idx++) {
665 currentGene = allGenes[g_idx];
666
667 if (! (currentGene->start < currentGene->stop))
668 printf("Invalid positions for gene %s!\n",currentGene->id);
669
670 if (! (old_gene_stop < currentGene->start ) ) {
671 old_gene_stop = currentGene->stop;
672 allGenes[g_idx] = 0;
673 nulled_genes++;
674 continue;
675 }
676 old_gene_stop = currentGene->stop;
677 }
678
679 printf("Found %d unordered genes.\n",nulled_genes);
680 int gidx, eidx;
681 int exon_cov = 0;
682 for(gidx=0;gidx<numGenes;gidx++) {
683 if (allGenes[gidx] == 0)
684 continue;
685
686 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
687 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
688 }}
689 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
690
691 // now that we loaded all neccessary data we start to process the reads
692 process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
693
694 // free all allocated ressources
695 for(g_idx=0;g_idx<numGenes;g_idx++) {
696 if(allGenes[g_idx] != 0) {
697 free_gene(allGenes[g_idx]);
698 free(allGenes[g_idx]);
699 }
700 }
701 free(allGenes);
702
703 status = fclose(reads_fs);
704 status = fclose(out_fs);
705 if(status != 0)
706 perror("fclose");
707
708 return 0;
709 }