+ added reverse complementation to filter reads function
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 //
6 //
7 // Notes:
8 //
9 // - Both the read indices and the gff gene indices are one-based
10 //
11 //
12 ////////////////////////////////////////////////////////////////////////////////
13
14 #include <sys/mman.h>
15 #include <sys/stat.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <assert.h>
19 #include <string.h>
20 #include <unistd.h>
21 #include <math.h>
22
23 #include "debug_tools.h"
24 #include "join.h"
25 #include "datastructures.h"
26
27 #define _FILE_OFFSET_BITS == 64
28
29 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
30
31 const int read_size = 36;
32
33 const int min_overlap = 1;
34 const int max_overlap = 35;
35
36 unsigned long read_nr = 1;
37 unsigned long unfiltered_read_nr = 1;
38
39 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
40 void sort_genes(struct gene*** allGenes, int numGenes);
41 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
42 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
43 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
44 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
45 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
46
47 static char *info = "Usage is:\n./filterReads gff reads output";
48
49 void reverse_complement(char** seq, int len);
50
51 char current_strand;
52
53 /*
54 * Some constants specifying the exact behavior of the filter
55 *
56 */
57
58 //#define _FDEBUG_ 0
59 //#define DEBUG_READ 38603
60
61
62 //
63 /*
64 * TODO:
65 * - Check strand -> done simple (only if equal)
66 * - check for [AC] and similar entries -> done simple (see function
67 */
68
69 #ifdef _FDEBUG_
70 if(read_nr == DEBUG_READ) {
71 printf("read nr %lu",read_nr);
72 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
73 printf("u/d range: %d %d\n",up_range,down_range);
74 printf("u/d size: %d %d\n",u_size,d_size);
75 printf("u/d off: %d %d\n",u_off,d_off);
76 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
77
78 printf("******************\n");
79
80 printf("%s\n",up_read->seq);
81 //printf("%s\n",new_up_seq);
82
83 printf("******************\n");
84
85 printf("%s\n",down_read->seq);
86 //printf("%s\n",new_down_seq);
87
88 printf("******************\n");
89 printf("%s\n",new_seq);
90 printf("%s\n",new_prb);
91 printf("%s\n",new_cal_prb);
92 printf("%s\n",new_chastity);
93 }
94 #endif // _FDEBUG_
95
96 int combined_reads = 0;
97
98 /*
99 *
100 *
101 */
102
103 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
104 int status;
105
106 int buffer_size= 64;
107 int chr = 0;
108 int pos = 0;
109 char* seq = malloc(sizeof(char)*buffer_size);
110 unsigned long id = 0;
111 char strand = ' ';
112 int mismatch = 0;
113 int occurrence = 0;
114 int size = 0;
115 int cut = 0;
116 char* prb = malloc(sizeof(char)*buffer_size);
117 char* cal_prb = malloc(sizeof(char)*buffer_size);
118 char* chastity = malloc(sizeof(char)*buffer_size);
119
120 int reads_fid = fileno(reads_fs);
121 struct stat reads_stat;
122 if ( fstat(reads_fid,&reads_stat) == -1) {
123 perror("fstat");
124 exit(EXIT_FAILURE);
125 }
126 off_t reads_filesize = reads_stat.st_size;
127 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
128 //int numReads = reads_filesize / 178.0;
129
130 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
131 if (reads_area == MAP_FAILED) {
132 perror("mmap");
133 exit(EXIT_FAILURE);
134 }
135 close(reads_fid);
136 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
137
138 char* lineBeginPtr = (char*) reads_area;
139 char* lineEndPtr = (char*) reads_area;
140 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
141
142 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
143 lineEndPtr++;
144
145 char* current_line = malloc(sizeof(char)*512);
146 memset(current_line,0,512);
147
148 int SIZE = 5000;
149 // initialize boundary arrays
150 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
151 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
152 int uov,dov;
153 uov = dov = 0;
154
155 int skippedLinesCounter = 0;
156
157 int prev_exon_start = -1;
158 int prev_exon_stop = -1;
159 int cur_exon_start = -1;
160
161 unsigned long line_size = lineEndPtr - lineBeginPtr;
162 //printf("distance is %lu\n",line_size);
163 strncpy(current_line,lineBeginPtr,line_size);
164 current_line[line_size] = '\0';
165 //printf("%s test",current_line);
166
167 int gene_idx = 0;
168 int exon_idx = 1;
169 struct gene* currentGene = (*allGenes)[gene_idx];
170 char* gene_id = currentGene->id;
171
172 int skippedReadCtr = 0;
173 int uselessReadCtr = 0;
174 int exonicReadCtr = 0;
175
176 int currentOverlapCtr = 0;
177 int previousOverlapCtr = 0;
178
179 int multioccurReadCtr = 0;
180
181 Read* currentRead;
182 int up_idx, down_idx;
183
184 int readCtr = 0;
185 int wrong_strand_ctr = 0;
186 int read_within_gene_ctr = 0;
187 int read_outside_gene_ctr = 0;
188 int read_start, read_stop;
189 // start of the parsing loop
190
191 while(1) {
192 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
193 break;
194
195 gene_id = currentGene->id;
196
197 //if (readCtr != 0 && readCtr % 1000000 == 0) {
198 // printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
199 // printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
200 // printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
201 // printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
202 // printf("\t%d useless reads\n",uselessReadCtr);
203 // printf("\t%d skipped reads\n",skippedReadCtr);
204 // printf("\t%d multioccurring\n",multioccurReadCtr);
205 // printf("\t%d wrong_strand\n",wrong_strand_ctr);
206 // printf("%d within gene\n",read_within_gene_ctr);
207 // printf("%d outside gene\n",read_outside_gene_ctr);
208 // printf("%d reads were totally exonic\n",exonicReadCtr);
209 // printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
210 // printf("%d reads where newly combined from two original reads\n",combined_reads);
211 // printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
212 //}
213
214 // pos of the reads is one-based
215 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
216 if (status < 12) {
217 skippedLinesCounter++;
218 goto next_read;
219 }
220
221 // if the read is occurring several times elsewhere then get rid of it
222 if ( occurrence != 1 ) {
223 multioccurReadCtr++;
224 goto next_read;
225 }
226
227 //printf("before rc %s\n",seq);
228 if (current_strand == 'P')
229 reverse_complement(&seq,strlen(seq));
230 //printf("after rc %s\n",seq);
231
232 // define read start and stop positions
233 read_start = pos;
234 read_stop = pos + read_size-1;
235
236 FA(strlen(seq) >= read_size);
237
238 FA(currentGene != 0);
239
240 if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
241 read_within_gene_ctr++;
242
243 exon_label:
244
245 if (exon_idx == currentGene->num_exons) {
246 gene_idx++;
247 exon_idx = 1;
248 currentGene = (*allGenes)[gene_idx];
249 while( currentGene == 0 && gene_idx < numGenes) {
250 currentGene = (*allGenes)[gene_idx];
251 gene_idx++;
252 }
253 continue;
254 }
255
256 prev_exon_start = currentGene->exon_starts[exon_idx-1];
257 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
258 cur_exon_start = currentGene->exon_starts[exon_idx];
259
260 //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
261
262 if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
263
264 if (uov != 0 && dov != 0)
265 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
266
267 for(up_idx=0;up_idx<uov;up_idx++) {
268 free_read(upstream_overlap[up_idx]);
269 free(upstream_overlap[up_idx]);
270 }
271
272 for(down_idx=0;down_idx<dov;down_idx++) {
273 free_read(downstream_overlap[down_idx]);
274 free(downstream_overlap[down_idx]);
275 }
276
277 uov = dov = 0;
278
279 exon_idx++;
280 goto exon_label;
281 }
282
283 if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
284
285 // output of unused i.e. unspliced reads
286 fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
287 unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
288 unfiltered_read_nr++;
289
290 exonicReadCtr++;
291 goto next_read;
292 }
293
294 if ( read_stop < prev_exon_stop ) { // go to next read
295 //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
296 //if( exon_idx > 1) {
297 // printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
298 // printf("---\n");
299 //}
300
301 uselessReadCtr++;
302 goto next_read;
303 }
304
305 // if this position is reached the read is somehow overlapping or
306 // exactly on the exon boundary.
307 if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
308 //printf("%s\n",current_line);
309 previousOverlapCtr++;
310 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
311 assert (uov < SIZE);
312 upstream_overlap[uov] = currentRead;
313 uov++;
314 goto next_read;
315 }
316
317 if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
318 //printf("%s\n",current_line);
319 currentOverlapCtr++;
320 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
321 assert (dov < SIZE);
322 downstream_overlap[dov] = currentRead;
323 dov++;
324 goto next_read;
325 }
326
327 uselessReadCtr++;
328 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
329
330 } else { // read is not within gene borders
331 read_outside_gene_ctr++;
332
333 if (uov != 0 && dov != 0)
334 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
335
336 for(up_idx=0;up_idx<uov;up_idx++) {
337 free_read(upstream_overlap[up_idx]);
338 free(upstream_overlap[up_idx]);
339 }
340
341 for(down_idx=0;down_idx<dov;down_idx++) {
342 free_read(downstream_overlap[down_idx]);
343 free(downstream_overlap[down_idx]);
344 }
345
346 uov = dov = 0;
347
348 if ( currentGene->stop < read_stop ) { // go to next gene
349 gene_idx++;
350 exon_idx = 1;
351 currentGene = (*allGenes)[gene_idx];
352 while( currentGene == 0 && gene_idx < numGenes) {
353 currentGene = (*allGenes)[gene_idx];
354 gene_idx++;
355 }
356 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
357 continue;
358 }
359
360 if ( read_start < currentGene->start ) { // go to next read
361 skippedReadCtr++;
362
363 next_read:
364
365 lineBeginPtr = lineEndPtr;
366 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
367 lineEndPtr++;
368 readCtr += 1;
369 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
370 current_line[lineEndPtr-lineBeginPtr] = '\0';
371 continue;
372 }
373 }
374 }
375
376 //} // end of for all strands
377
378 if (uov != 0 && dov != 0)
379 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
380
381 for(up_idx=0;up_idx<uov;up_idx++) {
382 free_read(upstream_overlap[up_idx]);
383 free(upstream_overlap[up_idx]);
384 }
385
386 for(down_idx=0;down_idx<dov;down_idx++) {
387 free_read(downstream_overlap[down_idx]);
388 free(downstream_overlap[down_idx]);
389 }
390
391 uov = dov = 0;
392
393 free(upstream_overlap);
394 free(downstream_overlap);
395
396 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
397 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
398 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
399 printf("\t%d useless reads\n",uselessReadCtr);
400 printf("\t%d skipped reads\n",skippedReadCtr);
401 printf("\t%d multioccurring\n",multioccurReadCtr);
402 printf("\t%d wrong_strand\n",wrong_strand_ctr);
403 printf("%d reads were totally exonic\n",exonicReadCtr);
404 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
405 printf("%d reads where newly combined from two original reads\n",combined_reads);
406 printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
407
408 status = munmap(reads_area,reads_filesize);
409 if(status != 0)
410 perror("munmap");
411
412 free(current_line);
413 free(seq);
414 free(prb);
415 free(cal_prb);
416 free(chastity);
417 }
418
419
420 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
421 int up_idx, down_idx, success;
422
423 char* up_used_flag = calloc(up_size,sizeof(char));
424 char* down_used_flag = calloc(down_size,sizeof(char));
425
426 Read* currentUpRead;
427 Read* currentDownRead;
428
429 for(up_idx=0;up_idx<up_size;up_idx++) {
430 if( up_used_flag[up_idx] == 1)
431 continue;
432
433 currentUpRead = upstream[up_idx];
434
435 for(down_idx=0;down_idx<down_size;down_idx++) {
436
437 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
438 continue;
439
440 currentDownRead = downstream[down_idx];
441
442 if(currentUpRead->strand != currentDownRead->strand)
443 continue;
444
445 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
446
447 if(success == 1) {
448 up_used_flag[up_idx] = 1;
449 down_used_flag[down_idx] = 1;
450 }
451 }
452 }
453
454 free(up_used_flag);
455 free(down_used_flag);
456 }
457
458 /*
459 * Now we join the candidate reads wherever possible according to the following
460 * scheme:
461 *
462 * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
463 * p1 e1 e2 p2
464 *
465 *
466 *
467 *
468 */
469
470 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
471 // range of possible sequence length on exon side
472 int up_read_start = up_read->pos;
473 //int up_read_stop = up_read->pos+read_size-1;
474
475 int down_read_start = down_read->pos;
476 int down_read_stop = down_read->pos+read_size-1;
477
478 int up_range = exon_stop - up_read_start + 1;
479 int down_range = down_read_stop - exon_start + 1;
480 int retval;
481
482
483 int u_size, d_size;
484 u_size = d_size = -1;
485
486 if(up_range+down_range < read_size)
487 return 0;
488
489 if (read_nr % 2 != 0) {
490 d_size = down_range;
491 u_size = read_size - d_size;
492 } else {
493 u_size = up_range;
494 d_size = read_size - u_size;
495 }
496
497 if( u_size > up_range || d_size > down_range)
498 return 0;
499
500 const int p_start = exon_stop - u_size + 1;
501 const int p_stop = exon_start + d_size - 1;
502
503 int u_off = p_start - up_read_start;
504 int d_off = exon_start - down_read_start;
505
506 FA(u_off >= 0);
507 FA(d_off >= 0);
508
509 FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
510 FA( u_size + d_size == read_size );
511
512 // seems reasonable up to here
513
514 int buf_size = 4*read_size;
515 char* new_seq = malloc(sizeof(char)*buf_size);
516 memset(new_seq,'z',sizeof(char)*buf_size);
517
518 Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
519
520 int cut_pos = jinfo.first;
521 int additional_pos = jinfo.second;
522
523 buf_size = read_size+1+additional_pos;
524
525 char* new_prb = malloc(sizeof(char)*buf_size);
526 char* new_cal_prb = malloc(sizeof(char)*buf_size);
527 char* new_chastity = malloc(sizeof(char)*buf_size);
528
529 if( jinfo.first == -1 ) {
530 retval = 0;
531 goto free;
532 }
533
534 if( additional_pos > (down_range - d_size) ) {
535 retval = 0;
536 goto free;
537 }
538
539 strncpy(new_prb, up_read->prb+u_off, u_size);
540 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
541 new_prb[read_size+additional_pos] = '\0';
542
543 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
544 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
545 new_cal_prb[read_size+additional_pos] = '\0';
546
547 strncpy(new_chastity, up_read->chastity+u_off, u_size);
548 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
549 new_chastity[read_size+additional_pos] = '\0';
550
551 printf("old reads: %s %s\n",up_read->seq,down_read->seq);
552 printf("new read: %s %d\n",new_seq,cut_pos);
553
554 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
555
556 if(status != 1) {
557 retval = 0;
558 goto free;
559 }
560
561 retval = status;
562
563 //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
564 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
565
566 //fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
567 // read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
568
569 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
570 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
571
572 read_nr++;
573 combined_reads++;
574
575 free:
576 free(new_seq);
577 free(new_prb);
578 free(new_cal_prb);
579 free(new_chastity);
580
581 return retval;
582 }
583
584 static char* translation = "T G C A [ ]";
585
586 void reverse_complement(char** seq, int len) {
587 int idx;
588 char *temp = malloc(sizeof(char)*len);
589 for(idx=0;idx<len;idx++)
590 temp[idx] = translation[(*seq)[idx]-65];
591
592 idx=0;
593 int temp_idx=len-1;
594 while(1) {
595 if (temp[temp_idx] == ']') {
596 (*seq)[idx] = '[';
597 (*seq)[idx+1] = temp[temp_idx-2];
598 (*seq)[idx+2] = temp[temp_idx-1];
599 (*seq)[idx+3] = ']';
600 idx += 4;
601 temp_idx -= 4;
602 } else {
603 (*seq)[idx] = temp[temp_idx];
604 idx++;
605 temp_idx--;
606 }
607
608 if (idx == len || temp_idx == -1)
609 break;
610 }
611 }
612
613
614 void print_read(Read* cRead) {
615 printf(line_format,
616 cRead->chr, cRead->pos, cRead->seq, cRead->id,
617 cRead->strand, cRead->mismatch, cRead->occurrence,
618 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
619 cRead->chastity);
620 }
621
622
623 void open_file(const char* filename, const char* mode, FILE** fs) {
624 *fs = fopen(filename,mode);
625 if(*fs == NULL) {
626 printf("Error: Could not open file: %s",filename);
627 exit(EXIT_FAILURE);
628 }
629 }
630
631 /*
632 * The program expects 7 arguments, namely:
633 *
634 * - The strand to be used
635 * - The filename of the gff file
636 * - The filename of the reads file
637 * - The name of the spliced reads ouput file
638 * - The name of the unspliced reads ouput file
639 * - The offset for counting new spliced reads
640 * - The offset for counting new unspliced reads
641 *
642 */
643
644
645 int main(int argc, char* argv[]) {
646
647 if(argc != 8) {
648 printf("%s\n",info);
649 exit(EXIT_FAILURE);
650 }
651
652 current_strand = argv[1][0];
653 printf("Current strand is %c\n",current_strand);
654
655 int status;
656 int filenameSize = 256;
657 char* gff_filename = malloc(sizeof(char)*filenameSize);
658
659 strncpy(gff_filename,argv[2],filenameSize);
660
661 FILE *gff_fs;
662 FILE *reads_fs;
663 FILE *out_fs;
664 FILE *unfiltered_out_fs;
665
666 // open file streams for all needed input/output files
667 open_file(argv[2],"r",&gff_fs);
668 open_file(argv[3],"r",&reads_fs);
669 open_file(argv[4],"w",&out_fs);
670 open_file(argv[5],"w",&unfiltered_out_fs);
671
672 read_nr = strtoul(argv[6],NULL,10);
673 read_nr++;
674
675 unfiltered_read_nr = strtoul(argv[7],NULL,10);
676 unfiltered_read_nr++;
677
678 // allocate and load all genes and then close the gff file stream
679 struct gene** allGenes;
680 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
681 status = fclose(gff_fs);
682 free(gff_filename);
683 if(status != 0)
684 printf("closing of gff filestream failed!\n");
685
686 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
687
688 // check if allGenes is sorted. if not throw away those genes that do not
689 // occur in the sorted order
690 int g_idx;
691 struct gene* currentGene = 0;
692 int nulled_genes=0;
693 int old_gene_stop = -1;
694 for(g_idx=0;g_idx<numGenes;g_idx++) {
695 currentGene = allGenes[g_idx];
696
697 if (! (currentGene->start < currentGene->stop))
698 printf("Invalid positions for gene %s!\n",currentGene->id);
699
700 if (! (old_gene_stop < currentGene->start ) ) {
701 printf("Gene %s is overlapping\n",currentGene->id);
702 old_gene_stop = currentGene->stop;
703 allGenes[g_idx] = 0;
704 nulled_genes++;
705 continue;
706 }
707 old_gene_stop = currentGene->stop;
708 }
709
710 printf("Found %d unordered genes.\n",nulled_genes);
711 int gidx, eidx;
712 int exon_cov = 0;
713 for(gidx=0;gidx<numGenes;gidx++) {
714 if (allGenes[gidx] == 0)
715 continue;
716
717 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
718 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
719 }}
720 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
721
722 // now that we loaded all neccessary data we start to process the reads
723 process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
724
725 // free all allocated ressources
726 for(g_idx=0;g_idx<numGenes;g_idx++) {
727 if(allGenes[g_idx] != 0) {
728 free_gene(allGenes[g_idx]);
729 free(allGenes[g_idx]);
730 }
731 }
732 free(allGenes);
733
734 status = fclose(reads_fs);
735 status = fclose(out_fs);
736 if(status != 0)
737 perror("fclose");
738
739 return 0;
740 }