+ minor changes in the paths
[qpalma.git] / tools / data_tools / filterReads.c
1 ////////////////////////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
4 //
5 // Notes:
6 //
7 // - Both the read indices and the gff gene indices are one-based
8 //
9 ////////////////////////////////////////////////////////////////////////////////
10
11 #include <sys/mman.h>
12 #include <sys/stat.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <assert.h>
16 #include <string.h>
17 #include <unistd.h>
18 #include <math.h>
19
20 #include "debug_tools.h"
21 #include "join.h"
22 #include "datastructures.h"
23
24 #define _FILE_OFFSET_BITS == 64
25
26 const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
27
28 const int read_size = 36;
29
30 const int min_overlap = 1;
31 const int max_overlap = 35;
32
33 unsigned long read_nr = 1;
34 unsigned long unfiltered_read_nr = 1;
35
36 int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
37 void sort_genes(struct gene*** allGenes, int numGenes);
38 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
39 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
40 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
41 int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
42 Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
43
44 static char *info = "Usage is:\n./filterReads gff reads output";
45
46 void reverse_complement(char** seq, int len);
47
48 char current_strand;
49
50 /*
51 * Some constants specifying the exact behavior of the filter
52 *
53 */
54
55 //#define _FDEBUG_ 0
56 //#define DEBUG_READ 38603
57
58 /*
59 * TODO:
60 * - Check strand -> done simple (only if equal)
61 * - check for [AC] and similar entries -> done simple (see function
62 */
63
64 #ifdef _FDEBUG_
65 if(read_nr == DEBUG_READ) {
66 printf("read nr %lu",read_nr);
67 printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
68 printf("u/d range: %d %d\n",up_range,down_range);
69 printf("u/d size: %d %d\n",u_size,d_size);
70 printf("u/d off: %d %d\n",u_off,d_off);
71 printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
72
73 printf("******************\n");
74
75 printf("%s\n",up_read->seq);
76 //printf("%s\n",new_up_seq);
77
78 printf("******************\n");
79
80 printf("%s\n",down_read->seq);
81 //printf("%s\n",new_down_seq);
82
83 printf("******************\n");
84 printf("%s\n",new_seq);
85 printf("%s\n",new_prb);
86 printf("%s\n",new_cal_prb);
87 printf("%s\n",new_chastity);
88 }
89 #endif // _FDEBUG_
90
91 int combined_reads = 0;
92
93 /*
94 *
95 *
96 */
97
98 void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
99 int status;
100
101 int buffer_size= 64;
102 int chr = 0;
103 int pos = 0;
104 char* seq = malloc(sizeof(char)*buffer_size);
105 unsigned long id = 0;
106 char strand = ' ';
107 int mismatch = 0;
108 int occurrence = 0;
109 int size = 0;
110 int cut = 0;
111 char* prb = malloc(sizeof(char)*buffer_size);
112 char* cal_prb = malloc(sizeof(char)*buffer_size);
113 char* chastity = malloc(sizeof(char)*buffer_size);
114
115 int reads_fid = fileno(reads_fs);
116 struct stat reads_stat;
117 if ( fstat(reads_fid,&reads_stat) == -1) {
118 perror("fstat");
119 exit(EXIT_FAILURE);
120 }
121 off_t reads_filesize = reads_stat.st_size;
122 printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
123 //int numReads = reads_filesize / 178.0;
124
125 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
126 if (reads_area == MAP_FAILED) {
127 perror("mmap");
128 exit(EXIT_FAILURE);
129 }
130 close(reads_fid);
131 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
132
133 char* lineBeginPtr = (char*) reads_area;
134 char* lineEndPtr = (char*) reads_area;
135 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
136
137 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
138 lineEndPtr++;
139
140 char* current_line = malloc(sizeof(char)*512);
141 memset(current_line,0,512);
142
143 int SIZE = 5000;
144 // initialize boundary arrays
145 Read** upstream_overlap = malloc(sizeof(Read*)*SIZE);
146 Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
147 int uov,dov;
148 uov = dov = 0;
149
150 int skippedLinesCounter = 0;
151
152 int prev_exon_start = -1;
153 int prev_exon_stop = -1;
154 int cur_exon_start = -1;
155
156 unsigned long line_size = lineEndPtr - lineBeginPtr;
157 //printf("distance is %lu\n",line_size);
158 strncpy(current_line,lineBeginPtr,line_size);
159 current_line[line_size] = '\0';
160 //printf("%s test",current_line);
161
162 int gene_idx = 0;
163 int exon_idx = 1;
164 struct gene* currentGene = (*allGenes)[gene_idx];
165 char* gene_id = currentGene->id;
166
167 int skippedReadCtr = 0;
168 int uselessReadCtr = 0;
169 int exonicReadCtr = 0;
170
171 int currentOverlapCtr = 0;
172 int previousOverlapCtr = 0;
173
174 int multioccurReadCtr = 0;
175
176 Read* currentRead;
177 int up_idx, down_idx;
178
179 int readCtr = 0;
180 int wrong_strand_ctr = 0;
181 int read_within_gene_ctr = 0;
182 int read_outside_gene_ctr = 0;
183 int read_start, read_stop;
184 // start of the parsing loop
185
186 while(1) {
187 if (gene_idx == numGenes || strcmp(current_line,"") == 0)
188 break;
189
190 gene_id = currentGene->id;
191
192 //if (readCtr != 0 && readCtr % 1000000 == 0) {
193 // printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
194 // printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
195 // printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
196 // printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
197 // printf("\t%d useless reads\n",uselessReadCtr);
198 // printf("\t%d skipped reads\n",skippedReadCtr);
199 // printf("\t%d multioccurring\n",multioccurReadCtr);
200 // printf("\t%d wrong_strand\n",wrong_strand_ctr);
201 // printf("%d within gene\n",read_within_gene_ctr);
202 // printf("%d outside gene\n",read_outside_gene_ctr);
203 // printf("%d reads were totally exonic\n",exonicReadCtr);
204 // printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
205 // printf("%d reads where newly combined from two original reads\n",combined_reads);
206 // printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
207 //}
208
209 // pos of the reads is one-based
210 status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
211 if (status < 12) {
212 skippedLinesCounter++;
213 goto next_read;
214 }
215
216 // if the read is occurring several times elsewhere then get rid of it
217 if ( occurrence != 1 ) {
218 multioccurReadCtr++;
219 goto next_read;
220 }
221
222 //printf("before rc %s\n",seq);
223 if (current_strand == 'P')
224 reverse_complement(&seq,strlen(seq));
225 //printf("after rc %s\n",seq);
226
227 //printf("new read: %s at %d\n",seq,pos);
228
229 // define read start and stop positions
230 read_start = pos;
231 read_stop = pos + read_size-1;
232
233 FA(strlen(seq) >= read_size);
234
235 FA(currentGene != 0);
236
237 if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
238 read_within_gene_ctr++;
239
240 exon_label:
241
242 if (exon_idx == currentGene->num_exons) {
243 gene_idx++;
244 exon_idx = 1;
245 currentGene = (*allGenes)[gene_idx];
246 while( currentGene == 0 && gene_idx < numGenes) {
247 currentGene = (*allGenes)[gene_idx];
248 gene_idx++;
249 }
250 continue;
251 }
252
253 prev_exon_start = currentGene->exon_starts[exon_idx-1];
254 prev_exon_stop = currentGene->exon_stops[exon_idx-1];
255 cur_exon_start = currentGene->exon_starts[exon_idx];
256
257 //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
258
259 if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
260
261 if (uov != 0 && dov != 0)
262 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
263
264 for(up_idx=0;up_idx<uov;up_idx++) {
265 free_read(upstream_overlap[up_idx]);
266 free(upstream_overlap[up_idx]);
267 }
268
269 for(down_idx=0;down_idx<dov;down_idx++) {
270 free_read(downstream_overlap[down_idx]);
271 free(downstream_overlap[down_idx]);
272 }
273
274 uov = dov = 0;
275
276 exon_idx++;
277 goto exon_label;
278 }
279
280 if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
281
282 // output of unused i.e. unspliced reads
283 fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
284 unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
285 unfiltered_read_nr++;
286
287 exonicReadCtr++;
288 goto next_read;
289 }
290
291 if ( read_stop < prev_exon_stop ) { // go to next read
292 //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
293 //if( exon_idx > 1) {
294 // printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
295 // printf("---\n");
296 //}
297
298 uselessReadCtr++;
299 goto next_read;
300 }
301
302 // if this position is reached the read is somehow overlapping or
303 // exactly on the exon boundary.
304 if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
305 //printf("%s\n",current_line);
306 previousOverlapCtr++;
307 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
308 assert (uov < SIZE);
309 upstream_overlap[uov] = currentRead;
310 uov++;
311 goto next_read;
312 }
313
314 if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
315 //printf("%s\n",current_line);
316 currentOverlapCtr++;
317 currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
318 assert (dov < SIZE);
319 downstream_overlap[dov] = currentRead;
320 dov++;
321 goto next_read;
322 }
323
324 uselessReadCtr++;
325 goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
326
327 } else { // read is not within gene borders
328 read_outside_gene_ctr++;
329
330 if (uov != 0 && dov != 0)
331 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
332
333 for(up_idx=0;up_idx<uov;up_idx++) {
334 free_read(upstream_overlap[up_idx]);
335 free(upstream_overlap[up_idx]);
336 }
337
338 for(down_idx=0;down_idx<dov;down_idx++) {
339 free_read(downstream_overlap[down_idx]);
340 free(downstream_overlap[down_idx]);
341 }
342
343 uov = dov = 0;
344
345 if ( currentGene->stop < read_stop ) { // go to next gene
346 gene_idx++;
347 exon_idx = 1;
348 currentGene = (*allGenes)[gene_idx];
349 while( currentGene == 0 && gene_idx < numGenes) {
350 currentGene = (*allGenes)[gene_idx];
351 gene_idx++;
352 }
353 //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
354 continue;
355 }
356
357 if ( read_start < currentGene->start ) { // go to next read
358 skippedReadCtr++;
359
360 next_read:
361
362 lineBeginPtr = lineEndPtr;
363 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
364 lineEndPtr++;
365 readCtr += 1;
366 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
367 current_line[lineEndPtr-lineBeginPtr] = '\0';
368 continue;
369 }
370 }
371 }
372
373 //} // end of for all strands
374
375 if (uov != 0 && dov != 0)
376 combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
377
378 for(up_idx=0;up_idx<uov;up_idx++) {
379 free_read(upstream_overlap[up_idx]);
380 free(upstream_overlap[up_idx]);
381 }
382
383 for(down_idx=0;down_idx<dov;down_idx++) {
384 free_read(downstream_overlap[down_idx]);
385 free(downstream_overlap[down_idx]);
386 }
387
388 uov = dov = 0;
389
390 free(upstream_overlap);
391 free(downstream_overlap);
392
393 printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
394 printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
395 printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
396 printf("\t%d useless reads\n",uselessReadCtr);
397 printf("\t%d skipped reads\n",skippedReadCtr);
398 printf("\t%d multioccurring\n",multioccurReadCtr);
399 printf("\t%d wrong_strand\n",wrong_strand_ctr);
400 printf("%d reads were totally exonic\n",exonicReadCtr);
401 printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
402 printf("%d reads where newly combined from two original reads\n",combined_reads);
403 printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
404
405 status = munmap(reads_area,reads_filesize);
406 if(status != 0)
407 perror("munmap");
408
409 free(current_line);
410 free(seq);
411 free(prb);
412 free(cal_prb);
413 free(chastity);
414 }
415
416
417 void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
418 int up_idx, down_idx, success;
419
420 char* up_used_flag = calloc(up_size,sizeof(char));
421 char* down_used_flag = calloc(down_size,sizeof(char));
422
423 Read* currentUpRead;
424 Read* currentDownRead;
425
426 for(up_idx=0;up_idx<up_size;up_idx++) {
427 if( up_used_flag[up_idx] == 1)
428 continue;
429
430 currentUpRead = upstream[up_idx];
431
432 for(down_idx=0;down_idx<down_size;down_idx++) {
433
434 if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
435 continue;
436
437 currentDownRead = downstream[down_idx];
438
439 if(currentUpRead->strand != currentDownRead->strand)
440 continue;
441
442 success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
443
444 if(success == 1) {
445 up_used_flag[up_idx] = 1;
446 down_used_flag[down_idx] = 1;
447 }
448 }
449 }
450
451 exit(0);
452 free(up_used_flag);
453 free(down_used_flag);
454 }
455
456 /*
457 * Now we join the candidate reads wherever possible according to the following
458 * scheme:
459 *
460 * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
461 * p1 e1 e2 p2
462 *
463 *
464 *
465 *
466 */
467
468 int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
469 // range of possible sequence length on exon side
470 int up_read_start = up_read->pos;
471 //int up_read_stop = up_read->pos+read_size-1;
472
473 int down_read_start = down_read->pos;
474 int down_read_stop = down_read->pos+read_size-1;
475
476 int up_range = exon_stop - up_read_start + 1;
477 int down_range = down_read_stop - exon_start + 1;
478 int retval;
479
480 int u_size, d_size;
481 u_size = d_size = -1;
482
483 if(up_range+down_range < read_size)
484 return 0;
485
486 if (read_nr % 2 != 0) {
487 d_size = down_range;
488 u_size = read_size - d_size;
489 } else {
490 u_size = up_range;
491 d_size = read_size - u_size;
492 }
493
494 if( u_size > up_range || d_size > down_range)
495 return 0;
496
497 int p_start = exon_stop - u_size + 1;
498 int p_stop = exon_start + d_size - 1;
499
500 int u_off = p_start - up_read_start;
501 int d_off = exon_start - down_read_start;
502
503 FA( u_off >= 0 && d_off >= 0 );
504 FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
505 FA( u_size + d_size == read_size );
506
507 // seems reasonable up to here
508
509 int buf_size = 4*read_size;
510 char* new_seq = malloc(sizeof(char)*buf_size);
511 memset(new_seq,'z',sizeof(char)*buf_size);
512
513 if ( current_strand == 'P' ) {
514 printf("flipping read sequences...\n");
515 printf("%s %s\n",up_read->seq,down_read->seq);
516 char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
517 strncpy(tmp,up_read->seq,strlen(up_read->seq));
518 tmp[strlen(up_read->seq)]='\0';
519
520 realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
521 strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
522 up_read->seq[strlen(down_read->seq)] = '\0';
523
524 if (up_read->seq == NULL)
525 perror("realloc\n");
526
527 realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
528 strncpy(down_read->seq,tmp,strlen(tmp));
529 down_read->seq[strlen(tmp)] = '\0';
530
531 if (down_read->seq == NULL)
532 perror("realloc\n");
533
534 free(tmp);
535 printf("flipping done...\n");
536 printf("%s %s\n",up_read->seq,down_read->seq);
537 }
538
539 printf("start joining...\n");
540 Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
541 printf("end of joining...\n");
542
543 printf("new_seq is %s\n",new_seq);
544
545
546 int cut_pos = jinfo.first;
547 int additional_pos = jinfo.second;
548 printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
549
550 buf_size = read_size+1+additional_pos;
551
552 printf("allocating quality arrays (size=%d)...\n",buf_size);
553 char* new_prb = malloc(sizeof(char)*buf_size);
554 char* new_cal_prb = malloc(sizeof(char)*buf_size);
555 char* new_chastity = malloc(sizeof(char)*buf_size);
556
557 if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
558 perror("malloc\n");
559
560 if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
561 retval = 0;
562 goto free;
563 }
564
565 printf("joining qualities...\n");
566 strncpy(new_prb, up_read->prb+u_off, u_size);
567 strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
568 new_prb[buf_size-1] = '\0';
569
570 strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
571 strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
572 new_cal_prb[buf_size-1] = '\0';
573
574 strncpy(new_chastity, up_read->chastity+u_off, u_size);
575 strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
576 new_chastity[buf_size-1] = '\0';
577 printf("end of joining qualities...\n");
578
579 //printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
580 //printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
581 //if ( current_strand == 'P' ) {
582 // int alpha = read_size - u_off - 1;
583 // int beta = alpha - u_size ;
584 // p_start = up_read->pos + beta + 1;
585 // exon_stop = up_read->pos + alpha;
586 // alpha = read_size - d_off - 1;
587 // beta = alpha - (read_size - u_size);
588 // exon_start = down_read->pos + beta + 1;
589 // p_stop = down_read->pos + alpha;
590 //}
591
592 int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
593
594 if(status != 1) {
595 retval = 0;
596 goto free;
597 }
598
599 retval = status;
600
601 fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
602 read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
603
604 read_nr++;
605 combined_reads++;
606
607 free:
608 free(new_seq);
609 free(new_prb);
610 free(new_cal_prb);
611 free(new_chastity);
612
613 return retval;
614 }
615
616 static char* translation = "T G C A [ ]";
617
618 void reverse_complement(char** seq, int len) {
619 int idx;
620 char *temp = malloc(sizeof(char)*len);
621 for(idx=0;idx<len;idx++)
622 temp[idx] = translation[(*seq)[idx]-65];
623
624 idx=0;
625 int temp_idx=len-1;
626 while(1) {
627 if (temp[temp_idx] == ']') {
628 (*seq)[idx] = '[';
629 (*seq)[idx+1] = temp[temp_idx-2];
630 (*seq)[idx+2] = temp[temp_idx-1];
631 (*seq)[idx+3] = ']';
632 idx += 4;
633 temp_idx -= 4;
634 } else {
635 (*seq)[idx] = temp[temp_idx];
636 idx++;
637 temp_idx--;
638 }
639
640 if (idx == len || temp_idx == -1)
641 break;
642 }
643 free(temp);
644 }
645
646
647 void print_read(Read* cRead) {
648 printf(line_format,
649 cRead->chr, cRead->pos, cRead->seq, cRead->id,
650 cRead->strand, cRead->mismatch, cRead->occurrence,
651 cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
652 cRead->chastity);
653 }
654
655
656 void open_file(const char* filename, const char* mode, FILE** fs) {
657 *fs = fopen(filename,mode);
658 if(*fs == NULL) {
659 printf("Error: Could not open file: %s",filename);
660 exit(EXIT_FAILURE);
661 }
662 }
663
664 /*
665 * The program expects 7 arguments, namely:
666 *
667 * - The strand to be used
668 * - The filename of the gff file
669 * - The filename of the reads file
670 * - The name of the spliced reads ouput file
671 * - The name of the unspliced reads ouput file
672 * - The offset for counting new spliced reads
673 * - The offset for counting new unspliced reads
674 *
675 */
676
677 int main(int argc, char* argv[]) {
678
679 if(argc != 8) {
680 printf("%s\n",info);
681 exit(EXIT_FAILURE);
682 }
683
684 current_strand = argv[1][0];
685 printf("Current strand is %c\n",current_strand);
686
687 int status;
688 int filenameSize = 256;
689 char* gff_filename = malloc(sizeof(char)*filenameSize);
690
691 strncpy(gff_filename,argv[2],filenameSize);
692
693 FILE *gff_fs;
694 FILE *reads_fs;
695 FILE *out_fs;
696 FILE *unfiltered_out_fs;
697
698 // open file streams for all needed input/output files
699 open_file(argv[2],"r",&gff_fs);
700 open_file(argv[3],"r",&reads_fs);
701 open_file(argv[4],"w",&out_fs);
702 open_file(argv[5],"w",&unfiltered_out_fs);
703
704 read_nr = strtoul(argv[6],NULL,10);
705 read_nr++;
706
707 unfiltered_read_nr = strtoul(argv[7],NULL,10);
708 unfiltered_read_nr++;
709
710 // allocate and load all genes and then close the gff file stream
711 struct gene** allGenes;
712 int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
713 status = fclose(gff_fs);
714 free(gff_filename);
715 if(status != 0)
716 printf("closing of gff filestream failed!\n");
717
718 printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
719
720 // check if allGenes is sorted. if not throw away those genes that do not
721 // occur in the sorted order
722 int g_idx;
723 struct gene* currentGene = 0;
724 int nulled_genes=0;
725 int old_gene_stop = -1;
726 for(g_idx=0;g_idx<numGenes;g_idx++) {
727 currentGene = allGenes[g_idx];
728
729 if (! (currentGene->start < currentGene->stop))
730 printf("Invalid positions for gene %s!\n",currentGene->id);
731
732 if (! (old_gene_stop < currentGene->start ) ) {
733 printf("Gene %s is overlapping\n",currentGene->id);
734 old_gene_stop = currentGene->stop;
735 free_gene(allGenes[g_idx]);
736 free(allGenes[g_idx]);
737 allGenes[g_idx] = 0;
738 nulled_genes++;
739 continue;
740 }
741 old_gene_stop = currentGene->stop;
742 }
743
744 printf("Found %d unordered genes.\n",nulled_genes);
745 int gidx, eidx;
746 int exon_cov = 0;
747 for(gidx=0;gidx<numGenes;gidx++) {
748 if (allGenes[gidx] == 0)
749 continue;
750
751 for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
752 exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
753 }}
754 printf("Exon coverage is %f\n",(double)exon_cov/30432563);
755
756 // now that we loaded all neccessary data we start to process the reads
757 process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
758
759 // free all allocated ressources
760 for(g_idx=0;g_idx<numGenes;g_idx++) {
761 if(allGenes[g_idx] != 0) {
762 free_gene(allGenes[g_idx]);
763 free(allGenes[g_idx]);
764 }
765 }
766 free(allGenes);
767
768 status = fclose(reads_fs);
769 status = fclose(out_fs);
770 if(status != 0)
771 perror("fclose");
772
773 return 0;
774 }