1 ////////////////////////////////////////////////////////////
2 // The purpose of this program is to read a gff and a
3 // solexa reads file and create a data set used by QPalma.
7 ////////////////////////////////////////////////////////////
14 #include "datastructures.h"
16 int parse_gff(char* filename
,FILE* fid
,struct gene
*** allGenes
);
18 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
);
20 static char *info
= "Usage is:\n./filterReads gff reads";
22 int main(int argc
, char* argv
[]) {
29 //static size_t page_size;
30 // page_size = (size_t) sysconf (_SC_PAGESIZE);
32 int filenameSize
= 256;
33 char *gff_filename
= malloc(sizeof(char)*filenameSize
);
34 char *reads_filename
= malloc(sizeof(char)*filenameSize
);
36 strncpy(gff_filename
,argv
[1],filenameSize
);
37 strncpy(reads_filename
,argv
[2],filenameSize
);
39 FILE *gff_fs
= fopen(gff_filename
,"r");
40 FILE *reads_fs
= fopen(reads_filename
,"r");
43 printf("Error: Could not open file: %s",gff_filename
);
47 if(reads_fs
== NULL
) {
48 printf("Error: Could not open file: %s",reads_filename
);
52 struct gene
** allGenes
;
53 int numGenes
= parse_gff(gff_filename
,gff_fs
,&allGenes
);
54 status
= fclose(gff_fs
);
56 printf("closing of gff filestream failed!\n");
58 process_reads(reads_fs
,&allGenes
,numGenes
);
60 status
+= fclose(reads_fs
);
62 printf("closing of filestreams failed!\n");
71 void process_reads(FILE* reads_fs
,struct gene
*** allGenes
,int numGenes
) {
77 char* seq
= malloc(sizeof(char)*buffer_size
);
84 char* prb
= malloc(sizeof(char)*buffer_size
);
85 char* cal_prb
= malloc(sizeof(char)*buffer_size
);
86 char* chastity
= malloc(sizeof(char)*buffer_size
);
88 int reads_fid
= fileno(reads_fs
);
89 size_t mmapAreaSize
= 20000;
91 void *reads_area
= mmap (NULL
,mmapAreaSize
,PROT_READ
|PROT_WRITE
,MAP_PRIVATE
,reads_fid
,0);
92 if((long int) reads_area
== -1)
93 printf("mmapping failed!\n");
95 void* linePtr
= reads_area
;
97 char *test_string
= malloc(sizeof(char)*10);
98 strncpy(test_string
,(char*)reads_area
,10);
100 status
= munmap(reads_area
,mmapAreaSize
);
102 printf("munmap failed!\n");
105 void** upstream_overlap
;
107 void** downstream_start
;
108 void** downstream_overlap
;
114 int skippedLinesCounter
= 0;
116 printf("Entering loop...\n");
119 for (gene_idx
= 0; gene_idx
< numGenes
;) {
120 struct gene
* currentGene
= (*allGenes
)[gene_idx
];
123 for(exon_idx
=1;exon_idx
<currentGene
->num_exons
-1;exon_idx
++) {
124 int prev_exon_stop
= currentGene
->exon_stops
[exon_idx
-1];
125 int cur_exon_start
= currentGene
->exon_starts
[exon_idx
];
127 if (cur_exon_start
- prev_exon_stop
< 6)
130 // initialize boundary arrays
131 upstream_end
= malloc(sizeof(void*)*SIZE
);
132 upstream_overlap
= malloc(sizeof(void*)*SIZE
);
133 downstream_start
= malloc(sizeof(void*)*SIZE
);
134 downstream_overlap
= malloc(sizeof(void*)*SIZE
);
140 printf("Fetching reads line...\n");
144 status
= sscanf((char*)linePtr
,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
145 &chr
,&pos
,seq
,&id
,&strand
,&mismatch
,&repetition
,&size
,&cut
,prb
,cal_prb
,chastity
);
147 skippedLinesCounter
++;
151 printf("read line!");
153 if (currentGene
->start
<= pos
&& pos
+35 <= currentGene
->stop
) {
154 printf("gene boundaries: %d %d, pos: %d\n",currentGene
->start
,currentGene
->stop
,pos
);
157 if (pos
+35 == prev_exon_stop
) {
158 upstream_end
[ue
] = linePtr
;
162 // upstream overlapping
163 if (prev_exon_stop
- pos
<= 30) {
164 upstream_overlap
[uo
] = linePtr
;
168 // downstream starting
169 if (pos
== cur_exon_start
) {
170 downstream_start
[ds
] = linePtr
;
174 // downstream overlapping
175 if (cur_exon_start
- pos
>= 6) {
176 downstream_overlap
[dov
] = linePtr
;
180 while (*(char*)linePtr
!= '\n') {
181 printf("linePtr is %d",linePtr
);
189 free(upstream_overlap
);
190 free(downstream_start
);
191 free(downstream_overlap
);
194 while (*(char*)linePtr
!= '\n') {
195 printf("linePtr is %d",linePtr
);
207 printf("skipped %d lines.\n",skippedLinesCounter
);
215 // int skippedLinesCounter = 0;
217 // status = fscanf(reads_fs,"%d\t%d\t%s\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",&chr,&pos,seq,&id,&strand,&mismatch,&repetition,&size,&cut,prb,cal_prb,chastity);
221 // if (status < 12) {
222 // skippedLinesCounter++;
228 // if (pos < currentGene->start)
231 // if (currentGene->stop < pos) {
233 // currentGene = (*allGenes)[gene_idx];