process_reads(reads_fs,&allGenes,numGenes,out_fs);
status = fclose(reads_fs);
- status += fclose(out_fs);
+ status = fclose(out_fs);
if(status != 0)
- printf("closing of filestreams failed!\n");
+ perror("fclose");
//free(allGenes);
free(gff_filename);
exit(EXIT_FAILURE);
}
off_t reads_filesize = reads_stat.st_size;
-
printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
+ int numReads = reads_filesize / 178.0;
void *reads_area = mmap (NULL,reads_filesize,PROT_READ|PROT_WRITE,MAP_PRIVATE,reads_fid,0);
if (reads_area == MAP_FAILED) {
exit(EXIT_FAILURE);
}
close(reads_fid);
-
printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
void* linePtr = reads_area;
struct gene* currentGene = (*allGenes)[gene_idx];
int readCtr = 0;
+ int old_gene_stop = -1;
+ int old_pos = 0;
+ char* posPtr;
+ int new_chr = 0;
+ int new_pos = 0;
+ char* tmp_line = malloc(sizeof(char)*256);
// start of the parsing loop
while(1) {
if (gene_idx == numGenes || strcmp(current_line,"") == 0)
break;
- if (readCtr != 0 && readCtr % 100000 == 0)
- printf("Processed %d reads.\n",readCtr);
+ if (readCtr != 0 && readCtr % 1000000 == 0)
+ printf("Processed %d/%d genes and %d/%d reads.\n",readCtr,gene_idx,numGenes,readCtr,numReads);
- //if (readCtr == 10000000)
- // exit(EXIT_FAILURE);
+ //if (gene_idx >= 1833)
+ // printf("currentGene start/stop: %d/%d. Positions is %d\n",currentGene->start,currentGene->stop,pos);
+ old_pos = pos;
status = sscanf(current_line,"%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n",
&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
if (status < 12) {
skippedLinesCounter++;
}
-
//printf("gene start/stop %d,%d, pos: %d\n",currentGene->start,currentGene->stop,pos);
-
// if the read is occurring several times elsewhere then get rid of it
if(occurrence != 1) {
while (*(char*)linePtr != '\n') linePtr++;
if (!(currentGene->start <= pos && (pos + read_size-1) <= currentGene->stop)) { // read is not within gene borders
- if ( currentGene->stop < (pos + read_size-1) ) { // go to next gene
+ if ( currentGene->stop < (pos + read_size-1) || currentGene->start < old_gene_stop ) { // go to next gene
gene_idx++;
exon_idx = 1;
+ old_gene_stop = currentGene->stop ;
currentGene = (*allGenes)[gene_idx];
+ //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
+ ue = uo = ds = dov = 0;
continue;
}
- if ( pos < currentGene->start ) { // go to next read
+ if ( pos < currentGene->start || pos < old_pos) { // go to next read
next_read:
+ //posPtr = linePtr;
+ //while (1) {
+ // printf("posPtr points to %c\n",*(char*)posPtr);
+ // if ((*(char*)posPtr) == '\n') {
+ // posPtr++;
+ // tmp_line = strncpy(tmp_line,posPtr,256);
+
+ // if (strcmp(tmp_line,"") == 0)
+ // break;
+
+ // sscanf(tmp_line,"%d\t%d\t",new_chr,new_pos);
+ // printf("new_pos %d\n",new_pos);
+ // if (new_pos >= currentGene->start) {
+ // linePtr = posPtr;
+ // break;
+ // }
+ // }
+ // posPtr++;
+ //}
+ //printf("Went out!\n");
+
while (*(char*)linePtr != '\n') linePtr++;
linePtr++;
readCtr += 1;
current_line = strncpy(current_line,linePtr,256);
+
continue;
}
} else { // read IS within gene borders
exon_label:
- prev_exon_stop = currentGene->exon_stops[exon_idx-1];
- cur_exon_start = currentGene->exon_starts[exon_idx];
- //printf("prev_exon_stop %d cur_exon_start %d pos %d\n",prev_exon_stop,cur_exon_start,pos);
- //printf("exon_idx %d\n",exon_idx);
if (exon_idx == currentGene->num_exons) {
gene_idx++;
currentGene = (*allGenes)[gene_idx];
continue;
}
+
+ prev_exon_stop = currentGene->exon_stops[exon_idx-1];
+ cur_exon_start = currentGene->exon_starts[exon_idx];
if (cur_exon_start - prev_exon_stop < 6 || cur_exon_start < pos ) { // go to next exon
exon_idx++;
- combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
+ if (ue != 0 && dov != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs);
+
+ if (uo != 0 && ds != 0)
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs);
+
ue = uo = ds = dov = 0;
goto exon_label;
}
- if ( pos < prev_exon_stop - read_size + 1 ) { // go to next read
+ if ( pos + (read_size-1) < prev_exon_stop ) // go to next read
goto next_read;
- }
// if this position is reached the read is somehow overlapping or
// exactly on the exon boundary. now determine where exactly:
-
if (pos + (read_size-1) == prev_exon_stop) { // read ends at previous exon end
upstream_end[ue] = linePtr;
ue++;
goto next_read;
}
- if (prev_exon_stop - pos >= 6 && prev_exon_stop - pos <= 30) { // read overlaps with previous exon boundary
+ if ( (prev_exon_stop - pos) >= 6 && (prev_exon_stop - pos) <= 30) { // read overlaps with previous exon boundary
upstream_overlap[uo] = linePtr;
uo++;
goto next_read;
}
- if (pos == cur_exon_start) { // read starts at current exon start
+ if ( pos == cur_exon_start ) { // read starts at current exon start
downstream_start[ds] = linePtr;
ds++;
goto next_read;
}
- if (cur_exon_start - pos >= 6 && cur_exon_start - pos <= 30 ) { // read overlaps with current exon boundary
+ if ( (cur_exon_start - pos) >= 6 && (cur_exon_start - pos) <= 30 ) { // read overlaps with current exon boundary
downstream_overlap[dov] = linePtr;
dov++;
goto next_read;
}
- goto next_read;
+ goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
}
}
}
void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, void** downstream, int down_size,FILE* out_fs) {
- //printf("up_/down_size %d %d\n",up_size,down_size);
-
- if (up_size == 0 || down_size == 0 || exon_stop == -1)
- return;
-
int up_idx, down_idx, status;
char* upstream_line = malloc(sizeof(char)*256);
char* downstream_line = malloc(sizeof(char)*256);
up_idx=0;
down_idx=0;
while(1) {
- //printf("up_/down_idx %d %d\n",up_idx,down_idx);
-
- if (up_idx == up_size || down_idx == down_size)
- break;
-
- if ( up_strand != down_strand )
+ if (up_idx == up_size || down_idx == down_size || up_strand != down_strand)
break;
strncpy(upstream_line,upstream[up_idx],256);
new_chastity[0] = '\0';
int fit;
- int w_size = 5;
+ int w_size = 6;
+ int overlap = 0;
if (up_pos+35 == exon_stop) { // merge with read which is downstream overlapping
- int overlap = exon_start - down_pos;
- //printf("overlap is %d\n",overlap);
- //printf("pos are: %d %d\n",up_pos,down_pos);
+ overlap = exon_start - down_pos;
fit = fitting(up_prb+(36-w_size),up_prb+36,down_prb+overlap,down_prb+overlap+w_size);
if (fit != 1)
strncat(new_cal_prb,down_cal_prb+overlap,36-overlap);
strncat(new_chastity,down_chastity+overlap,36-overlap);
+ //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos+35,down_pos, overlap);
+
} // merge with read which is upstream overlapping
if (down_pos == exon_start) {
- int overlap = up_pos+36 - exon_stop;
+ overlap = up_pos+36 - exon_stop;
//printf("overlap is %d\n",overlap);
//printf("pos are: %d %d\n",up_pos,down_pos);
fit = fitting(up_prb+36-overlap-w_size,up_prb+36-overlap,down_prb,down_prb+w_size);
strncat(new_prb,down_prb,overlap);
strncat(new_cal_prb,down_cal_prb,overlap);
strncat(new_chastity,down_chastity,overlap);
+
+ //printf("Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
}
- //printf("New entry!\n");
+ if ( !(up_pos+35 == exon_stop) && !(down_pos == exon_start) )
+ printf("ERROR: Between exon stop/start %d/%d : merging pos %d %d with overlap %d\n",exon_stop,exon_start,up_pos,down_pos, overlap);
+
fprintf(out_fs,"%d\t%c\t%s\t%d\t%s\t%s\t%s\n",
new_chr,new_strand,new_seq,read_size,new_prb,new_cal_prb,new_chastity);
}
int fitting(char* up_prb, char* up_prb_end, char* down_prb, char* down_prb_end) {
- double epsilon_mean = 10.0;
+ double epsilon_mean = 15.0;
double epsilon_var = 10.0;
double mean_up = 0;
double variance_up = 0;
mean_up /= w_size;
mean_down /= w_size;
- //printf("means: %f %f\n",mean_up,mean_down);
up_ptr = up_prb;
down_ptr = down_prb;
variance_up /= (w_size-1);
variance_down /= (w_size-1);
- //printf("variances: %f %f\n",variance_up,variance_down);
+ //printf("means: %f %f, variances: %f %f\n",mean_up,mean_down,variance_up,variance_down);
if ( abs(mean_up - mean_down) < epsilon_mean && abs(variance_up - variance_down) < epsilon_var )
return 1;