+ added license text
[qpalma.git] / ParaParser / CParaParser.cpp
1 #include "CParaParser.h"
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <sys/mman.h>
5 #include <sys/stat.h>
6
7 /*
8 * The constructor needs the format string to be used with sscanf and the names
9 * of the respective fields for the dictionary.
10 *
11 */
12
13 ParaParser::ParaParser(const char* fmt, const char** fields) {
14 size_t buf_size = 512;
15 format_string = malloc(sizeof(char)*buf_size);
16 if (strlen(fmt) > buf_size)
17 perror("format string to long!");
18
19 strncpy(format_string,fmt,strlen(fmt));
20 }
21
22 /*
23 *
24 */
25 int create_read_from_line(Read* newRead, const char* current_line) {
26 //printf("current line is %s\n",current_line);
27
28 int entries_found = sscanf(current_line,line_format,&(newRead->id),
29 &(newRead->chr),&(newRead->strand),newRead->seq,&(newRead->splitpos),&(newRead->size),
30 newRead->prb,newRead->cal_prb,newRead->chastity,newRead->gene_id,&(newRead->p_start),
31 &(newRead->exon_stop),&(newRead->exon_start),&(newRead->p_stop),&(newRead->true_cut));
32
33 if (entries_found != 15) {
34 return -1;
35 }
36
37 // make sequence lowercase but don't destroy brackets
38 Py_ssize_t idx;
39 for(idx=0;idx<strlen(newRead->seq);idx++) {
40 if ( 65 <= newRead->seq[idx] && newRead->seq[idx] < 85)
41 newRead->seq[idx] = newRead->seq[idx]+32;
42 }
43
44 if ( newRead->strand == 'D' )
45 newRead->strand = '+';
46
47 if ( newRead->strand == 'P' )
48 newRead->strand = '-';
49
50 return 0;
51 }
52
53
54 ParaParser::parseFile(const char* reads_filename) {
55
56 // first define some constant strings
57
58 FILE *reads_fs = fopen(reads_filename,"r");
59
60 if(reads_fs == NULL) {
61 printf("Error: Could not open file: %s",reads_filename);
62 exit(EXIT_FAILURE);
63 }
64
65 int reads_fid = fileno(reads_fs);
66 struct stat reads_stat;
67 if ( fstat(reads_fid,&reads_stat) == -1) {
68 perror("fstat");
69 exit(EXIT_FAILURE);
70 }
71
72 off_t reads_filesize = reads_stat.st_size;
73 //printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
74
75 // ATTENTION this is an overestimator of the reads in the file
76 // it is NOT the exact number
77 int num_init_reads = reads_filesize / 200.0;
78
79 read_array = malloc(sizeof(Read*)*num_init_reads);
80 id_map = malloc(sizeof(unsigned long)*num_init_reads);
81
82 //printf("Found %d reads.",numReads);
83
84 // try to acquire file using mmap
85 void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
86 if (reads_area == MAP_FAILED) {
87 perror("mmap");
88 exit(EXIT_FAILURE);
89 }
90
91 close(reads_fid);
92 printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
93
94 char* lineBeginPtr = (char*) reads_area;
95 char* lineEndPtr = (char*) reads_area;
96 char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
97
98 while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
99 lineEndPtr++;
100
101 char* current_line = malloc(sizeof(char)*512);
102 memset(current_line,0,512);
103
104 unsigned long line_size = lineEndPtr - lineBeginPtr;
105 strncpy(current_line,lineBeginPtr,line_size);
106 current_line[line_size] = '\0';
107
108 int readCtr = 0;
109 int status = 0;
110
111 num_reads = 0;
112 map_idx = 0;
113
114 while(1) {
115 if (strcmp(current_line,"") == 0)
116 break;
117
118 status = set_item_from_line(current_line);
119 if (status != 0 )
120 printf("Error while parsing line (status=%d).",status);
121
122 lineBeginPtr = lineEndPtr;
123 while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
124 lineEndPtr++;
125
126 current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
127 current_line[lineEndPtr-lineBeginPtr] = '\0';
128
129 readCtr += 1;
130 }
131
132 // clean up
133 status = munmap(reads_area,reads_filesize);
134 if(status != 0)
135 perror("munmap");
136
137 free(current_line);
138
139 return PyInt_FromLong(num_reads);
140 fetchEntry(int id);
141
142 ~ParaParser();
143
144
145