Rename bloom_test_and_insert() to bloom_insert().
[adu.git] / bloom.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <inttypes.h>
4 #include <string.h>
5 #include <assert.h>
6
7 #include "adu.h"
8 #include "string.h"
9 #include "bloom.h"
10
11 #ifdef TEST_BLOOM
12 void *adu_calloc(size_t size)
13 {
14 void *ret = malloc(size);
15
16 if (!ret)
17 exit(EXIT_FAILURE);
18 memset(ret, 0, size);
19 return ret;
20 }
21 #undef INFO_LOG
22 #define INFO_LOG printf
23 #endif
24
25 static inline uint64_t filter_bits(struct bloom *b)
26 {
27 return 1ULL << b->order;
28 }
29
30 /*
31 * SuperFastHash, by Paul Hsieh.
32 * http://www.azillionmonkeys.com/qed/hash.html
33 */
34
35 #if (defined(__GNUC__) && defined(__i386__))
36 #define get16bits(d) (*((const uint16_t *) (d)))
37 #else
38 #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
39 +(uint32_t)(((const uint8_t *)(d))[0]) )
40 #endif
41
42
43 static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash)
44 {
45 uint32_t tmp;
46 int rem = len & 3;
47
48 len >>= 2;
49
50 for (;len > 0; len--) {
51 hash += get16bits (data);
52 tmp = (get16bits (data+2) << 11) ^ hash;
53 hash = (hash << 16) ^ tmp;
54 data += 2*sizeof (uint16_t);
55 hash += hash >> 11;
56 }
57
58 /* Handle end cases */
59 switch (rem) {
60 case 3:
61 hash += get16bits (data);
62 hash ^= hash << 16;
63 hash ^= data[sizeof (uint16_t)] << 18;
64 hash += hash >> 11;
65 break;
66 case 2:
67 hash += get16bits (data);
68 hash ^= hash << 11;
69 hash += hash >> 17;
70 break;
71 case 1:
72 hash += *data;
73 hash ^= hash << 10;
74 hash += hash >> 1;
75 }
76 /* Force "avalanching" of final 127 bits */
77 hash ^= hash << 3;
78 hash += hash >> 5;
79 hash ^= hash << 4;
80 hash += hash >> 17;
81 hash ^= hash << 25;
82 hash += hash >> 6;
83 return hash;
84 }
85
86 static int test_and_set_bit(uint64_t bitnum, struct bloom *b)
87 {
88 uint64_t byte = bitnum / 8;
89 uint8_t bit = 1 << (bitnum % 8);
90 int ret = b->filter[byte] & bit;
91
92 b->filter[byte] |= bit;
93
94 if (!ret)
95 b->num_set_bits++;
96 return ret;
97 }
98
99 int bloom_insert(const uint8_t *data, size_t len, struct bloom *b)
100 {
101 int i, ret = 0;
102 uint32_t h = 0xb11924e1; /* some arbitrary value */
103
104 for (i = 1; i < b->num_hash_functions; i++) {
105 uint64_t bitnum = super_fast_hash(data, len, h);
106 h = bitnum;
107 if (b->order > 32) {
108 bitnum = bitnum << 32;
109 h = super_fast_hash(data, len, h);
110 bitnum |= h;
111 }
112 bitnum &= filter_bits(b) - 1;
113 ret |= !test_and_set_bit(bitnum, b);
114 }
115 if (ret)
116 b->num_entries++;
117 return !ret;
118 }
119
120 /**
121 * Deallocate a bloom filter.
122 *
123 * \param b The filter to deallocate.
124 */
125 void bloom_free(struct bloom *b)
126 {
127 if (!b)
128 return;
129 free(b->filter);
130 free(b);
131 }
132
133 /**
134 * Allocate and initialize a new bloom filter.
135 *
136 * \param order Use a filter containing 2^order bits.
137 * \param num_hash_functions Set that many bits in the filter per entry.
138 *
139 * \return This function either returns a freshly allocated and initialized
140 * bloom filter or does not return at all (if the underlying malloc() failed).
141 */
142 struct bloom *bloom_new(unsigned order, unsigned num_hash_functions)
143 {
144 struct bloom *b = adu_calloc(sizeof(*b));
145
146 INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n",
147 order, num_hash_functions);
148 b->order = order;
149 b->num_hash_functions = num_hash_functions;
150 b->filter = adu_calloc(filter_bits(b) / 8);
151 return b;
152 }
153
154 #ifdef TEST_BLOOM
155
156 int bloom_insert_string(const char *str, struct bloom *b)
157 {
158 uint32_t len = strlen(str);
159
160 return bloom_insert((const uint8_t *)str, len, b);
161 }
162
163 void add_stdin(struct bloom *b)
164 {
165 char buf[255];
166 unsigned false_positives = 0;
167
168 while (fgets(buf, sizeof(buf) - 1, stdin)) {
169 size_t len = strlen(buf);
170 if (!len)
171 continue;
172 if (buf[len - 1] == '\n')
173 buf[len - 1] = '\0';
174 if (bloom_insert_string(buf, b))
175 false_positives++;
176 }
177 INFO_LOG("filter contains %llu entries\n", b->num_entries);
178 printf("%u possible false positives\n", false_positives);
179 }
180
181 int main(int argc, char **argv)
182 {
183 struct bloom *b;
184
185 if (argc != 3) {
186 printf("usage: $0 j k\n");
187 printf("j: set bloom filter size to m=2^j bits\n");
188 printf("k: # of hash functions to use\n");
189 exit(1);
190 }
191 b = bloom_new(atoi(argv[1]), atoi(argv[2]));
192 add_stdin(b);
193 INFO_LOG("%u%% of bits are set\n", (unsigned)
194 (b->num_set_bits * 100ULL / filter_bits(b)));
195 return 1;
196 }
197
198 #endif /* TEST_BLOOM */