bloom: Add some source documentation.
[adu.git] / bloom.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <inttypes.h>
4 #include <string.h>
5 #include <assert.h>
6
7 #include "adu.h"
8 #include "string.h"
9 #include "bloom.h"
10
11 #ifdef TEST_BLOOM
12 void *adu_calloc(size_t size)
13 {
14 void *ret = malloc(size);
15 memset(ret, 0, size);
16 return ret;
17 }
18 #undef INFO_LOG
19 #define INFO_LOG printf
20 #endif
21
22 static inline uint64_t filter_bits(struct bloom *b)
23 {
24 return 1ULL << b->order;
25 }
26
27 /*
28 * SuperFastHash, by Paul Hsieh.
29 * http://www.azillionmonkeys.com/qed/hash.html
30 */
31
32 #if (defined(__GNUC__) && defined(__i386__))
33 #define get16bits(d) (*((const uint16_t *) (d)))
34 #else
35 #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
36 +(uint32_t)(((const uint8_t *)(d))[0]) )
37 #endif
38
39
40 static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash)
41 {
42 uint32_t tmp;
43 int rem = len & 3;
44
45 len >>= 2;
46
47 for (;len > 0; len--) {
48 hash += get16bits (data);
49 tmp = (get16bits (data+2) << 11) ^ hash;
50 hash = (hash << 16) ^ tmp;
51 data += 2*sizeof (uint16_t);
52 hash += hash >> 11;
53 }
54
55 /* Handle end cases */
56 switch (rem) {
57 case 3:
58 hash += get16bits (data);
59 hash ^= hash << 16;
60 hash ^= data[sizeof (uint16_t)] << 18;
61 hash += hash >> 11;
62 break;
63 case 2:
64 hash += get16bits (data);
65 hash ^= hash << 11;
66 hash += hash >> 17;
67 break;
68 case 1:
69 hash += *data;
70 hash ^= hash << 10;
71 hash += hash >> 1;
72 }
73 /* Force "avalanching" of final 127 bits */
74 hash ^= hash << 3;
75 hash += hash >> 5;
76 hash ^= hash << 4;
77 hash += hash >> 17;
78 hash ^= hash << 25;
79 hash += hash >> 6;
80 return hash;
81 }
82
83 static int test_and_set_bit(uint64_t bitnum, struct bloom *b)
84 {
85 uint64_t byte = bitnum / 8;
86 uint8_t bit = 1 << (bitnum % 8);
87 int ret = b->filter[byte] & bit;
88
89 b->filter[byte] |= bit;
90
91 if (!ret)
92 b->num_set_bits++;
93 return ret;
94 }
95
96 int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b)
97 {
98 int i, ret = 0;
99 uint32_t h = 0xb11924e1; /* some arbitrary value */
100
101 for (i = 1; i < b->num_hash_functions; i++) {
102 uint64_t bitnum = super_fast_hash(data, len, h);
103 h = bitnum;
104 if (b->order > 32) {
105 bitnum = bitnum << 32;
106 h = super_fast_hash(data, len, h);
107 bitnum |= h;
108 }
109 bitnum &= filter_bits(b) - 1;
110 ret |= !test_and_set_bit(bitnum, b);
111 }
112 if (ret)
113 b->num_entries++;
114 return !ret;
115 }
116
117 /**
118 * Deallocate a bloom filter.
119 *
120 * \param b The filter to deallocate.
121 */
122 void bloom_free(struct bloom *b)
123 {
124 if (!b)
125 return;
126 free(b->filter);
127 free(b);
128 }
129
130 /**
131 * Initialize a bloom filter.
132 *
133 * \param order Use a filter containing 2^order bits.
134 * \param num_hash_functions Set that many bits in the filter per entry.
135 */
136 int bloom_init(unsigned order, unsigned num_hash_functions,
137 struct bloom **result)
138 {
139 struct bloom *b = adu_calloc(sizeof(*b));
140
141 INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n",
142 order, num_hash_functions);
143 b->order = order;
144 b->num_hash_functions = num_hash_functions;
145 b->filter = adu_calloc(filter_bits(b) / 8);
146 *result = b;
147 return 1;
148 }
149
150 #ifdef TEST_BLOOM
151
152 int bloom_test_and_insert_string(const char *str, struct bloom *b)
153 {
154 uint32_t len = strlen(str);
155
156 return bloom_test_and_insert((const uint8_t *)str, len, b);
157 }
158
159 void add_stdin(struct bloom *b)
160 {
161 char buf[255];
162 unsigned false_positives = 0;
163
164 while (fgets(buf, sizeof(buf) - 1, stdin)) {
165 size_t len = strlen(buf);
166 if (!len)
167 continue;
168 if (buf[len - 1] == '\n')
169 buf[len - 1] = '\0';
170 if (bloom_test_and_insert_string(buf, b))
171 false_positives++;
172 }
173 INFO_LOG("filter contains %llu entries\n", b->num_entries);
174 printf("%u possible false positives\n", false_positives);
175 }
176
177 int main(int argc, char **argv)
178 {
179 struct bloom *b;
180
181 if (argc != 3) {
182 printf("usage: $0 j k\n");
183 printf("j: set bloom filter size to m=2^j bits\n");
184 printf("k: # of hash functions to use\n");
185 exit(1);
186 }
187 bloom_init(atoi(argv[1]), atoi(argv[2]), &b);
188 add_stdin(b);
189 INFO_LOG("%u%% of bits are set\n", (unsigned)
190 (b->num_set_bits * 100ULL / filter_bits(b)));
191 return 1;
192 }
193
194 #endif /* TEST_BLOOM */