2 * Copyright (C) 2008 Andre Noll <maan@systemlinux.org>
4 * Licensed under the GPL v2. For licencing details see COPYING.
7 /** \file bloom.c Simple bloom filter implementation. */
20 void *adu_calloc(size_t size)
22 void *ret = malloc(size);
30 #define INFO_LOG printf
33 static inline uint64_t filter_bits(struct bloom *b)
35 return 1ULL << b->order;
39 * SuperFastHash, by Paul Hsieh.
40 * http://www.azillionmonkeys.com/qed/hash.html
44 #if (defined(__GNUC__) && defined(__i386__))
45 #define get16bits(d) (*((const uint16_t *) (d)))
47 #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
48 +(uint32_t)(((const uint8_t *)(d))[0]) )
52 static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash)
59 for (;len > 0; len--) {
60 hash += get16bits (data);
61 tmp = (get16bits (data+2) << 11) ^ hash;
62 hash = (hash << 16) ^ tmp;
63 data += 2*sizeof (uint16_t);
67 /* Handle end cases */
70 hash += get16bits (data);
72 hash ^= data[sizeof (uint16_t)] << 18;
76 hash += get16bits (data);
85 /* Force "avalanching" of final 127 bits */
95 static int test_and_set_bit(uint64_t bitnum, struct bloom *b)
97 uint64_t byte = bitnum / 8;
98 uint8_t bit = 1 << (bitnum % 8);
99 int ret = b->filter[byte] & bit;
101 b->filter[byte] |= bit;
109 * Insert data to the given bloom filter.
111 * This function computes \a k hashes from the given data where \a k is the
112 * number of hash functions of the filter \a b. Each hash value corresponds to
113 * a position in the bit array of the filter and each of these bits are being
114 * tested and set. If not all \a k bits were already set, the given data was
115 * not yet contained in the filter and the function returns non-zero.
117 * Otherwise either (a) the same data has already been inserted previously or
118 * (b) a hash collision occurred or (c) the \a k bits are set due to previous
119 * insertion of other data (i.e. a false positive occurred). It is impossible
120 * to distinguish these cases.
122 * \param data The data to insert.
123 * \param len Number of bytes of \a data.
124 * \param b The filter to insert to.
126 * \return Zero if the entry was already contained in the filter (or in case of
127 * false positives), non-zero otherwise.
129 int bloom_insert(const uint8_t *data, size_t len, struct bloom *b)
132 uint32_t h = 0xb11924e1; /* some arbitrary value */
134 for (i = 1; i < b->num_hash_functions; i++) {
135 uint64_t bitnum = super_fast_hash(data, len, h);
138 bitnum = bitnum << 32;
139 h = super_fast_hash(data, len, h);
142 bitnum &= filter_bits(b) - 1;
143 ret |= !test_and_set_bit(bitnum, b);
151 * Deallocate a bloom filter.
153 * \param b The filter to deallocate.
155 void bloom_free(struct bloom *b)
164 * Allocate and initialize a new bloom filter.
166 * \param order Use a filter containing 2^order bits.
167 * \param num_hash_functions Set that many bits in the filter per entry.
169 * \return This function either returns a freshly allocated and initialized
170 * bloom filter or does not return at all (if the underlying malloc() failed).
172 struct bloom *bloom_new(unsigned order, unsigned num_hash_functions)
174 struct bloom *b = adu_calloc(sizeof(*b));
176 INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n",
177 order, num_hash_functions);
179 b->num_hash_functions = num_hash_functions;
180 b->filter = adu_calloc(filter_bits(b) / 8);
186 int bloom_insert_string(const char *str, struct bloom *b)
188 uint32_t len = strlen(str);
190 return bloom_insert((const uint8_t *)str, len, b);
193 void add_stdin(struct bloom *b)
196 unsigned false_positives = 0;
198 while (fgets(buf, sizeof(buf) - 1, stdin)) {
199 size_t len = strlen(buf);
202 if (buf[len - 1] == '\n')
204 if (bloom_insert_string(buf, b))
207 INFO_LOG("filter contains %llu entries\n", b->num_entries);
208 printf("%u possible false positives\n", false_positives);
211 int main(int argc, char **argv)
216 printf("usage: $0 j k\n");
217 printf("j: set bloom filter size to m=2^j bits\n");
218 printf("k: # of hash functions to use\n");
221 b = bloom_new(atoi(argv[1]), atoi(argv[2]));
223 INFO_LOG("%u%% of bits are set\n", (unsigned)
224 (b->num_set_bits * 100ULL / filter_bits(b)));
228 #endif /* TEST_BLOOM */