X-Git-Url: http://git.tuebingen.mpg.de/?p=adu.git;a=blobdiff_plain;f=bloom.c;fp=bloom.c;h=f7a9b1aabd32f89863fa09f4c0121942df1b23f9;hp=0000000000000000000000000000000000000000;hb=3e7ac6a3c698a28e9f0cd2b69b2a85f789524d96;hpb=bd0166ae6242deafc9b3436199965420517033b5 diff --git a/bloom.c b/bloom.c new file mode 100644 index 0000000..f7a9b1a --- /dev/null +++ b/bloom.c @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#include + +#include "adu.h" +#include "string.h" +#include "bloom.h" + +#ifdef TEST_BLOOM +void *adu_calloc(size_t size) +{ + void *ret = malloc(size); + memset(ret, 0, size); + return ret; +} +#undef INFO_LOG +#define INFO_LOG printf +#endif + +static inline uint64_t filter_bits(struct bloom *b) +{ + return 1ULL << b->order; +} + +/* + * SuperFastHash, by Paul Hsieh. + * http://www.azillionmonkeys.com/qed/hash.html + */ + +#if (defined(__GNUC__) && defined(__i386__)) +#define get16bits(d) (*((const uint16_t *) (d))) +#else +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + + +static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash) +{ + uint32_t tmp; + int rem = len & 3; + + len >>= 2; + + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: + hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: + hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: + hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; +} + +static int test_and_set_bit(uint64_t bitnum, struct bloom *b) +{ + uint64_t byte = bitnum / 8; + uint8_t bit = 1 << (bitnum % 8); + int ret = b->filter[byte] & bit; + + b->filter[byte] |= bit; + + if (!ret) + b->num_set_bits++; + return ret; +} + +int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b) +{ + int i, ret = 0; + uint32_t h = 0xb11924e1; /* some arbitrary value */ + + for (i = 1; i < b->num_hash_functions; i++) { + uint64_t bitnum = super_fast_hash(data, len, h); + h = bitnum; + if (b->order > 32) { + bitnum = bitnum << 32; + h = super_fast_hash(data, len, h); + bitnum |= h; + } + bitnum &= filter_bits(b) - 1; + ret |= !test_and_set_bit(bitnum, b); + } + if (ret) + b->num_entries++; + return !ret; +} + +int bloom_test_and_insert_string(const char *str, struct bloom *b) +{ + uint32_t len = strlen(str); + + return bloom_test_and_insert((const uint8_t *)str, len, b); +} + +void bloom_free(struct bloom *b) +{ + if (!b) + return; + free(b->filter); + free(b); +} + +int bloom_init(unsigned order, unsigned num_hash_functions, + struct bloom **result) +{ + struct bloom *b = adu_calloc(sizeof(*b)); + + INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n", + order, num_hash_functions); + b->order = order; + b->num_hash_functions = num_hash_functions; + b->filter = adu_calloc(filter_bits(b) / 8); + *result = b; + return 1; +} + +#ifdef TEST_BLOOM + +void add_stdin(struct bloom *b) +{ + char buf[255]; + unsigned false_positives = 0; + + while (fgets(buf, sizeof(buf) - 1, stdin)) { + size_t len = strlen(buf); + if (!len) + continue; + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + if (bloom_test_and_insert_string(buf, b)) + false_positives++; + } + INFO_LOG("filter contains %llu entries\n", b->num_entries); + printf("%u possible false positives\n", false_positives); +} + +int main(int argc, char **argv) +{ + struct bloom *b; + + if (argc != 3) { + printf("usage: $0 j k\n"); + printf("j: set bloom filter size to m=2^j bits\n"); + printf("k: # of hash functions to use\n"); + exit(1); + } + bloom_init(atoi(argv[1]), atoi(argv[2]), &b); + add_stdin(b); + INFO_LOG("%u%% of bits are set\n", (unsigned) + (b->num_set_bits * 100ULL / filter_bits(b))); + return 1; +} + +#endif /* TEST_BLOOM */