X-Git-Url: http://git.tuebingen.mpg.de/?p=adu.git;a=blobdiff_plain;f=bloom.c;h=a0e91d40f4c09282a8ba583d34190ba29a526479;hp=f7a9b1aabd32f89863fa09f4c0121942df1b23f9;hb=560d397a66ba141f18e13557feae78ca94a25f98;hpb=7231c544e2ee3f53f5b2c8bc393b7fd1e0b8d0a7 diff --git a/bloom.c b/bloom.c index f7a9b1a..a0e91d4 100644 --- a/bloom.c +++ b/bloom.c @@ -1,8 +1,10 @@ -#include -#include -#include -#include -#include +/* + * Copyright (C) 2008 Andre Noll + * + * Licensed under the GPL v2. For licencing details see COPYING. + */ + +/** \file bloom.c Simple bloom filter implementation. */ #include "adu.h" #include "string.h" @@ -12,6 +14,9 @@ void *adu_calloc(size_t size) { void *ret = malloc(size); + + if (!ret) + exit(EXIT_FAILURE); memset(ret, 0, size); return ret; } @@ -29,13 +34,14 @@ static inline uint64_t filter_bits(struct bloom *b) * http://www.azillionmonkeys.com/qed/hash.html */ +/** \cond */ #if (defined(__GNUC__) && defined(__i386__)) #define get16bits(d) (*((const uint16_t *) (d))) #else #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ +(uint32_t)(((const uint8_t *)(d))[0]) ) #endif - +/** \endcond */ static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash) { @@ -93,7 +99,28 @@ static int test_and_set_bit(uint64_t bitnum, struct bloom *b) return ret; } -int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b) +/** + * Insert data to the given bloom filter. + * + * This function computes \a k hashes from the given data where \a k is the + * number of hash functions of the filter \a b. Each hash value corresponds to + * a position in the bit array of the filter and each of these bits are being + * tested and set. If not all \a k bits were already set, the given data was + * not yet contained in the filter and the function returns non-zero. + * + * Otherwise either (a) the same data has already been inserted previously or + * (b) a hash collision occurred or (c) the \a k bits are set due to previous + * insertion of other data (i.e. a false positive occurred). It is impossible + * to distinguish these cases. + * + * \param data The data to insert. + * \param len Number of bytes of \a data. + * \param b The filter to insert to. + * + * \return Zero if the entry was already contained in the filter (or in case of + * false positives), non-zero otherwise. + */ +int bloom_insert(const uint8_t *data, size_t len, struct bloom *b) { int i, ret = 0; uint32_t h = 0xb11924e1; /* some arbitrary value */ @@ -114,13 +141,11 @@ int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b) return !ret; } -int bloom_test_and_insert_string(const char *str, struct bloom *b) -{ - uint32_t len = strlen(str); - - return bloom_test_and_insert((const uint8_t *)str, len, b); -} - +/** + * Deallocate a bloom filter. + * + * \param b The filter to deallocate. + */ void bloom_free(struct bloom *b) { if (!b) @@ -129,8 +154,16 @@ void bloom_free(struct bloom *b) free(b); } -int bloom_init(unsigned order, unsigned num_hash_functions, - struct bloom **result) +/** + * Allocate and initialize a new bloom filter. + * + * \param order Use a filter containing 2^order bits. + * \param num_hash_functions Set that many bits in the filter per entry. + * + * \return This function either returns a freshly allocated and initialized + * bloom filter or does not return at all (if the underlying malloc() failed). + */ +struct bloom *bloom_new(unsigned order, unsigned num_hash_functions) { struct bloom *b = adu_calloc(sizeof(*b)); @@ -139,12 +172,18 @@ int bloom_init(unsigned order, unsigned num_hash_functions, b->order = order; b->num_hash_functions = num_hash_functions; b->filter = adu_calloc(filter_bits(b) / 8); - *result = b; - return 1; + return b; } #ifdef TEST_BLOOM +int bloom_insert_string(const char *str, struct bloom *b) +{ + uint32_t len = strlen(str); + + return bloom_insert((const uint8_t *)str, len, b); +} + void add_stdin(struct bloom *b) { char buf[255]; @@ -156,7 +195,7 @@ void add_stdin(struct bloom *b) continue; if (buf[len - 1] == '\n') buf[len - 1] = '\0'; - if (bloom_test_and_insert_string(buf, b)) + if (bloom_insert_string(buf, b)) false_positives++; } INFO_LOG("filter contains %llu entries\n", b->num_entries); @@ -173,7 +212,7 @@ int main(int argc, char **argv) printf("k: # of hash functions to use\n"); exit(1); } - bloom_init(atoi(argv[1]), atoi(argv[2]), &b); + b = bloom_new(atoi(argv[1]), atoi(argv[2])); add_stdin(b); INFO_LOG("%u%% of bits are set\n", (unsigned) (b->num_set_bits * 100ULL / filter_bits(b)));