From: Andre Noll Date: Thu, 25 Dec 2008 11:23:10 +0000 (+0100) Subject: Merge branch 'master' into bloom X-Git-Tag: v0.1.0~14^2~2 X-Git-Url: http://git.tuebingen.mpg.de/?a=commitdiff_plain;h=25e4fb919ce9b87aeb5aa3456b687c5c3ba55a9e;hp=c7d5ddb6906ab8a18181fee29ac78d604db5fdb9;p=adu.git Merge branch 'master' into bloom Conflicts: create.c --- diff --git a/Makefile b/Makefile index a4293c7..a040478 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o +objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o bloom.o all: adu version := 0.0.5 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') diff --git a/adu.c b/adu.c index 82c876a..b227589 100644 --- a/adu.c +++ b/adu.c @@ -12,7 +12,7 @@ * - Modes of operation: \ref select.c, \ref create.c, \ref interactive.c * - User handling: \ref user.c * - Error handling: \ref error.h - * - Library-type functions: \ref fd.c, \ref format.c, \ref string.c, \ref portable_io.h + * - Library-type functions: \ref fd.c, \ref format.c, \ref string.c, \ref portable_io.h, \ref bloom.c */ #include "adu.h" diff --git a/adu.ggo b/adu.ggo index 5da5908..a180f1f 100644 --- a/adu.ggo +++ b/adu.ggo @@ -126,6 +126,36 @@ details=" users. Decreasing the value causes adu to use slightly less memory. " +option "bloom-filter-order" B +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +"use bloom filters for hard link detection" +int typestr="order" +dependon="create" +default="23" +optional +details=" + Allocate bloom filters of size 2^order bits. Each regular + file with hard link count greater than one is added to these + filters which allows to detect hard links on a per-user basis. + Greater values reduce the probability of false positives but + require more memory. + + Values less than 10 deactivate this feature so that no hard + links are being detected. +" + +option "num-bloom-filter-hash-functions" N +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +"number of hash functions for the bloom filters" +int typestr="num" +dependon="create" +default="10" +optional +details=" + Cause each entry which is added to the bloom filter to set + \"num\" bits of the bloom filter. +" + ############################## section "Options for --select" ############################## diff --git a/bloom.c b/bloom.c new file mode 100644 index 0000000..ec12d51 --- /dev/null +++ b/bloom.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2008 Andre Noll + * + * Licensed under the GPL v2. For licencing details see COPYING. + */ + +/** \file bloom.c Simple bloom filter implementation. */ + +#include "adu.h" +#include "string.h" +#include "bloom.h" + +#ifdef TEST_BLOOM +void *adu_calloc(size_t size) +{ + void *ret = malloc(size); + + if (!ret) + exit(EXIT_FAILURE); + memset(ret, 0, size); + return ret; +} +#undef INFO_LOG +#define INFO_LOG printf +#endif + +static inline uint64_t filter_bits(struct bloom *b) +{ + return 1ULL << b->order; +} + +/* + * SuperFastHash, by Paul Hsieh. + * http://www.azillionmonkeys.com/qed/hash.html + */ + +/** \cond */ +#if (defined(__GNUC__) && defined(__i386__)) +#define get16bits(d) (*((const uint16_t *) (d))) +#else +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +/** \endcond */ + +static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash) +{ + uint32_t tmp; + int rem = len & 3; + + len >>= 2; + + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: + hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: + hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: + hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + return hash; +} + +static int test_and_set_bit(uint64_t bitnum, struct bloom *b) +{ + uint64_t byte = bitnum / 8; + uint8_t bit = 1 << (bitnum % 8); + int ret = b->filter[byte] & bit; + + b->filter[byte] |= bit; + + if (!ret) + b->num_set_bits++; + return ret; +} + +/** + * Insert data to the given bloom filter. + * + * This function computes \a k hashes from the given data where \a k is the + * number of hash functions of the filter \a b. Each hash value corresponds to + * a position in the bit array of the filter and each of these bits are being + * tested and set. If not all \a k bits were already set, the given data was + * not yet contained in the filter and the function returns non-zero. + * + * Otherwise either (a) the same data has already been inserted previously or + * (b) a hash collision occurred or (c) the \a k bits are set due to previous + * insertion of other data (i.e. a false positive occurred). It is impossible + * to distinguish these cases. + * + * \param data The data to insert. + * \param len Number of bytes of \a data. + * \param b The filter to insert to. + * + * \return Zero if the entry was already contained in the filter (or in case of + * false positives), non-zero otherwise. + */ +int bloom_insert(const uint8_t *data, size_t len, struct bloom *b) +{ + int i, ret = 0; + uint32_t h = 0xb11924e1; /* some arbitrary value */ + + for (i = 1; i < b->num_hash_functions; i++) { + uint64_t bitnum = super_fast_hash(data, len, h); + h = bitnum; + if (b->order > 32) { + bitnum = bitnum << 32; + h = super_fast_hash(data, len, h); + bitnum |= h; + } + bitnum &= filter_bits(b) - 1; + ret |= !test_and_set_bit(bitnum, b); + } + if (ret) + b->num_entries++; + return !ret; +} + +/** + * Deallocate a bloom filter. + * + * \param b The filter to deallocate. + */ +void bloom_free(struct bloom *b) +{ + if (!b) + return; + free(b->filter); + free(b); +} + +/** + * Allocate and initialize a new bloom filter. + * + * \param order Use a filter containing 2^order bits. + * \param num_hash_functions Set that many bits in the filter per entry. + * + * \return This function either returns a freshly allocated and initialized + * bloom filter or does not return at all (if the underlying malloc() failed). + */ +struct bloom *bloom_new(unsigned order, unsigned num_hash_functions) +{ + struct bloom *b = adu_calloc(sizeof(*b)); + + INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n", + order, num_hash_functions); + b->order = order; + b->num_hash_functions = num_hash_functions; + b->filter = adu_calloc(filter_bits(b) / 8); + return b; +} + +#ifdef TEST_BLOOM + +int bloom_insert_string(const char *str, struct bloom *b) +{ + uint32_t len = strlen(str); + + return bloom_insert((const uint8_t *)str, len, b); +} + +void add_stdin(struct bloom *b) +{ + char buf[255]; + unsigned false_positives = 0; + + while (fgets(buf, sizeof(buf) - 1, stdin)) { + size_t len = strlen(buf); + if (!len) + continue; + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + if (bloom_insert_string(buf, b)) + false_positives++; + } + INFO_LOG("filter contains %llu entries\n", b->num_entries); + printf("%u possible false positives\n", false_positives); +} + +int main(int argc, char **argv) +{ + struct bloom *b; + + if (argc != 3) { + printf("usage: $0 j k\n"); + printf("j: set bloom filter size to m=2^j bits\n"); + printf("k: # of hash functions to use\n"); + exit(1); + } + b = bloom_new(atoi(argv[1]), atoi(argv[2])); + add_stdin(b); + INFO_LOG("%u%% of bits are set\n", (unsigned) + (b->num_set_bits * 100ULL / filter_bits(b))); + return 1; +} + +#endif /* TEST_BLOOM */ diff --git a/bloom.h b/bloom.h new file mode 100644 index 0000000..9e71c6c --- /dev/null +++ b/bloom.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Andre Noll + * + * Licensed under the GPL v2. For licencing details see COPYING. + */ + +/** \file bloom.h Struct bloom and bloom filter functions. */ + +/** Describes one instance of a bloom filter. */ +struct bloom { + /** The bloom filter is of size 2^order bits. */ + unsigned order; + /** Set that many bits in the filter per entry. */ + unsigned num_hash_functions; + /** How many entries have been inserted so far. */ + uint64_t num_entries; + /** Number of bits currently set. */ + uint64_t num_set_bits; + /** The bit array. */ + uint8_t *filter; +}; + +struct bloom *bloom_new(unsigned order, unsigned num_hash_functions); +void bloom_free(struct bloom *b); +int bloom_insert(const uint8_t *data, size_t len, struct bloom *b); diff --git a/create.c b/create.c index 2925d57..c4d7195 100644 --- a/create.c +++ b/create.c @@ -15,9 +15,57 @@ #include "string.h" #include "error.h" #include "user.h" +#include "bloom.h" /* Id of the device containing the base dir. */ static dev_t device_id; +static struct bloom *global_bloom_filter; +static struct bloom *user_bloom_filter; + +static int consider_bloom(struct stat64 *s) +{ + if (!global_bloom_filter) + return 0; + if (s->st_nlink <= 1) + return 0; + return 1; +} + +/** Data size to hash for the global bloom filter. */ +#define GLOBAL_BLOOM_BUF_SIZE (sizeof(ino_t) + sizeof(dev_t) + sizeof(off_t)) +/** For the user bloom filter also the uid is being hashed. */ +#define USER_BLOOM_BUF_SIZE (GLOBAL_BLOOM_BUF_SIZE + sizeof(uid_t)) + +static void make_bloom_buf(struct stat64 *s, uint8_t buf[USER_BLOOM_BUF_SIZE]) +{ + uint8_t *p = buf; + + if (!consider_bloom(s)) + return; + memcpy(p, &s->st_ino, sizeof(ino_t)); + p += sizeof(ino_t); + memcpy(p, &s->st_dev, sizeof(dev_t)); + p += sizeof(dev_t); + memcpy(p, &s->st_size, sizeof(off_t)); + p += sizeof(off_t); + memcpy(p, &s->st_uid, sizeof(uid_t)); +} + +static int insert_global_bloom(struct stat64 *s, + uint8_t buf[USER_BLOOM_BUF_SIZE]) +{ + if (!consider_bloom(s)) + return 0; + return bloom_insert(buf, GLOBAL_BLOOM_BUF_SIZE, global_bloom_filter); +} + +static int insert_user_bloom(struct stat64 *s, + uint8_t buf[USER_BLOOM_BUF_SIZE]) +{ + if (!consider_bloom(s)) + return 0; + return bloom_insert(buf, USER_BLOOM_BUF_SIZE, user_bloom_filter); +} static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_num, uint64_t *dir_size, uint64_t *dir_files) @@ -39,7 +87,7 @@ static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_ } static int update_user_row(struct osl_table *t, uint64_t dir_num, - uint64_t *add) + uint64_t add) { struct osl_row *row; struct osl_object obj = {.data = &dir_num, .size = sizeof(dir_num)}; @@ -54,8 +102,8 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num, objects[UT_DIR_NUM].data = &dir_num; objects[UT_DIR_NUM].size = sizeof(dir_num); - objects[UT_BYTES].data = add; - objects[UT_BYTES].size = sizeof(*add); + objects[UT_BYTES].data = &add; + objects[UT_BYTES].size = sizeof(add); objects[UT_FILES].data = &num_files; objects[UT_FILES].size = sizeof(num_files); ret = osl(osl_add_row(t, objects)); @@ -67,7 +115,7 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num, ret = osl(osl_get_object(t, row, UT_BYTES, &obj1)); if (ret < 0) return ret; - num = *(uint64_t *)obj1.data + *add; + num = *(uint64_t *)obj1.data + add; ret = osl(osl_update_object(t, row, UT_BYTES, &obj2)); if (ret < 0) return ret; @@ -87,7 +135,6 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num) uint64_t dir_size = 0, dir_files = 0; /* dir count. */ static uint64_t current_dir_num; - uint64_t this_dir_num = ++current_dir_num; check_signals(); @@ -102,9 +149,8 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num) while ((entry = readdir(dir))) { mode_t m; struct stat64 s; - uint32_t uid; - uint64_t size; struct user_info *ui; + uint8_t bloom_buf[USER_BLOOM_BUF_SIZE]; if (!strcmp(entry->d_name, ".")) continue; @@ -121,20 +167,38 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num) if (S_ISDIR(m)) { if (conf.one_file_system_given && s.st_dev != device_id) continue; + dir_size += s.st_size; + dir_files++; + ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui); + if (ret < 0) + goto out; + ret = update_user_row(ui->table, this_dir_num, + s.st_size); + if (ret < 0) + goto out; ret = scan_dir(entry->d_name, &this_dir_num); if (ret < 0) goto out; continue; } + /* regular file */ - size = s.st_size; - dir_size += size; + make_bloom_buf(&s, bloom_buf); + if (insert_global_bloom(&s, bloom_buf)) + DEBUG_LOG("global hard link: %s/%s\n", dirname, + entry->d_name); + else + dir_size += s.st_size; dir_files++; - uid = s.st_uid; - ret = create_user_table(conf.database_dir_arg, uid, &ui); + ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui); if (ret < 0) goto out; - ret = update_user_row(ui->table, this_dir_num, &size); + ret = insert_user_bloom(&s, bloom_buf); + if (ret) + DEBUG_LOG("hard link for uid %d: %s/%s\n", + (unsigned)s.st_uid, dirname, entry->d_name); + ret = update_user_row(ui->table, this_dir_num, + ret? 0 : s.st_size); if (ret < 0) goto out; } @@ -149,6 +213,32 @@ out: return ret; } +static void log_bloom_stat(struct bloom *b) +{ + unsigned percent; + + NOTICE_LOG("\tfilter contains %llu entries\n", + (long long unsigned)b->num_entries); + percent = b->num_set_bits * 100ULL / (1ULL << b->order); + NOTICE_LOG("\t%u%% of bits are set\n", percent); + if (percent > 50) { + WARNING_LOG("results may be unreliable!\n"); + WARNING_LOG("consider incrasing bllom filter size\n"); + } +} + +static void log_bloom_stats(void) +{ + struct bloom *b = global_bloom_filter; + if (!b) + return; + NOTICE_LOG("global bloom filter statistics:\n"); + log_bloom_stat(b); + NOTICE_LOG("user bloom filter statistics:\n"); + b = user_bloom_filter; + log_bloom_stat(b); +} + /** * The main function of the create mode. * @@ -157,23 +247,32 @@ out: int com_create(void) { uint64_t zero = 0ULL; - int ret; + int ret, order = conf.bloom_filter_order_arg, + num = conf.num_bloom_filter_hash_functions_arg; struct stat statbuf; if (lstat(conf.base_dir_arg, &statbuf) == -1) return -ERRNO_TO_ERROR(errno); if (!S_ISDIR(statbuf.st_mode)) return -ERRNO_TO_ERROR(ENOTDIR); + if (order >= 10 && num > 0) { + global_bloom_filter = bloom_new(order, num); + user_bloom_filter = bloom_new(order, num); + } else + WARNING_LOG("hard link detection deactivated\n"); device_id = statbuf.st_dev; create_hash_table(conf.hash_table_bits_arg); ret = open_dir_table(1); if (ret < 0) - return ret; + goto out; check_signals(); ret = scan_dir(conf.base_dir_arg, &zero); if (ret < 0) goto out; ret = write_uid_file(conf.database_dir_arg); + log_bloom_stats(); out: + bloom_free(global_bloom_filter); + bloom_free(user_bloom_filter); return ret; }