-objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o
+objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o bloom.o
all: adu
version := 0.0.5
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
* - Modes of operation: \ref select.c, \ref create.c, \ref interactive.c
* - User handling: \ref user.c
* - Error handling: \ref error.h
- * - Library-type functions: \ref fd.c, \ref format.c, \ref string.c, \ref portable_io.h
+ * - Library-type functions: \ref fd.c, \ref format.c, \ref string.c, \ref portable_io.h, \ref bloom.c
*/
#include "adu.h"
users. Decreasing the value causes adu to use slightly less memory.
"
+option "bloom-filter-order" B
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"use bloom filters for hard link detection"
+int typestr="order"
+dependon="create"
+default="23"
+optional
+details="
+ Allocate bloom filters of size 2^order bits. Each regular
+ file with hard link count greater than one is added to these
+ filters which allows to detect hard links on a per-user basis.
+ Greater values reduce the probability of false positives but
+ require more memory.
+
+ Values less than 10 deactivate this feature so that no hard
+ links are being detected.
+"
+
+option "num-bloom-filter-hash-functions" N
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"number of hash functions for the bloom filters"
+int typestr="num"
+dependon="create"
+default="10"
+optional
+details="
+ Cause each entry which is added to the bloom filter to set
+ \"num\" bits of the bloom filter.
+"
+
##############################
section "Options for --select"
##############################
--- /dev/null
+/*
+ * Copyright (C) 2008 Andre Noll <maan@systemlinux.org>
+ *
+ * Licensed under the GPL v2. For licencing details see COPYING.
+ */
+
+/** \file bloom.c Simple bloom filter implementation. */
+
+#include "adu.h"
+#include "string.h"
+#include "bloom.h"
+
+#ifdef TEST_BLOOM
+void *adu_calloc(size_t size)
+{
+ void *ret = malloc(size);
+
+ if (!ret)
+ exit(EXIT_FAILURE);
+ memset(ret, 0, size);
+ return ret;
+}
+#undef INFO_LOG
+#define INFO_LOG printf
+#endif
+
+static inline uint64_t filter_bits(struct bloom *b)
+{
+ return 1ULL << b->order;
+}
+
+/*
+ * SuperFastHash, by Paul Hsieh.
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+/** \cond */
+#if (defined(__GNUC__) && defined(__i386__))
+#define get16bits(d) (*((const uint16_t *) (d)))
+#else
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+/** \endcond */
+
+static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash)
+{
+ uint32_t tmp;
+ int rem = len & 3;
+
+ len >>= 2;
+
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (uint16_t);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases */
+ switch (rem) {
+ case 3:
+ hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= data[sizeof (uint16_t)] << 18;
+ hash += hash >> 11;
+ break;
+ case 2:
+ hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1:
+ hash += *data;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+ return hash;
+}
+
+static int test_and_set_bit(uint64_t bitnum, struct bloom *b)
+{
+ uint64_t byte = bitnum / 8;
+ uint8_t bit = 1 << (bitnum % 8);
+ int ret = b->filter[byte] & bit;
+
+ b->filter[byte] |= bit;
+
+ if (!ret)
+ b->num_set_bits++;
+ return ret;
+}
+
+/**
+ * Insert data to the given bloom filter.
+ *
+ * This function computes \a k hashes from the given data where \a k is the
+ * number of hash functions of the filter \a b. Each hash value corresponds to
+ * a position in the bit array of the filter and each of these bits are being
+ * tested and set. If not all \a k bits were already set, the given data was
+ * not yet contained in the filter and the function returns non-zero.
+ *
+ * Otherwise either (a) the same data has already been inserted previously or
+ * (b) a hash collision occurred or (c) the \a k bits are set due to previous
+ * insertion of other data (i.e. a false positive occurred). It is impossible
+ * to distinguish these cases.
+ *
+ * \param data The data to insert.
+ * \param len Number of bytes of \a data.
+ * \param b The filter to insert to.
+ *
+ * \return Zero if the entry was already contained in the filter (or in case of
+ * false positives), non-zero otherwise.
+ */
+int bloom_insert(const uint8_t *data, size_t len, struct bloom *b)
+{
+ int i, ret = 0;
+ uint32_t h = 0xb11924e1; /* some arbitrary value */
+
+ for (i = 1; i < b->num_hash_functions; i++) {
+ uint64_t bitnum = super_fast_hash(data, len, h);
+ h = bitnum;
+ if (b->order > 32) {
+ bitnum = bitnum << 32;
+ h = super_fast_hash(data, len, h);
+ bitnum |= h;
+ }
+ bitnum &= filter_bits(b) - 1;
+ ret |= !test_and_set_bit(bitnum, b);
+ }
+ if (ret)
+ b->num_entries++;
+ return !ret;
+}
+
+/**
+ * Deallocate a bloom filter.
+ *
+ * \param b The filter to deallocate.
+ */
+void bloom_free(struct bloom *b)
+{
+ if (!b)
+ return;
+ free(b->filter);
+ free(b);
+}
+
+/**
+ * Allocate and initialize a new bloom filter.
+ *
+ * \param order Use a filter containing 2^order bits.
+ * \param num_hash_functions Set that many bits in the filter per entry.
+ *
+ * \return This function either returns a freshly allocated and initialized
+ * bloom filter or does not return at all (if the underlying malloc() failed).
+ */
+struct bloom *bloom_new(unsigned order, unsigned num_hash_functions)
+{
+ struct bloom *b = adu_calloc(sizeof(*b));
+
+ INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n",
+ order, num_hash_functions);
+ b->order = order;
+ b->num_hash_functions = num_hash_functions;
+ b->filter = adu_calloc(filter_bits(b) / 8);
+ return b;
+}
+
+#ifdef TEST_BLOOM
+
+int bloom_insert_string(const char *str, struct bloom *b)
+{
+ uint32_t len = strlen(str);
+
+ return bloom_insert((const uint8_t *)str, len, b);
+}
+
+void add_stdin(struct bloom *b)
+{
+ char buf[255];
+ unsigned false_positives = 0;
+
+ while (fgets(buf, sizeof(buf) - 1, stdin)) {
+ size_t len = strlen(buf);
+ if (!len)
+ continue;
+ if (buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+ if (bloom_insert_string(buf, b))
+ false_positives++;
+ }
+ INFO_LOG("filter contains %llu entries\n", b->num_entries);
+ printf("%u possible false positives\n", false_positives);
+}
+
+int main(int argc, char **argv)
+{
+ struct bloom *b;
+
+ if (argc != 3) {
+ printf("usage: $0 j k\n");
+ printf("j: set bloom filter size to m=2^j bits\n");
+ printf("k: # of hash functions to use\n");
+ exit(1);
+ }
+ b = bloom_new(atoi(argv[1]), atoi(argv[2]));
+ add_stdin(b);
+ INFO_LOG("%u%% of bits are set\n", (unsigned)
+ (b->num_set_bits * 100ULL / filter_bits(b)));
+ return 1;
+}
+
+#endif /* TEST_BLOOM */
--- /dev/null
+/*
+ * Copyright (C) 2008 Andre Noll <maan@systemlinux.org>
+ *
+ * Licensed under the GPL v2. For licencing details see COPYING.
+ */
+
+/** \file bloom.h Struct bloom and bloom filter functions. */
+
+/** Describes one instance of a bloom filter. */
+struct bloom {
+ /** The bloom filter is of size 2^order bits. */
+ unsigned order;
+ /** Set that many bits in the filter per entry. */
+ unsigned num_hash_functions;
+ /** How many entries have been inserted so far. */
+ uint64_t num_entries;
+ /** Number of bits currently set. */
+ uint64_t num_set_bits;
+ /** The bit array. */
+ uint8_t *filter;
+};
+
+struct bloom *bloom_new(unsigned order, unsigned num_hash_functions);
+void bloom_free(struct bloom *b);
+int bloom_insert(const uint8_t *data, size_t len, struct bloom *b);
#include "string.h"
#include "error.h"
#include "user.h"
+#include "bloom.h"
/* Id of the device containing the base dir. */
static dev_t device_id;
+static struct bloom *global_bloom_filter;
+static struct bloom *user_bloom_filter;
+
+static int consider_bloom(struct stat64 *s)
+{
+ if (!global_bloom_filter)
+ return 0;
+ if (s->st_nlink <= 1)
+ return 0;
+ return 1;
+}
+
+/** Data size to hash for the global bloom filter. */
+#define GLOBAL_BLOOM_BUF_SIZE (sizeof(ino_t) + sizeof(dev_t) + sizeof(off_t))
+/** For the user bloom filter also the uid is being hashed. */
+#define USER_BLOOM_BUF_SIZE (GLOBAL_BLOOM_BUF_SIZE + sizeof(uid_t))
+
+static void make_bloom_buf(struct stat64 *s, uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+ uint8_t *p = buf;
+
+ if (!consider_bloom(s))
+ return;
+ memcpy(p, &s->st_ino, sizeof(ino_t));
+ p += sizeof(ino_t);
+ memcpy(p, &s->st_dev, sizeof(dev_t));
+ p += sizeof(dev_t);
+ memcpy(p, &s->st_size, sizeof(off_t));
+ p += sizeof(off_t);
+ memcpy(p, &s->st_uid, sizeof(uid_t));
+}
+
+static int insert_global_bloom(struct stat64 *s,
+ uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+ if (!consider_bloom(s))
+ return 0;
+ return bloom_insert(buf, GLOBAL_BLOOM_BUF_SIZE, global_bloom_filter);
+}
+
+static int insert_user_bloom(struct stat64 *s,
+ uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+ if (!consider_bloom(s))
+ return 0;
+ return bloom_insert(buf, USER_BLOOM_BUF_SIZE, user_bloom_filter);
+}
static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_num,
uint64_t *dir_size, uint64_t *dir_files)
}
static int update_user_row(struct osl_table *t, uint64_t dir_num,
- uint64_t *add)
+ uint64_t add)
{
struct osl_row *row;
struct osl_object obj = {.data = &dir_num, .size = sizeof(dir_num)};
objects[UT_DIR_NUM].data = &dir_num;
objects[UT_DIR_NUM].size = sizeof(dir_num);
- objects[UT_BYTES].data = add;
- objects[UT_BYTES].size = sizeof(*add);
+ objects[UT_BYTES].data = &add;
+ objects[UT_BYTES].size = sizeof(add);
objects[UT_FILES].data = &num_files;
objects[UT_FILES].size = sizeof(num_files);
ret = osl(osl_add_row(t, objects));
ret = osl(osl_get_object(t, row, UT_BYTES, &obj1));
if (ret < 0)
return ret;
- num = *(uint64_t *)obj1.data + *add;
+ num = *(uint64_t *)obj1.data + add;
ret = osl(osl_update_object(t, row, UT_BYTES, &obj2));
if (ret < 0)
return ret;
uint64_t dir_size = 0, dir_files = 0;
/* dir count. */
static uint64_t current_dir_num;
-
uint64_t this_dir_num = ++current_dir_num;
check_signals();
while ((entry = readdir(dir))) {
mode_t m;
struct stat64 s;
- uint32_t uid;
- uint64_t size;
struct user_info *ui;
+ uint8_t bloom_buf[USER_BLOOM_BUF_SIZE];
if (!strcmp(entry->d_name, "."))
continue;
if (S_ISDIR(m)) {
if (conf.one_file_system_given && s.st_dev != device_id)
continue;
+ dir_size += s.st_size;
+ dir_files++;
+ ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui);
+ if (ret < 0)
+ goto out;
+ ret = update_user_row(ui->table, this_dir_num,
+ s.st_size);
+ if (ret < 0)
+ goto out;
ret = scan_dir(entry->d_name, &this_dir_num);
if (ret < 0)
goto out;
continue;
}
+
/* regular file */
- size = s.st_size;
- dir_size += size;
+ make_bloom_buf(&s, bloom_buf);
+ if (insert_global_bloom(&s, bloom_buf))
+ DEBUG_LOG("global hard link: %s/%s\n", dirname,
+ entry->d_name);
+ else
+ dir_size += s.st_size;
dir_files++;
- uid = s.st_uid;
- ret = create_user_table(conf.database_dir_arg, uid, &ui);
+ ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui);
if (ret < 0)
goto out;
- ret = update_user_row(ui->table, this_dir_num, &size);
+ ret = insert_user_bloom(&s, bloom_buf);
+ if (ret)
+ DEBUG_LOG("hard link for uid %d: %s/%s\n",
+ (unsigned)s.st_uid, dirname, entry->d_name);
+ ret = update_user_row(ui->table, this_dir_num,
+ ret? 0 : s.st_size);
if (ret < 0)
goto out;
}
return ret;
}
+static void log_bloom_stat(struct bloom *b)
+{
+ unsigned percent;
+
+ NOTICE_LOG("\tfilter contains %llu entries\n",
+ (long long unsigned)b->num_entries);
+ percent = b->num_set_bits * 100ULL / (1ULL << b->order);
+ NOTICE_LOG("\t%u%% of bits are set\n", percent);
+ if (percent > 50) {
+ WARNING_LOG("results may be unreliable!\n");
+ WARNING_LOG("consider incrasing bllom filter size\n");
+ }
+}
+
+static void log_bloom_stats(void)
+{
+ struct bloom *b = global_bloom_filter;
+ if (!b)
+ return;
+ NOTICE_LOG("global bloom filter statistics:\n");
+ log_bloom_stat(b);
+ NOTICE_LOG("user bloom filter statistics:\n");
+ b = user_bloom_filter;
+ log_bloom_stat(b);
+}
+
/**
* The main function of the create mode.
*
int com_create(void)
{
uint64_t zero = 0ULL;
- int ret;
+ int ret, order = conf.bloom_filter_order_arg,
+ num = conf.num_bloom_filter_hash_functions_arg;
struct stat statbuf;
if (lstat(conf.base_dir_arg, &statbuf) == -1)
return -ERRNO_TO_ERROR(errno);
if (!S_ISDIR(statbuf.st_mode))
return -ERRNO_TO_ERROR(ENOTDIR);
+ if (order >= 10 && num > 0) {
+ global_bloom_filter = bloom_new(order, num);
+ user_bloom_filter = bloom_new(order, num);
+ } else
+ WARNING_LOG("hard link detection deactivated\n");
device_id = statbuf.st_dev;
create_hash_table(conf.hash_table_bits_arg);
ret = open_dir_table(1);
if (ret < 0)
- return ret;
+ goto out;
check_signals();
ret = scan_dir(conf.base_dir_arg, &zero);
if (ret < 0)
goto out;
ret = write_uid_file(conf.database_dir_arg);
+ log_bloom_stats();
out:
+ bloom_free(global_bloom_filter);
+ bloom_free(user_bloom_filter);
return ret;
}