First draft of hard link detection via bloom filters.
authorAndre Noll <maan@systemlinux.org>
Sun, 21 Dec 2008 22:47:28 +0000 (23:47 +0100)
committerAndre Noll <maan@systemlinux.org>
Sun, 21 Dec 2008 22:47:28 +0000 (23:47 +0100)
Makefile
adu.ggo
bloom.c [new file with mode: 0644]
bloom.h [new file with mode: 0644]
create.c

index 0cce3b9..4832666 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o
+objects := adu.o string.o cmdline.o fd.o select.o create.o interactive.o select.cmdline.o format.o user.o bloom.o
 all: adu
 version := 0.0.5
 
diff --git a/adu.ggo b/adu.ggo
index 2ecdea4..84187eb 100644 (file)
--- a/adu.ggo
+++ b/adu.ggo
@@ -138,6 +138,36 @@ details="
        users. Decreasing the value causes adu to use slightly less memory.
 "
 
+option "bloom-filter-order" B
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"use bloom filters for hard link detection"
+int typestr="order"
+dependon="create"
+default="23"
+optional
+details="
+       Allocate bloom filters of size 2^order bits. Each regular
+       file with hard link count greater than one is added to these
+       filters which allows to detect hard links on a per-user basis.
+       Greater values reduce the probability of false positives but
+       require more memory.
+
+       Values less than 10 deactivate this feature so that no hard
+       links are being detected.
+"
+
+option "num-bloom-filter-hash-functions" N
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"number of hash functions for the bloom filters"
+int typestr="num"
+dependon="create"
+default="10"
+optional
+details="
+       Cause each entry which is added to the bloom filter to set
+       \"num\" bits of the bloom filter.
+"
+
 ##############################
 section "Options for --select"
 ##############################
diff --git a/bloom.c b/bloom.c
new file mode 100644 (file)
index 0000000..f7a9b1a
--- /dev/null
+++ b/bloom.c
@@ -0,0 +1,183 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <string.h>
+#include <assert.h>
+
+#include "adu.h"
+#include "string.h"
+#include "bloom.h"
+
+#ifdef TEST_BLOOM
+void *adu_calloc(size_t size)
+{
+       void *ret = malloc(size);
+       memset(ret, 0, size);
+       return ret;
+}
+#undef INFO_LOG
+#define INFO_LOG printf
+#endif
+
+static inline uint64_t filter_bits(struct bloom *b)
+{
+       return 1ULL << b->order;
+}
+
+/*
+ * SuperFastHash, by Paul Hsieh.
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+#if (defined(__GNUC__) && defined(__i386__))
+#define get16bits(d) (*((const uint16_t *) (d)))
+#else
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+                       +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+
+static uint32_t super_fast_hash(const uint8_t *data, uint32_t len, uint32_t hash)
+{
+       uint32_t tmp;
+       int rem = len & 3;
+
+       len >>= 2;
+
+       for (;len > 0; len--) {
+               hash  += get16bits (data);
+               tmp    = (get16bits (data+2) << 11) ^ hash;
+               hash   = (hash << 16) ^ tmp;
+               data  += 2*sizeof (uint16_t);
+               hash  += hash >> 11;
+       }
+
+       /* Handle end cases */
+       switch (rem) {
+       case 3:
+               hash += get16bits (data);
+               hash ^= hash << 16;
+               hash ^= data[sizeof (uint16_t)] << 18;
+               hash += hash >> 11;
+               break;
+       case 2:
+               hash += get16bits (data);
+               hash ^= hash << 11;
+               hash += hash >> 17;
+               break;
+       case 1:
+               hash += *data;
+               hash ^= hash << 10;
+               hash += hash >> 1;
+       }
+       /* Force "avalanching" of final 127 bits */
+       hash ^= hash << 3;
+       hash += hash >> 5;
+       hash ^= hash << 4;
+       hash += hash >> 17;
+       hash ^= hash << 25;
+       hash += hash >> 6;
+       return hash;
+}
+
+static int test_and_set_bit(uint64_t bitnum, struct bloom *b)
+{
+       uint64_t byte = bitnum / 8;
+       uint8_t bit = 1 << (bitnum % 8);
+       int ret = b->filter[byte] & bit;
+
+       b->filter[byte] |= bit;
+
+       if (!ret)
+               b->num_set_bits++;
+       return ret;
+}
+
+int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b)
+{
+       int i, ret = 0;
+       uint32_t h = 0xb11924e1; /* some arbitrary value */
+
+       for (i = 1; i < b->num_hash_functions; i++) {
+               uint64_t bitnum = super_fast_hash(data, len, h);
+               h = bitnum;
+               if (b->order > 32) {
+                       bitnum = bitnum << 32;
+                       h = super_fast_hash(data, len, h);
+                       bitnum |= h;
+               }
+               bitnum &= filter_bits(b) - 1;
+               ret |= !test_and_set_bit(bitnum, b);
+       }
+       if (ret)
+               b->num_entries++;
+       return !ret;
+}
+
+int bloom_test_and_insert_string(const char *str, struct bloom *b)
+{
+       uint32_t len = strlen(str);
+
+       return bloom_test_and_insert((const uint8_t *)str, len, b);
+}
+
+void bloom_free(struct bloom *b)
+{
+       if (!b)
+               return;
+       free(b->filter);
+       free(b);
+}
+
+int bloom_init(unsigned order, unsigned num_hash_functions,
+               struct bloom **result)
+{
+       struct bloom *b = adu_calloc(sizeof(*b));
+
+       INFO_LOG("allocating bloom filter (order: %u, %u hash functions)\n",
+               order, num_hash_functions);
+       b->order = order;
+       b->num_hash_functions = num_hash_functions;
+       b->filter = adu_calloc(filter_bits(b) / 8);
+       *result = b;
+       return 1;
+}
+
+#ifdef TEST_BLOOM
+
+void add_stdin(struct bloom *b)
+{
+       char buf[255];
+       unsigned false_positives = 0;
+
+       while (fgets(buf, sizeof(buf) - 1, stdin)) {
+               size_t len = strlen(buf);
+               if (!len)
+                       continue;
+               if (buf[len - 1] == '\n')
+                       buf[len - 1] = '\0';
+               if (bloom_test_and_insert_string(buf, b))
+                       false_positives++;
+       }
+       INFO_LOG("filter contains %llu entries\n", b->num_entries);
+       printf("%u possible false positives\n", false_positives);
+}
+
+int main(int argc, char **argv)
+{
+       struct bloom *b;
+
+       if (argc != 3) {
+               printf("usage: $0 j k\n");
+               printf("j: set bloom filter size to m=2^j bits\n");
+               printf("k: # of hash functions to use\n");
+               exit(1);
+       }
+       bloom_init(atoi(argv[1]), atoi(argv[2]), &b);
+       add_stdin(b);
+       INFO_LOG("%u%% of bits are set\n", (unsigned)
+               (b->num_set_bits * 100ULL / filter_bits(b)));
+       return 1;
+}
+
+#endif /* TEST_BLOOM */
diff --git a/bloom.h b/bloom.h
new file mode 100644 (file)
index 0000000..afb2298
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,12 @@
+struct bloom {
+       unsigned order;
+       unsigned num_hash_functions;
+       uint64_t num_entries;
+       uint64_t num_set_bits;
+       uint8_t *filter;
+};
+
+int bloom_init(unsigned bloom_filter_order, unsigned num_hash_functions,
+               struct bloom **result);
+void bloom_free(struct bloom *b);
+int bloom_test_and_insert(const uint8_t *data, size_t len, struct bloom *b);
index 87b0b9a..1b4ba88 100644 (file)
--- a/create.c
+++ b/create.c
 #include "string.h"
 #include "error.h"
 #include "user.h"
+#include "bloom.h"
 
 /* Id of the device containing the base dir. */
 static dev_t device_id;
+static struct bloom *global_bloom_filter;
+static struct bloom *user_bloom_filter;
+
+static int consider_bloom(struct stat64 *s)
+{
+       if (!global_bloom_filter)
+               return 0;
+       if (s->st_nlink <= 1)
+               return 0;
+       return 1;
+}
+
+#define GLOBAL_BLOOM_BUF_SIZE (sizeof(ino_t) + sizeof(dev_t) + sizeof(off_t))
+#define USER_BLOOM_BUF_SIZE (GLOBAL_BLOOM_BUF_SIZE + sizeof(uid_t))
+
+static void make_bloom_buf(struct stat64 *s, uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+       uint8_t *p = buf;
+
+       if (!consider_bloom(s))
+               return;
+       memcpy(p, &s->st_ino, sizeof(ino_t));
+       p += sizeof(ino_t);
+       memcpy(p, &s->st_dev, sizeof(dev_t));
+       p += sizeof(dev_t);
+       memcpy(p, &s->st_size, sizeof(off_t));
+       p += sizeof(off_t);
+       memcpy(p, &s->st_uid, sizeof(uid_t));
+}
+
+static int insert_global_bloom(struct stat64 *s,
+               uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+       if (!consider_bloom(s))
+               return 0;
+       return bloom_test_and_insert(buf, GLOBAL_BLOOM_BUF_SIZE,
+               global_bloom_filter);
+}
+
+static int insert_user_bloom(struct stat64 *s,
+               uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+       if (!consider_bloom(s))
+               return 0;
+       return bloom_test_and_insert(buf, USER_BLOOM_BUF_SIZE,
+               user_bloom_filter);
+}
 
 static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_num,
                uint64_t *dir_size, uint64_t *dir_files)
@@ -39,7 +87,7 @@ static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_
 }
 
 static int update_user_row(struct osl_table *t, uint64_t dir_num,
-               uint64_t *add)
+               uint64_t add)
 {
        struct osl_row *row;
        struct osl_object obj = {.data = &dir_num, .size = sizeof(dir_num)};
@@ -54,8 +102,8 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num,
 
                objects[UT_DIR_NUM].data = &dir_num;
                objects[UT_DIR_NUM].size = sizeof(dir_num);
-               objects[UT_BYTES].data = add;
-               objects[UT_BYTES].size = sizeof(*add);
+               objects[UT_BYTES].data = &add;
+               objects[UT_BYTES].size = sizeof(add);
                objects[UT_FILES].data = &num_files;
                objects[UT_FILES].size = sizeof(num_files);
                ret = osl(osl_add_row(t, objects));
@@ -67,7 +115,7 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num,
                ret = osl(osl_get_object(t, row, UT_BYTES, &obj1));
                if (ret < 0)
                        return ret;
-               num = *(uint64_t *)obj1.data + *add;
+               num = *(uint64_t *)obj1.data + add;
                ret = osl(osl_update_object(t, row, UT_BYTES, &obj2));
                if (ret < 0)
                        return ret;
@@ -87,7 +135,6 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
        uint64_t dir_size = 0, dir_files = 0;
        /* dir count. */
        static uint64_t current_dir_num;
-
        uint64_t this_dir_num = ++current_dir_num;
 
        check_signals();
@@ -102,9 +149,8 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
        while ((entry = readdir(dir))) {
                mode_t m;
                struct stat64 s;
-               uint32_t uid;
-               uint64_t size;
                struct user_info *ui;
+               uint8_t bloom_buf[USER_BLOOM_BUF_SIZE];
 
                if (!strcmp(entry->d_name, "."))
                        continue;
@@ -121,20 +167,38 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
                if (S_ISDIR(m)) {
                        if (conf.one_file_system_given && s.st_dev != device_id)
                                continue;
+                       dir_size += s.st_size;
+                       dir_files++;
+                       ret = create_user_table(s.st_uid, &ui);
+                       if (ret < 0)
+                               goto out;
+                       ret = update_user_row(ui->table, this_dir_num,
+                               s.st_size);
+                       if (ret < 0)
+                               goto out;
                        ret = scan_dir(entry->d_name, &this_dir_num);
                        if (ret < 0)
                                goto out;
                        continue;
                }
+
                /* regular file */
-               size = s.st_size;
-               dir_size += size;
+               make_bloom_buf(&s, bloom_buf);
+               if (insert_global_bloom(&s, bloom_buf))
+                       DEBUG_LOG("global hard link: %s/%s\n", dirname,
+                               entry->d_name);
+               else
+                       dir_size += s.st_size;
                dir_files++;
-               uid = s.st_uid;
-               ret = create_user_table(uid, &ui);
+               ret = create_user_table(s.st_uid, &ui);
                if (ret < 0)
                        goto out;
-               ret = update_user_row(ui->table, this_dir_num, &size);
+               ret = insert_user_bloom(&s, bloom_buf);
+               if (ret)
+                       DEBUG_LOG("hard link for uid %d: %s/%s\n",
+                               (unsigned)s.st_uid, dirname, entry->d_name);
+               ret = update_user_row(ui->table, this_dir_num,
+                       ret? 0 : s.st_size);
                if (ret < 0)
                        goto out;
        }
@@ -149,6 +213,32 @@ out:
        return ret;
 }
 
+static void log_bloom_stat(struct bloom *b)
+{
+       unsigned percent;
+
+       NOTICE_LOG("\tfilter contains %llu entries\n",
+               (long long unsigned)b->num_entries);
+       percent = b->num_set_bits * 100ULL / (1ULL << b->order);
+       NOTICE_LOG("\t%u%% of bits are set\n", percent);
+       if (percent > 50) {
+               WARNING_LOG("results may be unreliable!\n");
+               WARNING_LOG("consider incrasing bllom filter size\n");
+       }
+}
+
+static void log_bloom_stats(void)
+{
+       struct bloom *b = global_bloom_filter;
+       if (!b)
+               return;
+       NOTICE_LOG("global bloom filter statistics:\n");
+       log_bloom_stat(b);
+       NOTICE_LOG("user bloom filter statistics:\n");
+       b = user_bloom_filter;
+       log_bloom_stat(b);
+}
+
 /**
  * The main function of the create mode.
  *
@@ -157,23 +247,32 @@ out:
 int com_create(void)
 {
        uint64_t zero = 0ULL;
-       int ret;
+       int ret, order = conf.bloom_filter_order_arg,
+               num = conf.num_bloom_filter_hash_functions_arg;
        struct stat statbuf;
 
        if (lstat(conf.base_dir_arg, &statbuf) == -1)
                return -ERRNO_TO_ERROR(errno);
        if (!S_ISDIR(statbuf.st_mode))
                return -ERRNO_TO_ERROR(ENOTDIR);
+       if (order >= 10 && num > 0) {
+               bloom_init(order, num, &global_bloom_filter);
+               bloom_init(order, num, &user_bloom_filter);
+       } else
+               WARNING_LOG("hard link detection deactivated\n");
        device_id = statbuf.st_dev;
        create_hash_table(conf.hash_table_bits_arg);
        ret = open_dir_table(1);
        if (ret < 0)
-               return ret;
+               goto out;
        check_signals();
        ret = scan_dir(conf.base_dir_arg, &zero);
        if (ret < 0)
                goto out;
        ret = write_uid_file();
+       log_bloom_stats();
 out:
+       bloom_free(global_bloom_filter);
+       bloom_free(user_bloom_filter);
        return ret;
 }