Fix typo.
[adu.git] / create.c
index 5c243d8f507a92efc93b2464b53de5ac5af732fe..3023e29b291814478a8466265edb497dd9307759 100644 (file)
--- a/create.c
+++ b/create.c
@@ -4,49 +4,67 @@
  * Licensed under the GPL v2. For licencing details see COPYING.
  */
 
-/** \file create.c The create mode of adu. */
+/** \file create.c \brief The create mode of adu. */
 
 #include <dirent.h> /* readdir() */
+#include "format.h"
 #include "adu.h"
 #include "gcc-compat.h"
 #include "cmdline.h"
 #include "fd.h"
 #include "string.h"
 #include "error.h"
-#include "portable_io.h"
+#include "user.h"
+#include "bloom.h"
 
 /* Id of the device containing the base dir. */
 static dev_t device_id;
+static struct bloom *global_bloom_filter;
+static struct bloom *user_bloom_filter;
 
-static int write_uid(struct user_info *ui, void *data)
+static int consider_bloom(struct stat64 *s)
 {
-       char **p = data;
-
-       write_u32(*p, ui->uid);
-       *p += sizeof(uint32_t);
+       if (!global_bloom_filter)
+               return 0;
+       if (s->st_nlink <= 1)
+               return 0;
        return 1;
 }
 
-static int write_uid_list(void)
+/** Data size to hash for the global bloom filter. */
+#define GLOBAL_BLOOM_BUF_SIZE (sizeof(ino_t) + sizeof(dev_t) + sizeof(off_t))
+/** The user bloom filter additionally hashes the uid. */
+#define USER_BLOOM_BUF_SIZE (GLOBAL_BLOOM_BUF_SIZE + sizeof(uid_t))
+
+static void make_bloom_buf(struct stat64 *s, uint8_t buf[USER_BLOOM_BUF_SIZE])
 {
-       char *buf, *p, *filename;
-       uint32_t count = 0;
-       struct user_info *ui;
-       size_t size = num_uids * sizeof(uint32_t);
-       int ret;
+       uint8_t *p = buf;
 
-       if (!num_uids)
+       if (!consider_bloom(s))
+               return;
+       memcpy(p, &s->st_ino, sizeof(ino_t));
+       p += sizeof(ino_t);
+       memcpy(p, &s->st_dev, sizeof(dev_t));
+       p += sizeof(dev_t);
+       memcpy(p, &s->st_size, sizeof(off_t));
+       p += sizeof(off_t);
+       memcpy(p, &s->st_uid, sizeof(uid_t));
+}
+
+static int insert_global_bloom(struct stat64 *s,
+               uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+       if (!consider_bloom(s))
                return 0;
-       buf = p = adu_malloc(size);
-       ret = for_each_admissible_user(write_uid, &p);
-       if (ret < 0)
-               goto out;
-       filename = get_uid_list_name();
-       ret = adu_write_file(filename, buf, size);
-       free(filename);
-out:
-       free(buf);
-       return ret;
+       return bloom_insert(buf, GLOBAL_BLOOM_BUF_SIZE, global_bloom_filter);
+}
+
+static int insert_user_bloom(struct stat64 *s,
+               uint8_t buf[USER_BLOOM_BUF_SIZE])
+{
+       if (!consider_bloom(s))
+               return 0;
+       return bloom_insert(buf, USER_BLOOM_BUF_SIZE, user_bloom_filter);
 }
 
 static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_num,
@@ -69,7 +87,7 @@ static int add_directory(char *dirname, uint64_t *dir_num, uint64_t *parent_dir_
 }
 
 static int update_user_row(struct osl_table *t, uint64_t dir_num,
-               uint64_t *add)
+               uint64_t add)
 {
        struct osl_row *row;
        struct osl_object obj = {.data = &dir_num, .size = sizeof(dir_num)};
@@ -84,13 +102,11 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num,
 
                objects[UT_DIR_NUM].data = &dir_num;
                objects[UT_DIR_NUM].size = sizeof(dir_num);
-               objects[UT_BYTES].data = add;
-               objects[UT_BYTES].size = sizeof(*add);
+               objects[UT_BYTES].data = &add;
+               objects[UT_BYTES].size = sizeof(add);
                objects[UT_FILES].data = &num_files;
                objects[UT_FILES].size = sizeof(num_files);
-               INFO_LOG("######################### ret: %d\n", ret);
                ret = osl(osl_add_row(t, objects));
-               INFO_LOG("######################### ret: %d\n", ret);
                return ret;
        } else { /* add size and increment file count */
                uint64_t num;
@@ -99,7 +115,7 @@ static int update_user_row(struct osl_table *t, uint64_t dir_num,
                ret = osl(osl_get_object(t, row, UT_BYTES, &obj1));
                if (ret < 0)
                        return ret;
-               num = *(uint64_t *)obj1.data + *add;
+               num = *(uint64_t *)obj1.data + add;
                ret = osl(osl_update_object(t, row, UT_BYTES, &obj2));
                if (ret < 0)
                        return ret;
@@ -119,7 +135,6 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
        uint64_t dir_size = 0, dir_files = 0;
        /* dir count. */
        static uint64_t current_dir_num;
-
        uint64_t this_dir_num = ++current_dir_num;
 
        check_signals();
@@ -133,17 +148,16 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
        }
        while ((entry = readdir(dir))) {
                mode_t m;
-               struct stat s;
-               uint32_t uid;
-               uint64_t size;
+               struct stat64 s;
                struct user_info *ui;
+               uint8_t bloom_buf[USER_BLOOM_BUF_SIZE];
 
                if (!strcmp(entry->d_name, "."))
                        continue;
                if (!strcmp(entry->d_name, ".."))
                        continue;
-               if (lstat(entry->d_name, &s) == -1) {
-                       WARNING_LOG("lstat error for %s/%s (%s)\n",
+               if (lstat64(entry->d_name, &s) == -1) {
+                       WARNING_LOG("lstat64 error for %s/%s (%s)\n",
                                dirname, entry->d_name, strerror(errno));
                        continue;
                }
@@ -153,22 +167,38 @@ static int scan_dir(char *dirname, uint64_t *parent_dir_num)
                if (S_ISDIR(m)) {
                        if (conf.one_file_system_given && s.st_dev != device_id)
                                continue;
+                       dir_size += s.st_size;
+                       dir_files++;
+                       ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui);
+                       if (ret < 0)
+                               goto out;
+                       ret = update_user_row(ui->table, this_dir_num,
+                               s.st_size);
+                       if (ret < 0)
+                               goto out;
                        ret = scan_dir(entry->d_name, &this_dir_num);
                        if (ret < 0)
                                goto out;
                        continue;
                }
+
                /* regular file */
-               size = s.st_size;
-               dir_size += size;
+               make_bloom_buf(&s, bloom_buf);
+               if (insert_global_bloom(&s, bloom_buf))
+                       DEBUG_LOG("global hard link: %s/%s\n", dirname,
+                               entry->d_name);
+               else
+                       dir_size += s.st_size;
                dir_files++;
-               uid = s.st_uid;
-               ret = search_uid(uid, CREATE_USER_TABLE | OPEN_USER_TABLE, &ui);
+               ret = create_user_table(conf.database_dir_arg, s.st_uid, &ui);
                if (ret < 0)
                        goto out;
-               ui->bytes += size;
-               ui->files++;
-               ret = update_user_row(ui->table, this_dir_num, &size);
+               ret = insert_user_bloom(&s, bloom_buf);
+               if (ret)
+                       DEBUG_LOG("hard link for uid %d: %s/%s\n",
+                               (unsigned)s.st_uid, dirname, entry->d_name);
+               ret = update_user_row(ui->table, this_dir_num,
+                       ret? 0 : s.st_size);
                if (ret < 0)
                        goto out;
        }
@@ -183,27 +213,66 @@ out:
        return ret;
 }
 
-int com_create()
+/** Log the entry count and the percentage of set bits of one bloom filter. */
+static void log_bloom_stat(struct bloom *b)
+{
+       unsigned percent = b->num_set_bits * 100ULL / (1ULL << b->order);
+
+       NOTICE_LOG("\tfilter contains %llu entries\n",
+               (long long unsigned)b->num_entries);
+       NOTICE_LOG("\t%u%% of bits are set\n", percent);
+       if (percent > 50) { /* over half of the bits set: warn the user */
+               WARNING_LOG("results may be unreliable!\n");
+               WARNING_LOG("consider increasing bloom filter size\n");
+       }
+}
+
+/** Log statistics of both bloom filters, if the global filter exists. */
+static void log_bloom_stats(void)
+{
+       if (!global_bloom_filter)
+               return;
+
+       NOTICE_LOG("global bloom filter statistics:\n");
+       log_bloom_stat(global_bloom_filter);
+       NOTICE_LOG("user bloom filter statistics:\n");
+       log_bloom_stat(user_bloom_filter);
+}
+
+/**
+ * The main function of the create mode.
+ *
+ * \return Standard.
+ */
+int com_create(void)
 {
        uint64_t zero = 0ULL;
-       int ret;
+       int ret, order = conf.bloom_filter_order_arg,
+               num = conf.num_bloom_filter_hash_functions_arg;
        struct stat statbuf;
 
        if (lstat(conf.base_dir_arg, &statbuf) == -1)
                return -ERRNO_TO_ERROR(errno);
        if (!S_ISDIR(statbuf.st_mode))
                return -ERRNO_TO_ERROR(ENOTDIR);
+       if (order >= 10 && num > 0) {
+               global_bloom_filter = bloom_new(order, num);
+               user_bloom_filter = bloom_new(order, num);
+       } else
+               WARNING_LOG("hard link detection deactivated\n");
        device_id = statbuf.st_dev;
        create_hash_table(conf.hash_table_bits_arg);
        ret = open_dir_table(1);
        if (ret < 0)
-               return ret;
+               goto out;
        check_signals();
        ret = scan_dir(conf.base_dir_arg, &zero);
        if (ret < 0)
                goto out;
-       ret = write_uid_list();
+       ret = write_uid_file(conf.database_dir_arg);
+       log_bloom_stats();
 out:
-       close_all_tables();
+       bloom_free(global_bloom_filter);
+       bloom_free(user_bloom_filter);
        return ret;
 }