From: Andre Noll Date: Sun, 25 May 2008 16:08:27 +0000 (+0200) Subject: Replace id table by hash table. X-Git-Tag: v0.0.2~42^2~6 X-Git-Url: http://git.tuebingen.mpg.de/?p=adu.git;a=commitdiff_plain;h=042c26596b371df0e63591cccce951e1d60887dd Replace id table by hash table. This should make uid lookups much faster. We use a power of two for the hash table size and open addressing with double hashing to handle hash collisions. --- diff --git a/adu.c b/adu.c index 03df3e1..a7ac6b4 100644 --- a/adu.c +++ b/adu.c @@ -138,58 +138,6 @@ static struct osl_table_description dir_table_desc = { .dir = "/tmp/adu" }; -/** The columns of the id table. */ -enum id_table_columns { - /** The user id. */ - IDT_UID, - /** The number of bytes of all regular files owned by this id. */ - IDT_BYTES, - /** The number of regular files owned by this id. */ - IDT_FILES, - /** The user table for this uid. */ - IDT_TABLE, - /** Number of columns in this table. */ - NUM_IDT_COLUMNS -}; - -static struct osl_column_description id_table_cols[] = { - [IDT_UID] = { - .storage_type = OSL_MAPPED_STORAGE, - .storage_flags = OSL_RBTREE | OSL_FIXED_SIZE | OSL_UNIQUE, - .name = "uid", - .compare_function = uint32_compare, - .data_size = sizeof(uint32_t) - }, - [IDT_BYTES] = { - .storage_type = OSL_MAPPED_STORAGE, - .storage_flags = OSL_RBTREE | OSL_FIXED_SIZE, - .compare_function = size_compare, - .name = "num_bytes", - .data_size = sizeof(uint64_t) - }, - [IDT_FILES] = { - .storage_type = OSL_MAPPED_STORAGE, - .storage_flags = OSL_RBTREE | OSL_FIXED_SIZE, - .compare_function = size_compare, - .name = "num_filess", - .data_size = sizeof(uint64_t) - }, - [IDT_TABLE] = { - .storage_type = OSL_NO_STORAGE, - .storage_flags = OSL_FIXED_SIZE | OSL_UNIQUE, - .name = "user_table", - .data_size = sizeof(void *) - } -}; - -static struct osl_table_description id_table_desc = { - .name = "id_table", - .num_columns = NUM_IDT_COLUMNS, - .flags = 0, - .column_descriptions = id_table_cols, - .dir = "/tmp/adu" -}; - /** The 
columns of the id table. */ enum user_table_columns { /** The numer of the directory. */ @@ -210,14 +158,14 @@ static struct osl_column_description user_table_cols[] = { .compare_function = uint32_compare, .data_size = sizeof(uint32_t) }, - [IDT_BYTES] = { + [UT_BYTES] = { .storage_type = OSL_MAPPED_STORAGE, .storage_flags = OSL_RBTREE | OSL_FIXED_SIZE, .compare_function = size_compare, .name = "num_bytes", .data_size = sizeof(uint64_t) }, - [IDT_FILES] = { + [UT_FILES] = { .storage_type = OSL_MAPPED_STORAGE, .storage_flags = OSL_RBTREE | OSL_FIXED_SIZE, .compare_function = size_compare, @@ -226,25 +174,7 @@ static struct osl_column_description user_table_cols[] = { }, }; -static struct osl_table_description user_table_desc = { - .num_columns = NUM_UT_COLUMNS, - .flags = 0, - .column_descriptions = user_table_cols, - .dir = "/tmp/adu" -}; static struct osl_table *dir_table; -static struct osl_table *id_table; - -static int create_tables(void) -{ - int ret = osl_create_table(&dir_table_desc); - if (ret < 0) - return ret; - ret = osl_create_table(&id_table_desc); - if (ret < 0) - return ret; - return 1; -} int add_directory(char *dirname, uint32_t dir_num, uint64_t *dir_size, uint64_t *dir_files) @@ -282,57 +212,64 @@ int create_and_open_user_table(uint32_t uid, struct osl_table **t) return osl_open_table(desc, t); } -static int insert_id_row(uint32_t uid, struct osl_table *t, struct osl_row **row) + +#define uid_hash_bits 8 +static uint32_t uid_hash_table_size = 1 << uid_hash_bits; +#define PRIME1 0x811c9dc5 +#define PRIME2 0x01000193 + +struct user_info { + uint32_t uid; + struct osl_table *table; + uint64_t files; + uint64_t bytes; +}; + +static struct user_info *uid_hash_table; + +static void create_hash_table(void) { - struct osl_object objects[NUM_IDT_COLUMNS]; - uint64_t num = 0; - - struct osl_table **table_ptr = para_malloc(sizeof(*table_ptr)); - *table_ptr = t; - - INFO_LOG("§§§§§§§§§§§§§§§§§§§§§ uid: %d, t: %p\n", uid, t); - objects[IDT_UID].data = &uid; - 
objects[IDT_UID].size = sizeof(uid); - objects[IDT_BYTES].data = &num; - objects[IDT_BYTES].size = sizeof(num); - objects[IDT_FILES].data = &num; - objects[IDT_FILES].size = sizeof(num); - objects[IDT_TABLE].data = table_ptr; - objects[IDT_TABLE].size = sizeof(*table_ptr); - return osl_add_and_get_row(id_table, objects, row); + uid_hash_table = para_calloc(uid_hash_table_size + * sizeof(struct user_info)); } -static int get_user_table(struct osl_row *row, struct osl_table **t) +static int create_tables(void) { - struct osl_object obj; - - int ret = osl_get_object(id_table, row, IDT_TABLE, &obj); - if (ret < 0) - return ret; - *t = *(struct osl_table **)obj.data; - INFO_LOG("^^^^^^^^^^^^^^^^^^ t: %p\n", *t); - return 1; + create_hash_table(); + return osl_create_table(&dir_table_desc); } -static int add_id_bytes(struct osl_row *row, uint64_t *add) + +static uint32_t double_hash(uint32_t uid, uint32_t probe_num) { - uint64_t num; - struct osl_object obj1, obj2 = {.data = &num, .size = sizeof(num)}; + return (uid * PRIME1 + ((uid * PRIME2) | 1) * probe_num) % uid_hash_table_size; +} - /* update number of bytes */ - int ret = osl_get_object(id_table, row, IDT_BYTES, &obj1); - if (ret < 0) - return ret; - num = *(uint64_t *)obj1.data + *add; - ret = osl_update_object(id_table, row, IDT_BYTES, &obj2); - if (ret < 0) - return ret; - /* increment number of files */ - ret = osl_get_object(id_table, row, IDT_FILES, &obj1); - if (ret < 0) - return ret; - num = *(uint64_t *)obj1.data + 1; - return osl_update_object(id_table, row, IDT_FILES, &obj2); +#define FOR_EACH_USER(ui) for (ui = uid_hash_table; ui < uid_hash_table \ + + uid_hash_table_size; ui++) + +static int search_uid(uint32_t uid, int insert, struct user_info **ui) +{ + uint32_t p; + + for (p = 0; p < uid_hash_table_size; p++) { + struct user_info *i = uid_hash_table + double_hash(uid, p); + if (!i->table) { + if (!insert) + return -E_BAD_UID; + int ret = create_and_open_user_table(uid, &i->table); + if (ret < 0) + return 
ret; + i->uid = uid; + *ui = i; + return 1; + } + if (i->uid != uid) + continue; + *ui = i; + return 0; + } + return insert? -E_HASH_TABLE_OVERFLOW : -E_BAD_UID; } static int update_user_row(struct osl_table *t, uint32_t dir_num, @@ -378,7 +315,9 @@ static int update_user_row(struct osl_table *t, uint32_t dir_num, } } -static uint32_t dir_num; +static uint32_t num_dirs; +static uint32_t num_files; +static uint64_t num_bytes; int scan_dir(char *dirname) { @@ -386,9 +325,9 @@ int scan_dir(char *dirname) struct dirent *entry; int ret, cwd_fd, ret2; uint64_t dir_size = 0, dir_files = 0; - struct osl_object obj; + uint32_t this_dir_num = num_dirs++; - INFO_LOG("----------------- %s\n", dirname); + DEBUG_LOG("----------------- %u: %s\n", num_dirs, dirname); ret = para_opendir(dirname, &dir, &cwd_fd); if (ret < 0) { if (ret != -ERRNO_TO_ERROR(EACCES)) @@ -402,15 +341,17 @@ int scan_dir(char *dirname) struct stat s; uint32_t uid; uint64_t size; - struct osl_row *id_row; - struct osl_table *user_table; + struct user_info *ui; if (!strcmp(entry->d_name, ".")) continue; if (!strcmp(entry->d_name, "..")) continue; - if (lstat(entry->d_name, &s) == -1) + if (lstat(entry->d_name, &s) == -1) { + WARNING_LOG("lstat error for %s/%s\n", dirname, + entry->d_name); continue; + } m = s.st_mode; if (!S_ISREG(m) && !S_ISDIR(m)) continue; @@ -425,36 +366,20 @@ int scan_dir(char *dirname) /* regular file */ size = s.st_size; dir_size += size; + num_bytes += size; dir_files++; + num_files++; uid = s.st_uid; - INFO_LOG("++++++++++++++++++++++++++ %s, uid: %u\n", entry->d_name, uid); - obj.data = &uid; - obj.size = sizeof(uid); - ret = osl_get_row(id_table, IDT_UID, &obj, &id_row); - if (ret < 0 && ret != -E_RB_KEY_NOT_FOUND) - goto out; - if (ret < 0) { - ret = create_and_open_user_table(uid, &user_table); - if (ret < 0) - goto out; - ret = insert_id_row(uid, user_table, &id_row); - if (ret < 0) - goto out; - } else { - ret = get_user_table(id_row, &user_table); - if (ret < 0) - goto out; - 
} - ret = add_id_bytes(id_row, &size); + ret = search_uid(uid, 1, &ui); if (ret < 0) goto out; - INFO_LOG("user_table: %p\n", user_table); - ret = update_user_row(user_table, dir_num, &size); - INFO_LOG("update_user ret: %d\n", ret); + ui->bytes += size; + ui->files++; + ret = update_user_row(ui->table, this_dir_num, &size); if (ret < 0) goto out; } - ret = add_directory(dirname, dir_num++, &dir_size, &dir_files); + ret = add_directory(dirname, this_dir_num, &dir_size, &dir_files); out: closedir(dir); ret2 = para_fchdir(cwd_fd); @@ -513,52 +438,39 @@ static int print_dirname_and_file_count(struct osl_row *row, void *data) return 1; } -static int print_id_stats(struct osl_row *row, __a_unused void *data) +static void print_id_stats(void) { - struct osl_object obj; - uint32_t uid; - uint64_t bytes, files; - int ret = osl_get_object(id_table, row, IDT_UID, &obj); + struct user_info *ui; - if (ret < 0) - return ret; - uid = *(uint32_t *)obj.data; - ret = osl_get_object(id_table, row, IDT_BYTES, &obj); - if (ret < 0) - return ret; - bytes = *(uint64_t *)obj.data; - ret = osl_get_object(id_table, row, IDT_FILES, &obj); - if (ret < 0) - return ret; - files = *(uint64_t *)obj.data; - - printf("%u\t%llu\t%llu\n", (unsigned)uid, (long long unsigned)files, - (long long unsigned)bytes); - return 1; + FOR_EACH_USER(ui) { + if (!ui->table) + continue; + printf("%u\t%llu\t%llu\n", (unsigned)ui->uid, (long long unsigned)ui->files, + (long long unsigned)ui->bytes); + } } -struct id_dir_stat_info { +struct big_dir_info { unsigned count; struct osl_table *user_table; }; static int print_big_dir(struct osl_row *row, void *data) { - struct id_dir_stat_info *info = data; - info->count++; + struct big_dir_info *bdi = data; int ret; struct osl_row *dir_row; char *dirname; uint64_t bytes; struct osl_object obj; - if (info->count > 10) + if (bdi->count++ > 10) return -E_LOOP_COMPLETE; - ret = osl_get_object(info->user_table, row, UT_BYTES, &obj); + ret = osl_get_object(bdi->user_table, row, 
UT_BYTES, &obj); if (ret < 0) return ret; bytes = *(uint64_t *)obj.data; - ret = osl_get_object(info->user_table, row, UT_DIR_NUM, &obj); + ret = osl_get_object(bdi->user_table, row, UT_DIR_NUM, &obj); if (ret < 0) return ret; ret = osl_get_row(dir_table, DT_NUM, &obj, &dir_row); @@ -572,25 +484,18 @@ static int print_big_dir(struct osl_row *row, void *data) return 1; } -static int print_id_dir_stats(struct osl_row *row, __a_unused void *data) +static void print_id_dir_stats(void) { - struct osl_object obj; - uint32_t uid; - int ret = osl_get_object(id_table, row, IDT_UID, &obj); - struct id_dir_stat_info info = {.count = 0}; - - if (ret < 0) - return ret; - uid = *(uint32_t *)obj.data; - - ret = osl_get_object(id_table, row, IDT_TABLE, &obj); - if (ret < 0) - return ret; - info.user_table = *(struct osl_table **)obj.data; + struct user_info *ui; - printf("************************* Big dirs owned by uid %u\n", (unsigned) uid); - osl_rbtree_loop_reverse(info.user_table, IDT_BYTES, &info, print_big_dir); - return 1; + FOR_EACH_USER(ui) { + struct big_dir_info bdi = {.count = 0}; + if (!ui->table) + continue; + bdi.user_table = ui->table; + printf("************************* Big dirs owned by uid %u\n", (unsigned) ui->uid); + osl_rbtree_loop_reverse(ui->table, UT_BYTES, &bdi, print_big_dir); + } } static int print_statistics(void) @@ -598,6 +503,8 @@ static int print_statistics(void) unsigned count = 0; int ret; + printf("Summary: %u dirs, %u files, %llu bytes\n", (unsigned)num_dirs, + (unsigned)num_files, (long long unsigned)num_bytes); printf("************************* Biggest dirs\n"); ret = osl_rbtree_loop_reverse(dir_table, DT_BYTES, &count, print_dirname_and_size); if (ret < 0 && ret != -E_LOOP_COMPLETE) @@ -609,11 +516,9 @@ static int print_statistics(void) return ret; printf("************************* dirs stats by owner\n"); - ret = osl_rbtree_loop(id_table, IDT_BYTES, NULL, print_id_stats); - if (ret < 0) - return ret; - - return osl_rbtree_loop(id_table, 
IDT_BYTES, NULL, print_id_dir_stats); + print_id_stats(); + print_id_dir_stats(); + return 1; } @@ -623,9 +528,6 @@ int main(int argc, char **argv) if (ret < 0) goto out; ret = osl_open_table(&dir_table_desc, &dir_table); - if (ret < 0) - goto out; - ret = osl_open_table(&id_table_desc, &id_table); if (ret < 0) goto out; ret = -E_SYNTAX; diff --git a/error.h b/error.h index 8603291..1067e7f 100644 --- a/error.h +++ b/error.h @@ -91,7 +91,9 @@ static inline char *error_txt(int num) _ERROR(EMPTY, "file empty") \ _ERROR(MMAP, "mmap error") \ _ERROR(SYNTAX, "syntax error") \ - _ERROR(LOOP_COMPLETE, "loop complete") + _ERROR(LOOP_COMPLETE, "loop complete") \ + _ERROR(HASH_TABLE_OVERFLOW, "hash table too small") \ + _ERROR(BAD_UID, "uid not found in hash table") /**