1 /* SPDX-License-Identifier: GPL-3.0-only */
4 * Simple linear hashing implementation
6 * Linear hashing is a hash table algorithm by Witold Litwin (1980)
7 * which grows the table on demand without rehashing existing objects.
9 * This implementation employs a conventional hash function C whose
10 * output length is assumed to be greater than the order of the table,
11 * i.e. the base-2 logarithm of the number of slots. Hash collisions
12 * are resolved by chaining.
14 * The hash table consists in fact of two tables which are consecutive
15 * in memory. Each table represents N = 1 << n many slots of the
16 * table. Example (n = 2, N = 4):
21 * In what follows, we refer to the two tables as the lower table and
24 * C gives rise to a sequence of hash functions C_k via C_k(key) :=
25 * lower k bits of C(key). In terms of C_k the lower and higher hash
26 * functions, h and H, are defined by h = C_n and H = C_{n+1}. Hence
27 * h(key) always corresponds to a slot in the lower table while H(key)
28 * corresponds to a slot in either table. Both functions are trivial to
29 * compute given C(key).
31 * The algorithm also maintains a split position s which corresponds to
32 * a slot of the lower table and starts out at zero.
37 * The linear hash function L is defined by L(key) = H(key) if h(key)
38 * < s, and L(key) = h(key) otherwise. In other words, L(key) is either
39 * the lower or the higher hash value, depending on the relation between
40 * the lower hash value and s. The higher hash function applies for all
41 * keys whose lower hash value is smaller than s. Initially we have L = h.
43 * On insertion, if the table load is large, the chain of entries which
44 * hash to s is split by transitioning from h to H.
46 * For an existing key, we have either H(key) = h(key) or H(key) =
47 * h(key) + 1 << n. That is, the most significant bit of H(key) tells
48 * whether this entry must be moved from its slot in the low table to the
49 * corresponding slot in the high table. Approximately half the entries
50 * will be moved this way. To reflect the fact that from now on L(key)
51 * = H(key) for all keys with h(key) = s, s is increased by one.
53 * Each time s reaches 1 << n, the upper bound of the lower table, we
54 * have L = H. In this case the hash table size is doubled, and the split
55 * position is reset to zero. Moreover, h and H are redefined as h :=
56 * C_{n+1} (i.e., the new h becomes the former H), and H := C_{n+2}. This
57 * means the table order n has increased by one, and a new round begins.
59 * This implementation includes an API for iterating over all entries
60 * in the hash table which avoids callback functions. Instead an opaque
61 * iterator structure is provided together with a set of functions that
62 * create or operate on such a structure.
64 * While shrinking a linear hash table can performed essentially
65 * by reversing the steps for growing described above, this is not
66 * implemented since it is not needed for tfortune.
72 struct linhash_item item;
77 struct linhash_table {
78 struct lh_slot **slots;
80 uint32_t num_items; /* how many items have been inserted so far. */
81 uint32_t split_position;
84 struct linhash_iterator {
85 /* these are used regardless of whether comp is NULL */
86 struct linhash_table *t;
88 /* comp == NULL means: iterate in hash order */
89 linhash_comparator *comp;
90 /* only used if comp == NULL */
91 struct lh_slot *head, *prev, *next;
92 /* only used if comp != NULL */
93 struct linhash_item **items;
97 static uint32_t lh_num_slots_low(const struct linhash_table *t)
99 return (uint32_t)1 << t->order;
102 static uint32_t lh_num_slots_high(const struct linhash_table *t)
104 return lh_num_slots_low(t) * 2;
107 static uint32_t lh_index_low(uint32_t hash, const struct linhash_table *t)
109 return hash % lh_num_slots_low(t);
112 static uint32_t lh_index_high(uint32_t hash, const struct linhash_table *t)
114 return hash % lh_num_slots_high(t);
117 static bool lh_must_grow(const struct linhash_table *t)
119 return t->num_items >= lh_num_slots_high(t) / 2;
122 /* The simple DJB (Daniel J Bernstein) hash is good enough here. */
123 static uint32_t lh_hash(const char *data)
126 const unsigned char *c = (typeof(c))data;
134 * Create a new table for linear hashing.
136 * The order argument determines the initial number of slots: it is given by 2
137 * << order. The hash table grows on demand, and the point of linear hashing is
138 * the ability to cheaply grow the table, so specifying anything greater than
139 * zero is probably not necessary.
141 * The functtion returns an opaque pointer which serves as the handle to the
142 * newly created hash table. Most functions of the linhash API take such a
143 * pointer to know the table to operate on. This function either succeeds or
144 * terminates, it never returns NULL.
146 struct linhash_table *linhash_new(uint32_t order)
148 struct linhash_table *t;
151 DEBUG_LOG("creating order %u hash table\n", order);
152 t = xmalloc(sizeof(*t));
155 t->split_position = 0;
156 ns = lh_num_slots_high(t);
157 t->slots = xcalloc(ns * sizeof(*t->slots));
161 static uint32_t lh_index(uint32_t hash, const struct linhash_table *t)
163 uint32_t low = lh_index_low(hash, t);
165 if (low >= t->split_position)
167 return lh_index_high(hash, t);
170 static void lh_split_slot(struct linhash_table *t)
172 struct lh_slot *prev, *next, *s;
173 uint32_t sp = t->split_position;
175 DEBUG_LOG("splitting slot %u\n", sp);
176 for (prev = NULL, s = t->slots[sp]; s;) {
177 uint32_t idx = lh_index_high(s->hash, t);
183 DEBUG_LOG("moving %s up from slot %u to slot %u\n",
184 s->item.key, sp, idx);
185 if (t->slots[sp] == s)
186 t->slots[sp] = s->next;
188 prev->next = s->next;
190 s->next = t->slots[idx];
194 DEBUG_LOG("slot %u has been split\n", sp);
196 DEBUG_LOG("slot #%u has become empty\n", sp);
199 static void lh_grow_table(struct linhash_table *t)
203 DEBUG_LOG("growing hash table to order %u\n", ++t->order);
204 ns = lh_num_slots_high(t);
205 t->slots = xrealloc(t->slots, ns * sizeof(*t->slots));
206 idx = lh_num_slots_low(t);
207 memset(t->slots + idx, 0, (ns - idx) * sizeof(*t->slots));
210 static struct lh_slot *lh_lookup(const char *key,
211 const struct linhash_table *t, uint32_t *hashp, uint32_t *idxp,
212 struct lh_slot **prevp)
214 struct lh_slot *s, *prev;
220 idx = lh_index(hash, t);
221 //DEBUG_LOG("key %s, hash: %u, idx: %u\n", key, hash, idx);
226 for (s = t->slots[idx], prev = NULL; s; prev = s, s = s->next) {
227 //DEBUG_LOG("comparing %s vs. %s\n", key, s->item.key);
228 if (strcmp(s->item.key, key))
241 * Find the entry identified by a key.
243 * \param key The key to look up.
244 * \param t Where to look up the key.
246 struct linhash_item *linhash_lookup(const char *key,
247 const struct linhash_table *t)
249 struct lh_slot *s = lh_lookup(key, t, NULL, NULL, NULL);
256 static void *lh_remove(struct lh_slot *s, struct lh_slot *prev, uint32_t idx, struct linhash_table *t)
263 obj = s->item.object;
265 prev->next = s->next;
267 t->slots[idx] = s->next;
272 /* Note: This renders all existing iterators stale. */
273 void *linhash_remove(const char *key, struct linhash_table *t)
276 struct lh_slot *prev, *s = lh_lookup(key, t, NULL, &idx, &prev);
278 return lh_remove(s, prev, idx, t);
281 static void *lh_iterator_remove_current(struct linhash_iterator *iter)
288 obj = lh_remove(iter->head, iter->prev, iter->idx, iter->t);
289 iter->head = iter->prev;
293 static struct lh_slot *lh_first_nonempty_slot(uint32_t *idxp,
294 const struct linhash_table *t)
296 uint32_t ns = lh_num_slots_high(t);
298 for (; *idxp < ns; (*idxp)++)
300 return t->slots[*idxp];
304 static void lh_iter_init(struct linhash_iterator *iter, uint32_t idx)
308 iter->head = lh_first_nonempty_slot(&iter->idx, iter->t);
310 iter->next = iter->head->next;
314 * Normally iter->head points to the current head. However, if this head was
315 * removed with lh_iterator_remove_current(), iter->head points to its
316 * predecessor in the list.
318 void linhash_iterator_next(struct linhash_iterator *iter)
328 iter->prev = iter->head;
329 iter->head = iter->next;
330 iter->next = iter->next->next;
333 lh_iter_init(iter, iter->idx + 1);
336 struct linhash_item *linhash_iterator_item(const struct linhash_iterator *iter)
339 if (iter->idx >= iter->t->num_items)
341 return iter->items[iter->idx];
345 return &iter->head->item;
348 void linhash_iterator_free(struct linhash_iterator *iter)
357 /* always succeeds. reverse order is only respected if a comparator is given. */
358 struct linhash_iterator *linhash_iterator_new(struct linhash_table *t,
359 linhash_comparator *comp, bool reverse_sort_order)
361 struct linhash_iterator *iter = xmalloc(sizeof(*iter)), *iter2;
362 struct linhash_item *item;
368 lh_iter_init(iter, 0);
371 iter->reverse = reverse_sort_order;
372 iter2 = linhash_iterator_new(t, NULL, false);
373 iter->items = xmalloc(t->num_items * sizeof(struct linhash_item *));
376 (item = linhash_iterator_item(iter2));
377 linhash_iterator_next(iter2)
379 iter->items[n++] = item;
380 linhash_iterator_free(iter2);
381 qsort(iter->items, t->num_items, sizeof(struct linhash_iter *),
382 (int (*)(const void *, const void *))comp);
383 iter->idx = reverse_sort_order? t->num_items - 1 : 0;
387 /* Deallocate the resources occupied by the given hash table. */
388 void linhash_free(struct linhash_table *t)
390 struct linhash_iterator *iter;
391 struct linhash_item *itemp;
396 iter = linhash_iterator_new(t, NULL, false);
397 (itemp = linhash_iterator_item(iter));
398 linhash_iterator_next(iter)
400 lh_iterator_remove_current(iter);
401 linhash_iterator_free(iter);
402 assert(t->num_items == 0);
407 /* returns item in first arg if key already exists */
408 int linhash_insert(struct linhash_item *item, struct linhash_table *t,
414 s = lh_lookup(item->key, t, &hash, &idx, NULL);
417 *object = &s->item.object;
420 s = xmalloc(sizeof(*s));
423 DEBUG_LOG("inserting item #%u, key: %s, hash: %u, idx: %u\n",
424 t->num_items, item->key, hash, idx);
425 s->next = t->slots[idx];
429 if (!lh_must_grow(t)) {
430 DEBUG_LOG("no need to grow\n");
435 if (t->split_position < lh_num_slots_low(t))
437 t->split_position = 0;
442 uint32_t linhash_num_items(const struct linhash_table *t)
447 char *linhash_statistics(const struct linhash_table *t)
449 uint32_t min_fill = -1, max_fill = 0, n, idx, ns;
450 uint32_t fill_count[11] = {0};
453 ns = lh_num_slots_low(t) + t->split_position;
454 for (idx = 0; idx < ns; idx++) {
456 for (n = 0, s = t->slots[idx]; s; s = s->next, n++)
458 min_fill = MIN(min_fill, n);
459 max_fill = MAX(max_fill, n);
460 fill_count[n < 10? n : 10]++;
463 "order............... %2u\n"
464 "num slots........... %u\n"
465 "num items (table)... %u\n"
466 "load factor........ %3u%%\n"
467 "min fill............ %2u\n"
468 "max fill............ %2u\n"
469 "max count[0]....... %3u%% (%u)\n"
470 "max count[1]....... %3u%% (%u)\n"
471 "max count[2]....... %3u%% (%u)\n"
472 "max count[3]....... %3u%% (%u)\n"
473 "max count[4]....... %3u%% (%u)\n"
474 "max count[5]....... %3u%% (%u)\n"
475 "max count[6]....... %3u%% (%u)\n"
476 "max count[7]....... %3u%% (%u)\n"
477 "max count[8]....... %3u%% (%u)\n"
478 "max count[9]....... %3u%% (%u)\n"
479 "max count[10+]..... %3u%% (%u)\n"
484 (t->num_items * 100 + (ns / 2)) / ns,
487 100 * fill_count[0] / ns, fill_count[0],
488 100 * fill_count[1] / ns, fill_count[1],
489 100 * fill_count[2] / ns, fill_count[2],
490 100 * fill_count[3] / ns, fill_count[3],
491 100 * fill_count[4] / ns, fill_count[4],
492 100 * fill_count[5] / ns, fill_count[5],
493 100 * fill_count[6] / ns, fill_count[6],
494 100 * fill_count[7] / ns, fill_count[7],
495 100 * fill_count[8] / ns, fill_count[8],
496 100 * fill_count[9] / ns, fill_count[9],
497 100 * fill_count[10] / ns, fill_count[10]