Update copyright year.
[tfortune.git] / linhash.c
1 /* SPDX-License-Identifier: GPL-3.0-only */
2
3 /*
4 * Simple linear hashing implementation
5 *
6 * Linear hashing is a hash table algorithm by Witold Litwin (1980)
7 * which grows the table on demand without rehashing existing objects.
8 *
9 * This implementation employs a conventional hash function C whose
10 * output length is assumed to be greater than the order of the table,
11 * i.e. the base-2 logarithm of the number of slots. Hash collisions
12 * are resolved by chaining.
13 *
14 * The hash table consists in fact of two tables which are consecutive
15 * in memory. Each table represents N = 1 << n many slots of the
16 * table. Example (n = 2, N = 4):
17 *
18 * [....][....]
19 * 0123 4567
20 *
21 * In what follows, we refer to the two tables as the lower table and
22 * the higher table.
23 *
24 * C gives rise to a sequence of hash functions C_k via C_k(key) :=
25 * lower k bits of C(key). In terms of C_k the lower and higher hash
26 * functions, h and H, are defined by h = C_n and H = C_{n+1}. Hence
27 * h(key) always corresponds to a slot in the lower table while H(key)
28 * corresponds to a slot in either table. Both functions are trivial to
29 * compute given C(key).
30 *
31 * The algorithm also maintains a split position s which corresponds to
32 * a slot of the lower table and starts out at zero.
33 *
34 * [s...][....]
35 * 0123 4567
36 *
37 * The linear hash function L is defined by L(key) = H(key) if h(key)
38 * < s, and L(key) = h(key) otherwise. In other words, L(key) is either
39 * the lower or the higher hash value, depending on the relation between
40 * the lower hash value and s. The higher hash function applies for all
41 * keys whose lower hash value is smaller than s. Initially we have L = h.
42 *
43 * On insertion, if the table load is large, the chain of entries which
44 * hash to s is split by transitioning from h to H.
45 *
46 * For an existing key, we have either H(key) = h(key) or H(key) =
47 * h(key) + 1 << n. That is, the most significant bit of H(key) tells
48 * whether this entry must be moved from its slot in the low table to the
49 * corresponding slot in the high table. Approximately half the entries
50 * will be moved this way. To reflect the fact that from now on L(key)
51 * = H(key) for all keys with h(key) = s, s is increased by one.
52 *
53 * Each time s reaches 1 << n, the upper bound of the lower table, we
54 * have L = H. In this case the hash table size is doubled, and the split
55 * position is reset to zero. Moreover, h and H are redefined as h :=
56 * C_{n+1} (i.e., the new h becomes the former H), and H := C_{n+2}. This
57 * means the table order n has increased by one, and a new round begins.
58 *
59 * This implementation includes an API for iterating over all entries
60 * in the hash table which avoids callback functions. Instead an opaque
61 * iterator structure is provided together with a set of functions that
62 * create or operate on such a structure.
63 *
64 * While shrinking a linear hash table can performed essentially
65 * by reversing the steps for growing described above, this is not
66 * implemented since it is not needed for tfortune.
67 */
68
69 #include "tf.h"
70
71 struct lh_slot {
72 struct linhash_item item;
73 uint32_t hash;
74 struct lh_slot *next;
75 };
76
77 struct linhash_table {
78 struct lh_slot **slots;
79 unsigned order;
80 uint32_t num_items; /* how many items have been inserted so far. */
81 uint32_t split_position;
82 };
83
84 struct linhash_iterator {
85 /* these are used regardless of whether comp is NULL */
86 struct linhash_table *t;
87 uint32_t idx;
88 /* comp == NULL means: iterate in hash order */
89 linhash_comparator *comp;
90 /* only used if comp == NULL */
91 struct lh_slot *head, *prev, *next;
92 /* only used if comp != NULL */
93 struct linhash_item **items;
94 bool reverse;
95 };
96
97 static uint32_t lh_num_slots_low(const struct linhash_table *t)
98 {
99 return (uint32_t)1 << t->order;
100 }
101
102 static uint32_t lh_num_slots_high(const struct linhash_table *t)
103 {
104 return lh_num_slots_low(t) * 2;
105 }
106
107 static uint32_t lh_index_low(uint32_t hash, const struct linhash_table *t)
108 {
109 return hash % lh_num_slots_low(t);
110 }
111
112 static uint32_t lh_index_high(uint32_t hash, const struct linhash_table *t)
113 {
114 return hash % lh_num_slots_high(t);
115 }
116
117 static bool lh_must_grow(const struct linhash_table *t)
118 {
119 return t->num_items >= lh_num_slots_high(t) / 2;
120 }
121
122 /* The simple DJB (Daniel J Bernstein) hash is good enough here. */
123 static uint32_t lh_hash(const char *data)
124 {
125 uint32_t h = 5381;
126 const unsigned char *c = (typeof(c))data;
127
128 while (*(c++))
129 h = h * 33 + *c;
130 return h;
131 }
132
133 /*
134 * Create a new table for linear hashing.
135 *
136 * The order argument determines the initial number of slots: it is given by 2
137 * << order. The hash table grows on demand, and the point of linear hashing is
138 * the ability to cheaply grow the table, so specifying anything greater than
139 * zero is probably not necessary.
140 *
141 * The functtion returns an opaque pointer which serves as the handle to the
142 * newly created hash table. Most functions of the linhash API take such a
143 * pointer to know the table to operate on. This function either succeeds or
144 * terminates, it never returns NULL.
145 */
146 struct linhash_table *linhash_new(uint32_t order)
147 {
148 struct linhash_table *t;
149 uint32_t ns;
150
151 DEBUG_LOG("creating order %u hash table\n", order);
152 t = xmalloc(sizeof(*t));
153 t->order = order;
154 t->num_items = 0;
155 t->split_position = 0;
156 ns = lh_num_slots_high(t);
157 t->slots = xcalloc(ns * sizeof(*t->slots));
158 return t;
159 }
160
161 static uint32_t lh_index(uint32_t hash, const struct linhash_table *t)
162 {
163 uint32_t low = lh_index_low(hash, t);
164
165 if (low >= t->split_position)
166 return low;
167 return lh_index_high(hash, t);
168 }
169
170 static void lh_split_slot(struct linhash_table *t)
171 {
172 struct lh_slot *prev, *next, *s;
173 uint32_t sp = t->split_position;
174
175 DEBUG_LOG("splitting slot %u\n", sp);
176 for (prev = NULL, s = t->slots[sp]; s;) {
177 uint32_t idx = lh_index_high(s->hash, t);
178 if (idx == sp) {
179 prev = s;
180 s = s->next;
181 continue;
182 }
183 DEBUG_LOG("moving %s up from slot %u to slot %u\n",
184 s->item.key, sp, idx);
185 if (t->slots[sp] == s)
186 t->slots[sp] = s->next;
187 if (prev)
188 prev->next = s->next;
189 next = s->next;
190 s->next = t->slots[idx];
191 t->slots[idx] = s;
192 s = next;
193 }
194 DEBUG_LOG("slot %u has been split\n", sp);
195 if (!t->slots[sp])
196 DEBUG_LOG("slot #%u has become empty\n", sp);
197 }
198
199 static void lh_grow_table(struct linhash_table *t)
200 {
201 uint32_t idx, ns;
202
203 DEBUG_LOG("growing hash table to order %u\n", ++t->order);
204 ns = lh_num_slots_high(t);
205 t->slots = xrealloc(t->slots, ns * sizeof(*t->slots));
206 idx = lh_num_slots_low(t);
207 memset(t->slots + idx, 0, (ns - idx) * sizeof(*t->slots));
208 }
209
210 static struct lh_slot *lh_lookup(const char *key,
211 const struct linhash_table *t, uint32_t *hashp, uint32_t *idxp,
212 struct lh_slot **prevp)
213 {
214 struct lh_slot *s, *prev;
215 uint32_t hash, idx;
216
217 if (!t)
218 return NULL;
219 hash = lh_hash(key);
220 idx = lh_index(hash, t);
221 //DEBUG_LOG("key %s, hash: %u, idx: %u\n", key, hash, idx);
222 if (hashp)
223 *hashp = hash;
224 if (idxp)
225 *idxp = idx;
226 for (s = t->slots[idx], prev = NULL; s; prev = s, s = s->next) {
227 //DEBUG_LOG("comparing %s vs. %s\n", key, s->item.key);
228 if (strcmp(s->item.key, key))
229 continue;
230 /* found it */
231 if (prevp)
232 *prevp = prev;
233 return s;
234 }
235 if (prevp)
236 *prevp = NULL;
237 return NULL;
238 }
239
240 /**
241 * Find the entry identified by a key.
242 *
243 * \param key The key to look up.
244 * \param t Where to look up the key.
245 */
246 struct linhash_item *linhash_lookup(const char *key,
247 const struct linhash_table *t)
248 {
249 struct lh_slot *s = lh_lookup(key, t, NULL, NULL, NULL);
250
251 if (!s)
252 return NULL;
253 return &s->item;
254 }
255
256 static void *lh_remove(struct lh_slot *s, struct lh_slot *prev, uint32_t idx, struct linhash_table *t)
257 {
258 void *obj;
259
260 if (!s)
261 return NULL;
262 t->num_items--;
263 obj = s->item.object;
264 if (prev)
265 prev->next = s->next;
266 else
267 t->slots[idx] = s->next;
268 free(s);
269 return obj;
270 }
271
272 /* Note: This renders all existing iterators stale. */
273 void *linhash_remove(const char *key, struct linhash_table *t)
274 {
275 uint32_t idx;
276 struct lh_slot *prev, *s = lh_lookup(key, t, NULL, &idx, &prev);
277
278 return lh_remove(s, prev, idx, t);
279 }
280
281 static void *lh_iterator_remove_current(struct linhash_iterator *iter)
282 {
283 void *obj;
284
285 assert(!iter->comp);
286 if (!iter->head)
287 return NULL;
288 obj = lh_remove(iter->head, iter->prev, iter->idx, iter->t);
289 iter->head = iter->prev;
290 return obj;
291 }
292
293 static struct lh_slot *lh_first_nonempty_slot(uint32_t *idxp,
294 const struct linhash_table *t)
295 {
296 uint32_t ns = lh_num_slots_high(t);
297
298 for (; *idxp < ns; (*idxp)++)
299 if (t->slots[*idxp])
300 return t->slots[*idxp];
301 return NULL;
302 }
303
304 static void lh_iter_init(struct linhash_iterator *iter, uint32_t idx)
305 {
306 iter->idx = idx;
307 iter->prev = NULL;
308 iter->head = lh_first_nonempty_slot(&iter->idx, iter->t);
309 if (iter->head)
310 iter->next = iter->head->next;
311 }
312
313 /*
314 * Normally iter->head points to the current head. However, if this head was
315 * removed with lh_iterator_remove_current(), iter->head points to its
316 * predecessor in the list.
317 */
318 void linhash_iterator_next(struct linhash_iterator *iter)
319 {
320 if (iter->comp) {
321 if (iter->reverse)
322 iter->idx--;
323 else
324 iter->idx++;
325 return;
326 }
327 if (iter->next) {
328 iter->prev = iter->head;
329 iter->head = iter->next;
330 iter->next = iter->next->next;
331 return;
332 }
333 lh_iter_init(iter, iter->idx + 1);
334 }
335
336 struct linhash_item *linhash_iterator_item(const struct linhash_iterator *iter)
337 {
338 if (iter->comp) {
339 if (iter->idx >= iter->t->num_items)
340 return NULL;
341 return iter->items[iter->idx];
342 }
343 if (!iter->head)
344 return NULL;
345 return &iter->head->item;
346 }
347
348 void linhash_iterator_free(struct linhash_iterator *iter)
349 {
350 if (!iter)
351 return;
352 if (iter->comp)
353 free(iter->items);
354 free(iter);
355 }
356
357 /* always succeeds. reverse order is only respected if a comparator is given. */
358 struct linhash_iterator *linhash_iterator_new(struct linhash_table *t,
359 linhash_comparator *comp, bool reverse_sort_order)
360 {
361 struct linhash_iterator *iter = xmalloc(sizeof(*iter)), *iter2;
362 struct linhash_item *item;
363 unsigned n;
364
365 iter->t = t;
366 iter->comp = comp;
367 if (!comp) {
368 lh_iter_init(iter, 0);
369 return iter;
370 }
371 iter->reverse = reverse_sort_order;
372 iter2 = linhash_iterator_new(t, NULL, false);
373 iter->items = xmalloc(t->num_items * sizeof(struct linhash_item *));
374 for (
375 n = 0;
376 (item = linhash_iterator_item(iter2));
377 linhash_iterator_next(iter2)
378 )
379 iter->items[n++] = item;
380 linhash_iterator_free(iter2);
381 qsort(iter->items, t->num_items, sizeof(struct linhash_iter *),
382 (int (*)(const void *, const void *))comp);
383 iter->idx = reverse_sort_order? t->num_items - 1 : 0;
384 return iter;
385 }
386
387 /* Deallocate the resources occupied by the given hash table. */
388 void linhash_free(struct linhash_table *t)
389 {
390 struct linhash_iterator *iter;
391 struct linhash_item *itemp;
392
393 if (!t)
394 return;
395 for (
396 iter = linhash_iterator_new(t, NULL, false);
397 (itemp = linhash_iterator_item(iter));
398 linhash_iterator_next(iter)
399 )
400 lh_iterator_remove_current(iter);
401 linhash_iterator_free(iter);
402 assert(t->num_items == 0);
403 free(t->slots);
404 free(t);
405 }
406
407 /* returns item in first arg if key already exists */
408 int linhash_insert(struct linhash_item *item, struct linhash_table *t,
409 void ***object)
410 {
411 struct lh_slot *s;
412 uint32_t idx, hash;
413
414 s = lh_lookup(item->key, t, &hash, &idx, NULL);
415 if (s) {
416 if (object)
417 *object = &s->item.object;
418 return -E_LH_EXIST;
419 }
420 s = xmalloc(sizeof(*s));
421 s->item = *item;
422 s->hash = hash;
423 DEBUG_LOG("inserting item #%u, key: %s, hash: %u, idx: %u\n",
424 t->num_items, item->key, hash, idx);
425 s->next = t->slots[idx];
426 t->slots[idx] = s;
427 t->num_items++;
428
429 if (!lh_must_grow(t)) {
430 DEBUG_LOG("no need to grow\n");
431 return 0;
432 }
433 lh_split_slot(t);
434 t->split_position++;
435 if (t->split_position < lh_num_slots_low(t))
436 return 1;
437 t->split_position = 0;
438 lh_grow_table(t);
439 return 2;
440 }
441
442 uint32_t linhash_num_items(const struct linhash_table *t)
443 {
444 return t->num_items;
445 }
446
447 char *linhash_statistics(const struct linhash_table *t)
448 {
449 uint32_t min_fill = -1, max_fill = 0, n, idx, ns;
450 uint32_t fill_count[11] = {0};
451 char *result;
452
453 ns = lh_num_slots_low(t) + t->split_position;
454 for (idx = 0; idx < ns; idx++) {
455 struct lh_slot *s;
456 for (n = 0, s = t->slots[idx]; s; s = s->next, n++)
457 ; /* nothing */
458 min_fill = MIN(min_fill, n);
459 max_fill = MAX(max_fill, n);
460 fill_count[n < 10? n : 10]++;
461 }
462 xasprintf(&result,
463 "order............... %2u\n"
464 "num slots........... %u\n"
465 "num items (table)... %u\n"
466 "load factor........ %3u%%\n"
467 "min fill............ %2u\n"
468 "max fill............ %2u\n"
469 "max count[0]....... %3u%% (%u)\n"
470 "max count[1]....... %3u%% (%u)\n"
471 "max count[2]....... %3u%% (%u)\n"
472 "max count[3]....... %3u%% (%u)\n"
473 "max count[4]....... %3u%% (%u)\n"
474 "max count[5]....... %3u%% (%u)\n"
475 "max count[6]....... %3u%% (%u)\n"
476 "max count[7]....... %3u%% (%u)\n"
477 "max count[8]....... %3u%% (%u)\n"
478 "max count[9]....... %3u%% (%u)\n"
479 "max count[10+]..... %3u%% (%u)\n"
480 ,
481 t->order,
482 ns,
483 t->num_items,
484 (t->num_items * 100 + (ns / 2)) / ns,
485 min_fill,
486 max_fill,
487 100 * fill_count[0] / ns, fill_count[0],
488 100 * fill_count[1] / ns, fill_count[1],
489 100 * fill_count[2] / ns, fill_count[2],
490 100 * fill_count[3] / ns, fill_count[3],
491 100 * fill_count[4] / ns, fill_count[4],
492 100 * fill_count[5] / ns, fill_count[5],
493 100 * fill_count[6] / ns, fill_count[6],
494 100 * fill_count[7] / ns, fill_count[7],
495 100 * fill_count[8] / ns, fill_count[8],
496 100 * fill_count[9] / ns, fill_count[9],
497 100 * fill_count[10] / ns, fill_count[10]
498 );
499 return result;
500 }