/*
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Path-compressed radix trie implementation.
 *
 * The implementation takes into account the following rationale:
 * - Size of the nodes should be as small as possible but still big enough
 *   to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the nodes
 *   and the necessity to avoid too much cache pollution during the trie
 *   operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other might be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */
49 #include <sys/cdefs.h>
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
56 #include <sys/proc.h> /* smr.h depends on struct thread. */
58 #include <sys/smr_types.h>
/* Mask extracting one PCTRIE_WIDTH-bit slot number from a key. */
#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
/* Highest trie level: enough PCTRIE_WIDTH-bit digits to cover 64-bit keys. */
#define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)

/* Flag bits stored in node pointers. */
#define	PCTRIE_ISLEAF	0x1
#define	PCTRIE_FLAGS	0x1
#define	PCTRIE_PAD	PCTRIE_FLAGS

/* Returns one unit associated with specified level. */
#define	PCTRIE_UNITLEVEL(lev)						\
	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
77 typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;
80 uint64_t pn_owner; /* Owner of record. */
81 uint16_t pn_count; /* Valid children. */
82 uint8_t pn_clev; /* Current level. */
83 int8_t pn_last; /* Zero last ptr. */
84 smr_pctnode_t pn_child[PCTRIE_COUNT]; /* Child nodes. */
87 enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
89 static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
90 enum pctrie_access access);
93 * Allocate a node. Pre-allocation should ensure that the request
94 * will always be satisfied.
96 static struct pctrie_node *
97 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
98 uint16_t count, uint16_t clevel)
100 struct pctrie_node *node;
102 node = allocfn(ptree);
107 * We want to clear the last child pointer after the final section
108 * has exited so lookup can not return false negatives. It is done
109 * here because it will be cache-cold in the dtor callback.
111 if (node->pn_last != 0) {
112 pctrie_node_store(&node->pn_child[node->pn_last - 1], NULL,
113 PCTRIE_UNSERIALIZED);
116 node->pn_owner = owner;
117 node->pn_count = count;
118 node->pn_clev = clevel;
126 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
127 pctrie_free_t freefn, int8_t last)
132 KASSERT(node->pn_count == 0,
133 ("pctrie_node_put: node %p has %d children", node,
135 for (slot = 0; slot < PCTRIE_COUNT; slot++) {
138 KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
139 NULL, ("pctrie_node_put: node %p has a child", node));
142 node->pn_last = last + 1;
147 * Return the position in the array for a given level.
150 pctrie_slot(uint64_t index, uint16_t level)
153 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
156 /* Trims the key after the specified level. */
157 static __inline uint64_t
158 pctrie_trimkey(uint64_t index, uint16_t level)
164 ret >>= level * PCTRIE_WIDTH;
165 ret <<= level * PCTRIE_WIDTH;
171 * Fetch a node pointer from a slot.
173 static __inline struct pctrie_node *
174 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
177 case PCTRIE_UNSERIALIZED:
178 return (smr_unserialized_load(p, true));
180 return (smr_serialized_load(p, true));
182 return (smr_entered_load(p, smr));
184 __assert_unreachable();
188 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
191 case PCTRIE_UNSERIALIZED:
192 smr_unserialized_store(p, v, true);
195 smr_serialized_store(p, v, true);
198 panic("%s: Not supported in SMR section.", __func__);
201 __assert_unreachable();
207 * Get the root node for a tree.
209 static __inline struct pctrie_node *
210 pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
212 return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
216 * Set the root node for a tree.
219 pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
220 enum pctrie_access access)
222 pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
226 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
229 pctrie_isleaf(struct pctrie_node *node)
232 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
236 * Returns the associated val extracted from node.
238 static __inline uint64_t *
239 pctrie_toval(struct pctrie_node *node)
242 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
246 * Adds the val as a child of the provided node.
249 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
250 uint64_t *val, enum pctrie_access access)
254 slot = pctrie_slot(index, clev);
255 pctrie_node_store(&node->pn_child[slot],
256 (void *)((uintptr_t)val | PCTRIE_ISLEAF), access);
260 * Returns the slot where two keys differ.
261 * It cannot accept 2 equal keys.
263 static __inline uint16_t
264 pctrie_keydiff(uint64_t index1, uint64_t index2)
268 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
269 __func__, (uintmax_t)index1));
272 for (clev = PCTRIE_LIMIT;; clev--)
273 if (pctrie_slot(index1, clev) != 0)
278 * Returns TRUE if it can be determined that key does not belong to the
279 * specified node. Otherwise, returns FALSE.
282 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
285 if (node->pn_clev < PCTRIE_LIMIT) {
286 idx = pctrie_trimkey(idx, node->pn_clev + 1);
287 return (idx != node->pn_owner);
293 * Internal helper for pctrie_reclaim_allnodes().
294 * This function is recursive.
297 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
298 pctrie_free_t freefn)
300 struct pctrie_node *child;
303 KASSERT(node->pn_count <= PCTRIE_COUNT,
304 ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
305 for (slot = 0; node->pn_count != 0; slot++) {
306 child = pctrie_node_load(&node->pn_child[slot], NULL,
307 PCTRIE_UNSERIALIZED);
310 if (!pctrie_isleaf(child))
311 pctrie_reclaim_allnodes_int(ptree, child, freefn);
312 pctrie_node_store(&node->pn_child[slot], NULL,
313 PCTRIE_UNSERIALIZED);
316 pctrie_node_put(ptree, node, freefn, -1);
320 * pctrie node zone initializer.
323 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
325 struct pctrie_node *node;
329 memset(node->pn_child, 0, sizeof(node->pn_child));
334 pctrie_node_size(void)
337 return (sizeof(struct pctrie_node));
341 * Inserts the key-value pair into the trie.
342 * Panics if the key already exists.
345 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
347 uint64_t index, newind;
348 struct pctrie_node *node, *tmp;
349 smr_pctnode_t *parentp;
357 * The owner of record for root is not really important because it
358 * will never be used.
360 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
362 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
365 parentp = (smr_pctnode_t *)&ptree->pt_root;
367 if (pctrie_isleaf(node)) {
368 m = pctrie_toval(node);
370 panic("%s: key %jx is already present",
371 __func__, (uintmax_t)index);
372 clev = pctrie_keydiff(*m, index);
373 tmp = pctrie_node_get(ptree, allocfn,
374 pctrie_trimkey(index, clev + 1), 2, clev);
377 /* These writes are not yet visible due to ordering. */
378 pctrie_addval(tmp, index, clev, val,
379 PCTRIE_UNSERIALIZED);
380 pctrie_addval(tmp, *m, clev, m, PCTRIE_UNSERIALIZED);
381 /* Synchronize to make leaf visible. */
382 pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
384 } else if (pctrie_keybarr(node, index))
386 slot = pctrie_slot(index, node->pn_clev);
387 parentp = &node->pn_child[slot];
388 tmp = pctrie_node_load(parentp, NULL, PCTRIE_LOCKED);
391 pctrie_addval(node, index, node->pn_clev, val,
399 * A new node is needed because the right insertion level is reached.
400 * Setup the new intermediate node and add the 2 children: the
401 * new object and the older edge.
403 newind = node->pn_owner;
404 clev = pctrie_keydiff(newind, index);
405 tmp = pctrie_node_get(ptree, allocfn,
406 pctrie_trimkey(index, clev + 1), 2, clev);
409 slot = pctrie_slot(newind, clev);
410 /* These writes are not yet visible due to ordering. */
411 pctrie_addval(tmp, index, clev, val, PCTRIE_UNSERIALIZED);
412 pctrie_node_store(&tmp->pn_child[slot], node, PCTRIE_UNSERIALIZED);
413 /* Synchronize to make the above visible. */
414 pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
420 * Returns the value stored at the index. If the index is not present,
423 static __always_inline uint64_t *
424 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
425 enum pctrie_access access)
427 struct pctrie_node *node;
431 node = pctrie_root_load(ptree, smr, access);
432 while (node != NULL) {
433 if (pctrie_isleaf(node)) {
434 m = pctrie_toval(node);
439 if (pctrie_keybarr(node, index))
441 slot = pctrie_slot(index, node->pn_clev);
442 node = pctrie_node_load(&node->pn_child[slot], smr, access);
448 * Returns the value stored at the index, assuming access is externally
449 * synchronized by a lock.
451 * If the index is not present, NULL is returned.
454 pctrie_lookup(struct pctrie *ptree, uint64_t index)
456 return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
460 * Returns the value stored at the index without requiring an external lock.
462 * If the index is not present, NULL is returned.
465 pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
470 res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
476 * Look up the nearest entry at a position bigger than or equal to index,
477 * assuming access is externally synchronized by a lock.
480 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
482 struct pctrie_node *stack[PCTRIE_LIMIT];
485 struct pctrie_node *child, *node;
492 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
495 else if (pctrie_isleaf(node)) {
496 m = pctrie_toval(node);
505 * If the keys differ before the current bisection node,
506 * then the search key might rollback to the earliest
507 * available bisection node or to the smallest key
508 * in the current node (if the owner is greater than the
511 if (pctrie_keybarr(node, index)) {
512 if (index > node->pn_owner) {
514 KASSERT(++loops < 1000,
515 ("pctrie_lookup_ge: too many loops"));
518 * Pop nodes from the stack until either the
519 * stack is empty or a node that could have a
520 * matching descendant is found.
526 } while (pctrie_slot(index,
527 node->pn_clev) == (PCTRIE_COUNT - 1));
530 * The following computation cannot overflow
531 * because index's slot at the current level
532 * is less than PCTRIE_COUNT - 1.
534 index = pctrie_trimkey(index,
536 index += PCTRIE_UNITLEVEL(node->pn_clev);
538 index = node->pn_owner;
539 KASSERT(!pctrie_keybarr(node, index),
540 ("pctrie_lookup_ge: keybarr failed"));
542 slot = pctrie_slot(index, node->pn_clev);
543 child = pctrie_node_load(&node->pn_child[slot], NULL,
545 if (pctrie_isleaf(child)) {
546 m = pctrie_toval(child);
549 } else if (child != NULL)
553 * Look for an available edge or val within the current
556 if (slot < (PCTRIE_COUNT - 1)) {
557 inc = PCTRIE_UNITLEVEL(node->pn_clev);
558 index = pctrie_trimkey(index, node->pn_clev);
562 child = pctrie_node_load(&node->pn_child[slot],
563 NULL, PCTRIE_LOCKED);
564 if (pctrie_isleaf(child)) {
565 m = pctrie_toval(child);
568 } else if (child != NULL)
570 } while (slot < (PCTRIE_COUNT - 1));
572 KASSERT(child == NULL || pctrie_isleaf(child),
573 ("pctrie_lookup_ge: child is radix node"));
576 * If a value or edge greater than the search slot is not found
577 * in the current node, ascend to the next higher-level node.
581 KASSERT(node->pn_clev > 0,
582 ("pctrie_lookup_ge: pushing leaf's parent"));
583 KASSERT(tos < PCTRIE_LIMIT,
584 ("pctrie_lookup_ge: stack overflow"));
591 * Look up the nearest entry at a position less than or equal to index,
592 * assuming access is externally synchronized by a lock.
595 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
597 struct pctrie_node *stack[PCTRIE_LIMIT];
600 struct pctrie_node *child, *node;
607 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
610 else if (pctrie_isleaf(node)) {
611 m = pctrie_toval(node);
620 * If the keys differ before the current bisection node,
621 * then the search key might rollback to the earliest
622 * available bisection node or to the largest key
623 * in the current node (if the owner is smaller than the
626 if (pctrie_keybarr(node, index)) {
627 if (index > node->pn_owner) {
628 index = node->pn_owner + PCTRIE_COUNT *
629 PCTRIE_UNITLEVEL(node->pn_clev);
632 KASSERT(++loops < 1000,
633 ("pctrie_lookup_le: too many loops"));
636 * Pop nodes from the stack until either the
637 * stack is empty or a node that could have a
638 * matching descendant is found.
644 } while (pctrie_slot(index,
645 node->pn_clev) == 0);
648 * The following computation cannot overflow
649 * because index's slot at the current level
652 index = pctrie_trimkey(index,
656 KASSERT(!pctrie_keybarr(node, index),
657 ("pctrie_lookup_le: keybarr failed"));
659 slot = pctrie_slot(index, node->pn_clev);
660 child = pctrie_node_load(&node->pn_child[slot], NULL,
662 if (pctrie_isleaf(child)) {
663 m = pctrie_toval(child);
666 } else if (child != NULL)
670 * Look for an available edge or value within the current
674 inc = PCTRIE_UNITLEVEL(node->pn_clev);
679 child = pctrie_node_load(&node->pn_child[slot],
680 NULL, PCTRIE_LOCKED);
681 if (pctrie_isleaf(child)) {
682 m = pctrie_toval(child);
685 } else if (child != NULL)
689 KASSERT(child == NULL || pctrie_isleaf(child),
690 ("pctrie_lookup_le: child is radix node"));
693 * If a value or edge smaller than the search slot is not found
694 * in the current node, ascend to the next higher-level node.
698 KASSERT(node->pn_clev > 0,
699 ("pctrie_lookup_le: pushing leaf's parent"));
700 KASSERT(tos < PCTRIE_LIMIT,
701 ("pctrie_lookup_le: stack overflow"));
708 * Remove the specified index from the tree.
709 * Panics if the key is not present.
712 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
714 struct pctrie_node *node, *parent, *tmp;
718 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
719 if (pctrie_isleaf(node)) {
720 m = pctrie_toval(node);
722 panic("%s: invalid key found", __func__);
723 pctrie_root_store(ptree, NULL, PCTRIE_LOCKED);
729 panic("pctrie_remove: impossible to locate the key");
730 slot = pctrie_slot(index, node->pn_clev);
731 tmp = pctrie_node_load(&node->pn_child[slot], NULL,
733 if (pctrie_isleaf(tmp)) {
734 m = pctrie_toval(tmp);
736 panic("%s: invalid key found", __func__);
737 pctrie_node_store(&node->pn_child[slot], NULL,
740 if (node->pn_count > 1)
742 for (i = 0; i < PCTRIE_COUNT; i++) {
743 tmp = pctrie_node_load(&node->pn_child[i],
744 NULL, PCTRIE_LOCKED);
748 KASSERT(i != PCTRIE_COUNT,
749 ("%s: invalid node configuration", __func__));
751 pctrie_root_store(ptree, tmp, PCTRIE_LOCKED);
753 slot = pctrie_slot(index, parent->pn_clev);
754 KASSERT(pctrie_node_load(
755 &parent->pn_child[slot], NULL,
756 PCTRIE_LOCKED) == node,
757 ("%s: invalid child value", __func__));
758 pctrie_node_store(&parent->pn_child[slot], tmp,
762 * The child is still valid and we can not zero the
763 * pointer until all SMR references are gone.
766 pctrie_node_put(ptree, node, freefn, i);
775 * Remove and free all the nodes from the tree.
776 * This function is recursive but there is a tight control on it as the
777 * maximum depth of the tree is fixed.
780 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
782 struct pctrie_node *root;
784 root = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
787 pctrie_root_store(ptree, NULL, PCTRIE_UNSERIALIZED);
788 if (!pctrie_isleaf(root))
789 pctrie_reclaim_allnodes_int(ptree, root, freefn);
794 * Show details about the given node.
796 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
798 struct pctrie_node *node, *tmp;
803 node = (struct pctrie_node *)addr;
804 db_printf("node %p, owner %jx, children count %u, level %u:\n",
805 (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
807 for (i = 0; i < PCTRIE_COUNT; i++) {
808 tmp = pctrie_node_load(&node->pn_child[i], NULL,
809 PCTRIE_UNSERIALIZED);
811 db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
813 pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,