2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2013 EMC Corp.
5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * Path-compressed radix trie implementation.
35 * The implementation takes into account the following rationale:
36 * - Size of the nodes should be as small as possible but still big enough
37 * to avoid a large maximum depth for the trie. This is a balance
38 * between the necessity to not wire too much physical memory for the nodes
39 * and the necessity to avoid too much cache pollution during the trie
41 * - There is not a huge bias toward the number of lookup operations over
42 * the number of insert and remove operations. This basically implies
43 * that optimizations supposedly helping one operation but hurting the
44 * other might be carefully evaluated.
45 * - On average not many nodes are expected to be fully populated, hence
46 * level compression may just complicate things.
49 #include <sys/cdefs.h>
50 __FBSDID("$FreeBSD$");
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/kernel.h>
57 #include <sys/pctrie.h>
58 #include <sys/proc.h> /* smr.h depends on struct thread. */
60 #include <sys/smr_types.h>
/*
 * Derived constants and node-pointer tag bits.
 * NOTE(review): this fragment carries leaked line numbers ("66", "67", ...)
 * fused to each line and has elided lines between them — extraction damage;
 * code bytes retained verbatim.
 */
/* Mask selecting one PCTRIE_WIDTH-bit slice of a key. */
66 #define PCTRIE_MASK (PCTRIE_COUNT - 1)
/* Index of the deepest level needed to cover a full 64-bit key. */
67 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
69 /* Flag bits stored in node pointers. */
/* Low bit set on a child pointer marks it as a leaf (a uint64_t value). */
70 #define PCTRIE_ISLEAF 0x1
71 #define PCTRIE_FLAGS 0x1
72 #define PCTRIE_PAD PCTRIE_FLAGS
74 /* Returns one unit associated with specified level. */
/* The key increment represented by one slot step at level `lev`. */
75 #define PCTRIE_UNITLEVEL(lev) \
76 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
/* SMR-protected pointer to a trie node (or tagged leaf value). */
79 typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;
/*
 * Trie node fields.
 * NOTE(review): the opening "struct pctrie_node {" line appears to be
 * elided here (extraction damage); these lines are its members.
 */
82 uint64_t pn_owner; /* Owner of record. */
83 uint16_t pn_count; /* Valid children. */
84 uint8_t pn_clev; /* Current level. */
85 int8_t pn_last; /* Zero last ptr. */
86 smr_pctnode_t pn_child[PCTRIE_COUNT]; /* Child nodes. */
/*
 * How a load/store is serialized: inside an SMR read section, under the
 * tree's external lock, or with no concurrent readers possible at all.
 */
89 enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
/* Forward declaration; definition appears below. */
91 static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
92 enum pctrie_access access);
/*
 * pctrie_node_get: allocate a node via the caller-supplied allocfn and
 * initialize its owner/count/level fields.
 * NOTE(review): several lines are elided in this fragment (function braces,
 * the comment opener, the allocation-failure path, and the trailing
 * "return (node);" presumably) — extraction damage; code retained verbatim.
 */
95 * Allocate a node. Pre-allocation should ensure that the request
96 * will always be satisfied.
98 static struct pctrie_node *
99 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
100 uint16_t count, uint16_t clevel)
102 struct pctrie_node *node;
104 node = allocfn(ptree);
/*
 * pn_last records slot+1 of the child pointer deliberately left set at
 * free time (so concurrent SMR lookups do not see a false negative);
 * clear that pointer now, before the node is reused.
 */
109 * We want to clear the last child pointer after the final section
110 * has exited so lookup can not return false negatives. It is done
111 * here because it will be cache-cold in the dtor callback.
113 if (node->pn_last != 0) {
114 pctrie_node_store(&node->pn_child[node->pn_last - 1], NULL,
115 PCTRIE_UNSERIALIZED);
118 node->pn_owner = owner;
119 node->pn_count = count;
120 node->pn_clev = clevel;
/*
 * pctrie_node_put: return an empty node to freefn.  `last` is the slot
 * index whose child pointer must remain set until SMR readers drain
 * (-1 when no such slot exists).
 * NOTE(review): lines elided in this fragment (return type, braces,
 * the freefn call); leaked line numbers retained verbatim.
 */
128 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
129 pctrie_free_t freefn, int8_t last)
134 KASSERT(node->pn_count == 0,
135 ("pctrie_node_put: node %p has %d children", node,
/* Debug-only sweep: every child slot (except `last`) must be NULL. */
137 for (slot = 0; slot < PCTRIE_COUNT; slot++) {
140 KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
141 NULL, ("pctrie_node_put: node %p has a child", node));
/* Encode last+1 so that 0 means "no deferred clear" in pctrie_node_get. */
144 node->pn_last = last + 1;
/*
 * pctrie_slot: extract the child-array index for `index` at trie `level`.
 * NOTE(review): return type and braces elided here — extraction damage.
 */
149 * Return the position in the array for a given level.
152 pctrie_slot(uint64_t index, uint16_t level)
155 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
/*
 * pctrie_trimkey: zero the low `level * PCTRIE_WIDTH` bits of `index`.
 * NOTE(review): the declaration/initialization of `ret` and the final
 * return are elided in this fragment — extraction damage.
 */
158 /* Trims the key after the specified level. */
159 static __inline uint64_t
160 pctrie_trimkey(uint64_t index, uint16_t level)
166 ret >>= level * PCTRIE_WIDTH;
167 ret <<= level * PCTRIE_WIDTH;
/*
 * pctrie_node_load: read a child pointer with the SMR primitive matching
 * the caller's serialization level (unserialized / locked / SMR-entered).
 * NOTE(review): the switch head and the PCTRIE_LOCKED / PCTRIE_SMR case
 * labels appear to be elided here — extraction damage.
 */
173 * Fetch a node pointer from a slot.
175 static __inline struct pctrie_node *
176 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
179 case PCTRIE_UNSERIALIZED:
180 return (smr_unserialized_load(p, true));
182 return (smr_serialized_load(p, true));
184 return (smr_entered_load(p, smr));
/* All enum values handled above; anything else is a programming error. */
186 __assert_unreachable();
/*
 * pctrie_node_store: write a child pointer with the SMR primitive matching
 * the caller's serialization level.  Stores are never legal from within an
 * SMR read section (readers are lockless; writers must hold the tree lock).
 * NOTE(review): switch head, remaining case labels, and break statements
 * are elided in this fragment — extraction damage.
 */
190 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
193 case PCTRIE_UNSERIALIZED:
194 smr_unserialized_store(p, v, true);
197 smr_serialized_store(p, v, true);
/* PCTRIE_SMR: writing from a read-side section is forbidden. */
200 panic("%s: Not supported in SMR section.", __func__);
203 __assert_unreachable();
/*
 * pctrie_root_load: read the tree root through the same SMR-aware accessor
 * used for interior child slots (pt_root doubles as an smr_pctnode_t).
 */
209 * Get the root node for a tree.
211 static __inline struct pctrie_node *
212 pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
214 return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
/*
 * pctrie_root_store: write the tree root through the SMR-aware store,
 * mirroring pctrie_root_load.
 */
218 * Set the root node for a tree.
221 pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
222 enum pctrie_access access)
224 pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
/* pctrie_isleaf: test the PCTRIE_ISLEAF tag bit on a node pointer. */
228 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
231 pctrie_isleaf(struct pctrie_node *node)
234 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
/*
 * pctrie_toval: strip the tag bits from a leaf pointer, recovering the
 * uint64_t key field embedded in the caller's record.
 */
238 * Returns the associated val extracted from node.
240 static __inline uint64_t *
241 pctrie_toval(struct pctrie_node *node)
244 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
/*
 * pctrie_addval: store `val`, tagged as a leaf, into the child slot of
 * `node` selected by `index` at level `clev`.
 */
248 * Adds the val as a child of the provided node.
251 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
252 uint64_t *val, enum pctrie_access access)
256 slot = pctrie_slot(index, clev);
/* Tag the value pointer with PCTRIE_ISLEAF so lookups know it is a leaf. */
257 pctrie_node_store(&node->pn_child[slot],
258 (void *)((uintptr_t)val | PCTRIE_ISLEAF), access);
/*
 * pctrie_keydiff: find the highest level at which two (distinct) keys
 * differ, scanning from PCTRIE_LIMIT downward.
 * NOTE(review): lines are elided here — presumably the statement that
 * combines index1 and index2 (e.g. an XOR) before the level scan, plus
 * the loop body's return — TODO confirm against the original file.
 */
262 * Returns the slot where two keys differ.
263 * It cannot accept 2 equal keys.
265 static __inline uint16_t
266 pctrie_keydiff(uint64_t index1, uint64_t index2)
270 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
271 __func__, (uintmax_t)index1));
273 for (clev = PCTRIE_LIMIT;; clev--)
275 if (pctrie_slot(index1, clev) != 0)
/*
 * pctrie_keybarr: key-barrier check — TRUE when `idx`, trimmed to this
 * node's level, cannot match the node's owner prefix (so the key is
 * provably absent from this subtree).  A node at PCTRIE_LIMIT covers the
 * whole key space, hence no barrier; the FALSE return for that path is
 * elided in this fragment (extraction damage).
 */
280 * Returns TRUE if it can be determined that key does not belong to the
281 * specified node. Otherwise, returns FALSE.
284 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
287 if (node->pn_clev < PCTRIE_LIMIT) {
288 idx = pctrie_trimkey(idx, node->pn_clev + 1);
289 return (idx != node->pn_owner);
/*
 * pctrie_reclaim_allnodes_int: depth-first teardown of a subtree.  For
 * each non-NULL child: recurse into internal nodes, then clear the slot;
 * finally hand the emptied node to freefn.  Recursion depth is bounded by
 * the fixed trie height.
 * NOTE(review): lines elided in this fragment (return type, braces, the
 * NULL-child continue, and the pn_count decrement presumably) —
 * extraction damage; code retained verbatim.
 */
295 * Internal helper for pctrie_reclaim_allnodes().
296 * This function is recursive.
299 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
300 pctrie_free_t freefn)
302 struct pctrie_node *child;
305 KASSERT(node->pn_count <= PCTRIE_COUNT,
306 ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
/* Loop until every live child has been visited and cleared. */
307 for (slot = 0; node->pn_count != 0; slot++) {
308 child = pctrie_node_load(&node->pn_child[slot], NULL,
309 PCTRIE_UNSERIALIZED);
312 if (!pctrie_isleaf(child))
313 pctrie_reclaim_allnodes_int(ptree, child, freefn);
314 pctrie_node_store(&node->pn_child[slot], NULL,
315 PCTRIE_UNSERIALIZED);
/* -1: no child pointer needs a deferred clear; the tree is going away. */
318 pctrie_node_put(ptree, node, freefn, -1);
/*
 * pctrie_zone_init: UMA-style zone constructor — start every node with an
 * all-NULL child array so freshly allocated nodes are safe to link.
 * NOTE(review): the assignment of `mem` to `node` and the return are
 * elided in this fragment — extraction damage.
 */
322 * pctrie node zone initializer.
325 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
327 struct pctrie_node *node;
331 memset(node->pn_child, 0, sizeof(node->pn_child));
/*
 * pctrie_node_size: export the node size so consumers can size their
 * allocation zones without seeing the struct definition.
 */
336 pctrie_node_size(void)
339 return (sizeof(struct pctrie_node));
/*
 * pctrie_insert: insert `val` (whose first field is its uint64_t key)
 * into the trie.  Three cases: empty tree (store a tagged leaf as root);
 * descend until a leaf or a key barrier is met, splitting a leaf with a
 * new 2-child node; or split an edge at a key-barrier node with a new
 * intermediate node.  Writers must hold the tree lock; stores are ordered
 * so concurrent SMR lookups never see a half-built subtree.
 * NOTE(review): many lines are elided in this fragment (the descent loop
 * head, `index = *val`, several returns/braces) — extraction damage;
 * code retained verbatim, do not treat this text as compilable.
 */
343 * Inserts the key-value pair into the trie.
344 * Panics if the key already exists.
347 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
349 uint64_t index, newind;
350 struct pctrie_node *node, *tmp;
351 smr_pctnode_t *parentp;
359 * The owner of record for root is not really important because it
360 * will never be used.
362 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
/* Empty tree: the root itself becomes a tagged leaf. */
364 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
367 parentp = (smr_pctnode_t *)&ptree->pt_root;
369 if (pctrie_isleaf(node)) {
370 m = pctrie_toval(node);
372 panic("%s: key %jx is already present",
373 __func__, (uintmax_t)index);
/* Split the existing leaf: new node holding both leaves. */
374 clev = pctrie_keydiff(*m, index);
375 tmp = pctrie_node_get(ptree, allocfn,
376 pctrie_trimkey(index, clev + 1), 2, clev);
379 /* These writes are not yet visible due to ordering. */
380 pctrie_addval(tmp, index, clev, val,
381 PCTRIE_UNSERIALIZED);
382 pctrie_addval(tmp, *m, clev, m, PCTRIE_UNSERIALIZED);
383 /* Synchronize to make leaf visible. */
384 pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
386 } else if (pctrie_keybarr(node, index))
/* Descend one level, remembering the parent slot for the final store. */
388 slot = pctrie_slot(index, node->pn_clev);
389 parentp = &node->pn_child[slot];
390 tmp = pctrie_node_load(parentp, NULL, PCTRIE_LOCKED);
/* Empty slot at the right level: store the leaf directly. */
393 pctrie_addval(node, index, node->pn_clev, val,
401 * A new node is needed because the right insertion level is reached.
402 * Setup the new intermediate node and add the 2 children: the
403 * new object and the older edge.
405 newind = node->pn_owner;
406 clev = pctrie_keydiff(newind, index);
407 tmp = pctrie_node_get(ptree, allocfn,
408 pctrie_trimkey(index, clev + 1), 2, clev);
411 slot = pctrie_slot(newind, clev);
412 /* These writes are not yet visible due to ordering. */
413 pctrie_addval(tmp, index, clev, val, PCTRIE_UNSERIALIZED);
414 pctrie_node_store(&tmp->pn_child[slot], node, PCTRIE_UNSERIALIZED);
415 /* Synchronize to make the above visible. */
416 pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
/*
 * _pctrie_lookup: common lookup walk, __always_inline so each caller gets
 * a specialization for its access mode (locked vs. SMR).  Descend from
 * the root; a leaf with a matching key is the hit, a key barrier or NULL
 * child is a miss.
 * NOTE(review): elided lines include the key-match test after
 * pctrie_toval and the final NULL return — extraction damage.
 */
422 * Returns the value stored at the index. If the index is not present,
425 static __always_inline uint64_t *
426 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
427 enum pctrie_access access)
429 struct pctrie_node *node;
433 node = pctrie_root_load(ptree, smr, access);
434 while (node != NULL) {
435 if (pctrie_isleaf(node)) {
436 m = pctrie_toval(node);
/* Prefix mismatch at this node: key cannot be in this subtree. */
441 if (pctrie_keybarr(node, index))
443 slot = pctrie_slot(index, node->pn_clev);
444 node = pctrie_node_load(&node->pn_child[slot], smr, access);
/*
 * pctrie_lookup: locked-mode wrapper around the common lookup walk.
 */
450 * Returns the value stored at the index, assuming access is externally
451 * synchronized by a lock.
453 * If the index is not present, NULL is returned.
456 pctrie_lookup(struct pctrie *ptree, uint64_t index)
458 return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
/*
 * pctrie_lookup_unlocked: lockless lookup under SMR protection.
 * NOTE(review): the smr_enter/smr_exit bracketing and the return of
 * `res` are elided in this fragment — extraction damage.
 */
462 * Returns the value stored at the index without requiring an external lock.
464 * If the index is not present, NULL is returned.
467 pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
472 res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
/*
 * pctrie_lookup_ge: find the smallest stored key >= index (successor
 * search), caller holds the tree lock.  Walks down recording ancestors in
 * `stack`; on a key barrier or exhausted node it either advances `index`
 * past the current subtree (popping ancestors whose slot is saturated) or
 * snaps `index` down to the node's owner prefix, then rescans.
 * NOTE(review): heavily elided fragment — the main loop head, stack
 * pushes/pops, restart gotos, and several returns are missing
 * (extraction damage); code retained verbatim.
 */
478 * Look up the nearest entry at a position bigger than or equal to index,
479 * assuming access is externally synchronized by a lock.
482 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
484 struct pctrie_node *stack[PCTRIE_LIMIT];
487 struct pctrie_node *child, *node;
494 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
/* A leaf root matches iff its key is >= index. */
497 else if (pctrie_isleaf(node)) {
498 m = pctrie_toval(node);
507 * If the keys differ before the current bisection node,
508 * then the search key might rollback to the earliest
509 * available bisection node or to the smallest key
510 * in the current node (if the owner is greater than the
513 if (pctrie_keybarr(node, index)) {
514 if (index > node->pn_owner) {
/* Debug guard against a cycle in the restart logic. */
516 KASSERT(++loops < 1000,
517 ("pctrie_lookup_ge: too many loops"));
520 * Pop nodes from the stack until either the
521 * stack is empty or a node that could have a
522 * matching descendant is found.
528 } while (pctrie_slot(index,
529 node->pn_clev) == (PCTRIE_COUNT - 1));
532 * The following computation cannot overflow
533 * because index's slot at the current level
534 * is less than PCTRIE_COUNT - 1.
536 index = pctrie_trimkey(index,
538 index += PCTRIE_UNITLEVEL(node->pn_clev);
/* Owner >= index: smallest key of this subtree is the candidate. */
540 index = node->pn_owner;
541 KASSERT(!pctrie_keybarr(node, index),
542 ("pctrie_lookup_ge: keybarr failed"));
544 slot = pctrie_slot(index, node->pn_clev);
545 child = pctrie_node_load(&node->pn_child[slot], NULL,
547 if (pctrie_isleaf(child)) {
548 m = pctrie_toval(child);
551 } else if (child != NULL)
555 * Look for an available edge or val within the current
/* Scan the remaining (greater) slots of this node. */
558 if (slot < (PCTRIE_COUNT - 1)) {
559 inc = PCTRIE_UNITLEVEL(node->pn_clev);
560 index = pctrie_trimkey(index, node->pn_clev);
564 child = pctrie_node_load(&node->pn_child[slot],
565 NULL, PCTRIE_LOCKED);
566 if (pctrie_isleaf(child)) {
567 m = pctrie_toval(child);
570 } else if (child != NULL)
572 } while (slot < (PCTRIE_COUNT - 1));
574 KASSERT(child == NULL || pctrie_isleaf(child),
575 ("pctrie_lookup_ge: child is radix node"));
578 * If a value or edge greater than the search slot is not found
579 * in the current node, ascend to the next higher-level node.
583 KASSERT(node->pn_clev > 0,
584 ("pctrie_lookup_ge: pushing leaf's parent"));
585 KASSERT(tos < PCTRIE_LIMIT,
586 ("pctrie_lookup_ge: stack overflow"));
/*
 * pctrie_lookup_le: find the largest stored key <= index (predecessor
 * search) — the mirror image of pctrie_lookup_ge: on a key barrier it
 * either snaps `index` up to just past the subtree (owner below index) or
 * retreats below the subtree, popping ancestors whose slot is 0; within a
 * node it scans slots downward.  Caller holds the tree lock.
 * NOTE(review): heavily elided fragment — loop heads, stack operations,
 * restart gotos, and several returns are missing (extraction damage);
 * code retained verbatim.
 */
593 * Look up the nearest entry at a position less than or equal to index,
594 * assuming access is externally synchronized by a lock.
597 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
599 struct pctrie_node *stack[PCTRIE_LIMIT];
602 struct pctrie_node *child, *node;
609 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
/* A leaf root matches iff its key is <= index. */
612 else if (pctrie_isleaf(node)) {
613 m = pctrie_toval(node);
622 * If the keys differ before the current bisection node,
623 * then the search key might rollback to the earliest
624 * available bisection node or to the largest key
625 * in the current node (if the owner is smaller than the
628 if (pctrie_keybarr(node, index)) {
/* Owner below index: clamp to one past this subtree's key range. */
629 if (index > node->pn_owner) {
630 index = node->pn_owner + PCTRIE_COUNT *
631 PCTRIE_UNITLEVEL(node->pn_clev);
633 KASSERT(++loops < 1000,
635 ("pctrie_lookup_le: too many loops"));
638 * Pop nodes from the stack until either the
639 * stack is empty or a node that could have a
640 * matching descendant is found.
646 } while (pctrie_slot(index,
647 node->pn_clev) == 0);
650 * The following computation cannot overflow
651 * because index's slot at the current level
654 index = pctrie_trimkey(index,
658 KASSERT(!pctrie_keybarr(node, index),
659 ("pctrie_lookup_le: keybarr failed"));
661 slot = pctrie_slot(index, node->pn_clev);
662 child = pctrie_node_load(&node->pn_child[slot], NULL,
664 if (pctrie_isleaf(child)) {
665 m = pctrie_toval(child);
668 } else if (child != NULL)
672 * Look for an available edge or value within the current
/* Scan the remaining (smaller) slots of this node, downward. */
676 inc = PCTRIE_UNITLEVEL(node->pn_clev);
681 child = pctrie_node_load(&node->pn_child[slot],
682 NULL, PCTRIE_LOCKED);
683 if (pctrie_isleaf(child)) {
684 m = pctrie_toval(child);
687 } else if (child != NULL)
691 KASSERT(child == NULL || pctrie_isleaf(child),
692 ("pctrie_lookup_le: child is radix node"));
695 * If a value or edge smaller than the search slot is not found
696 * in the current node, ascend to the next higher-level node.
700 KASSERT(node->pn_clev > 0,
701 ("pctrie_lookup_le: pushing leaf's parent"));
702 KASSERT(tos < PCTRIE_LIMIT,
703 ("pctrie_lookup_le: stack overflow"));
/*
 * pctrie_remove: delete the leaf for `index`, panicking if absent.
 * Special-cases a leaf root; otherwise descends tracking `parent`, clears
 * the leaf's slot, and if the node drops to a single child, collapses it:
 * the surviving child replaces the node in its parent (or as root), and
 * the node is freed with the surviving slot index passed to
 * pctrie_node_put so SMR readers can still traverse it until they drain.
 * NOTE(review): heavily elided fragment — the descent loop head, key
 * comparisons, pn_count decrements, and returns are missing (extraction
 * damage); code retained verbatim.
 */
710 * Remove the specified index from the tree.
711 * Panics if the key is not present.
714 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
716 struct pctrie_node *node, *parent, *tmp;
720 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
/* Leaf root: verify the key and empty the tree. */
721 if (pctrie_isleaf(node)) {
722 m = pctrie_toval(node);
724 panic("%s: invalid key found", __func__);
725 pctrie_root_store(ptree, NULL, PCTRIE_LOCKED);
731 panic("pctrie_remove: impossible to locate the key");
732 slot = pctrie_slot(index, node->pn_clev);
733 tmp = pctrie_node_load(&node->pn_child[slot], NULL,
735 if (pctrie_isleaf(tmp)) {
736 m = pctrie_toval(tmp);
738 panic("%s: invalid key found", __func__);
739 pctrie_node_store(&node->pn_child[slot], NULL,
/* More than one child left: node survives, removal is done. */
742 if (node->pn_count > 1)
/* Exactly one child left: find it so the node can be collapsed. */
744 for (i = 0; i < PCTRIE_COUNT; i++) {
745 tmp = pctrie_node_load(&node->pn_child[i],
746 NULL, PCTRIE_LOCKED);
750 KASSERT(i != PCTRIE_COUNT,
751 ("%s: invalid node configuration", __func__));
753 pctrie_root_store(ptree, tmp, PCTRIE_LOCKED);
755 slot = pctrie_slot(index, parent->pn_clev);
756 KASSERT(pctrie_node_load(
757 &parent->pn_child[slot], NULL,
758 PCTRIE_LOCKED) == node,
759 ("%s: invalid child value", __func__));
760 pctrie_node_store(&parent->pn_child[slot], tmp,
764 * The child is still valid and we can not zero the
765 * pointer until all SMR references are gone.
/* `i` is the surviving slot: defer its clear to the next allocation. */
768 pctrie_node_put(ptree, node, freefn, i);
/*
 * pctrie_reclaim_allnodes: tear down the whole tree.  The root is
 * detached first (unserialized store: no readers may remain), then the
 * recursive helper frees every interior node; a leaf root needs no
 * recursion since leaves are caller-owned records.
 * NOTE(review): the early return for an empty tree and function braces
 * are elided in this fragment — extraction damage.
 */
777 * Remove and free all the nodes from the tree.
778 * This function is recursive but there is a tight control on it as the
779 * maximum depth of the tree is fixed.
782 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
784 struct pctrie_node *root;
786 root = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
789 pctrie_root_store(ptree, NULL, PCTRIE_UNSERIALIZED);
790 if (!pctrie_isleaf(root))
791 pctrie_reclaim_allnodes_int(ptree, root, freefn);
796 * Show details about the given node.
798 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
800 struct pctrie_node *node, *tmp;
805 node = (struct pctrie_node *)addr;
806 db_printf("node %p, owner %jx, children count %u, level %u:\n",
807 (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
809 for (i = 0; i < PCTRIE_COUNT; i++) {
810 tmp = pctrie_node_load(&node->pn_child[i], NULL,
811 PCTRIE_UNSERIALIZED);
813 db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
815 pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,