2 * SPDX-License-Identifier: BSD-2-Clause
4 * Copyright (c) 2013 EMC Corp.
5 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * Path-compressed radix trie implementation.
35 * The implementation takes into account the following rationale:
36 * - Size of the nodes should be as small as possible but still big enough
37 * to avoid a large maximum depth for the trie. This is a balance
38 * between the necessity to not wire too much physical memory for the nodes
39 * and the necessity to avoid too much cache pollution during the trie
41 * - There is not a huge bias toward the number of lookup operations over
42 * the number of insert and remove operations. This basically implies
43 * that optimizations supposedly helping one operation but hurting the
44 * other must be carefully evaluated.
45 * - On average not many nodes are expected to be fully populated, hence
46 * level compression may just complicate things.
49 #include <sys/cdefs.h>
50 __FBSDID("$FreeBSD$");
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/kernel.h>
57 #include <sys/libkern.h>
58 #include <sys/pctrie.h>
59 #include <sys/proc.h> /* smr.h depends on struct thread. */
61 #include <sys/smr_types.h>
67 #define PCTRIE_MASK (PCTRIE_COUNT - 1)
68 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
71 typedef uint8_t pn_popmap_t;
72 #elif PCTRIE_WIDTH == 4
73 typedef uint16_t pn_popmap_t;
74 #elif PCTRIE_WIDTH == 5
75 typedef uint32_t pn_popmap_t;
77 #error Unsupported width
79 _Static_assert(sizeof(pn_popmap_t) <= sizeof(int),
80 "pn_popmap_t too wide");
82 /* Set of all flag bits stored in node pointers. */
83 #define PCTRIE_FLAGS (PCTRIE_ISLEAF)
84 #define PCTRIE_PAD PCTRIE_FLAGS
86 /* Returns one unit associated with specified level. */
87 #define PCTRIE_UNITLEVEL(lev) \
88 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
91 typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;
94 uint64_t pn_owner; /* Owner of record. */
95 pn_popmap_t pn_popmap; /* Valid children. */
96 uint8_t pn_clev; /* Current level. */
97 smr_pctnode_t pn_child[PCTRIE_COUNT]; /* Child nodes. */
100 enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
102 static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
103 enum pctrie_access access);
106 * Return the position in the array for a given level.
109 pctrie_slot(uint64_t index, uint16_t level)
111 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
114 /* Computes the key (index) with the low-order 'level' radix-digits zeroed. */
115 static __inline uint64_t
116 pctrie_trimkey(uint64_t index, uint16_t level)
118 return (index & -PCTRIE_UNITLEVEL(level));
122 * Allocate a node. Pre-allocation should ensure that the request
123 * will always be satisfied.
125 static struct pctrie_node *
126 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t index,
129 struct pctrie_node *node;
131 node = allocfn(ptree);
136 * We want to clear the last child pointer after the final section
137 * has exited so lookup can not return false negatives. It is done
138 * here because it will be cache-cold in the dtor callback.
140 if (node->pn_popmap != 0) {
141 pctrie_node_store(&node->pn_child[ffs(node->pn_popmap) - 1],
142 PCTRIE_NULL, PCTRIE_UNSERIALIZED);
145 node->pn_owner = pctrie_trimkey(index, clevel + 1);
146 node->pn_clev = clevel;
154 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
155 pctrie_free_t freefn)
160 KASSERT(powerof2(node->pn_popmap),
161 ("pctrie_node_put: node %p has too many children %04x", node,
163 for (slot = 0; slot < PCTRIE_COUNT; slot++) {
164 if ((node->pn_popmap & (1 << slot)) != 0)
166 KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
168 ("pctrie_node_put: node %p has a child", node));
175 * Fetch a node pointer from a slot.
177 static __inline struct pctrie_node *
178 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
181 case PCTRIE_UNSERIALIZED:
182 return (smr_unserialized_load(p, true));
184 return (smr_serialized_load(p, true));
186 return (smr_entered_load(p, smr));
188 __assert_unreachable();
192 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
195 case PCTRIE_UNSERIALIZED:
196 smr_unserialized_store(p, v, true);
199 smr_serialized_store(p, v, true);
202 panic("%s: Not supported in SMR section.", __func__);
205 __assert_unreachable();
211 * Get the root node for a tree.
213 static __inline struct pctrie_node *
214 pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
216 return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
220 * Set the root node for a tree.
223 pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
224 enum pctrie_access access)
226 pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
230 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
233 pctrie_isleaf(struct pctrie_node *node)
236 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
240 * Returns val with leaf bit set.
242 static __inline void *
243 pctrie_toleaf(uint64_t *val)
245 return ((void *)((uintptr_t)val | PCTRIE_ISLEAF));
249 * Returns the associated val extracted from node.
251 static __inline uint64_t *
252 pctrie_toval(struct pctrie_node *node)
255 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
259 * Make 'child' a child of 'node'.
262 pctrie_addnode(struct pctrie_node *node, uint64_t index, uint16_t clev,
263 struct pctrie_node *child, enum pctrie_access access)
267 slot = pctrie_slot(index, clev);
268 pctrie_node_store(&node->pn_child[slot], child, access);
269 node->pn_popmap ^= 1 << slot;
270 KASSERT((node->pn_popmap & (1 << slot)) != 0,
271 ("%s: bad popmap slot %d in node %p", __func__, slot, node));
275 * Returns the level where two keys differ.
276 * It cannot accept 2 equal keys.
278 static __inline uint16_t
279 pctrie_keydiff(uint64_t index1, uint64_t index2)
282 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
283 __func__, (uintmax_t)index1));
284 CTASSERT(sizeof(long long) >= sizeof(uint64_t));
287 * From the highest-order bit where the indexes differ,
288 * compute the highest level in the trie where they differ.
290 return ((flsll(index1 ^ index2) - 1) / PCTRIE_WIDTH);
294 * Returns TRUE if it can be determined that key does not belong to the
295 * specified node. Otherwise, returns FALSE.
298 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
301 if (node->pn_clev < PCTRIE_LIMIT) {
302 idx = pctrie_trimkey(idx, node->pn_clev + 1);
303 return (idx != node->pn_owner);
309 * Internal helper for pctrie_reclaim_allnodes().
310 * This function is recursive.
313 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
314 pctrie_free_t freefn)
316 struct pctrie_node *child;
319 while (node->pn_popmap != 0) {
320 slot = ffs(node->pn_popmap) - 1;
321 child = pctrie_node_load(&node->pn_child[slot], NULL,
322 PCTRIE_UNSERIALIZED);
323 KASSERT(child != PCTRIE_NULL,
324 ("%s: bad popmap slot %d in node %p",
325 __func__, slot, node));
326 if (!pctrie_isleaf(child))
327 pctrie_reclaim_allnodes_int(ptree, child, freefn);
328 node->pn_popmap ^= 1 << slot;
329 pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL,
330 PCTRIE_UNSERIALIZED);
332 pctrie_node_put(ptree, node, freefn);
336 * pctrie node zone initializer.
339 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
341 struct pctrie_node *node;
345 for (int i = 0; i < nitems(node->pn_child); i++)
346 pctrie_node_store(&node->pn_child[i], PCTRIE_NULL,
347 PCTRIE_UNSERIALIZED);
352 pctrie_node_size(void)
355 return (sizeof(struct pctrie_node));
359 * Inserts the key-value pair into the trie.
360 * Panics if the key already exists.
363 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
365 uint64_t index, newind;
366 struct pctrie_node *leaf, *node, *parent;
367 smr_pctnode_t *parentp;
372 leaf = pctrie_toleaf(val);
375 * The owner of record for root is not really important because it
376 * will never be used.
378 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
381 if (pctrie_isleaf(node)) {
382 if (node == PCTRIE_NULL) {
384 ptree->pt_root = leaf;
386 pctrie_addnode(parent, index,
387 parent->pn_clev, leaf,
391 newind = *pctrie_toval(node);
393 panic("%s: key %jx is already present",
394 __func__, (uintmax_t)index);
397 if (pctrie_keybarr(node, index)) {
398 newind = node->pn_owner;
401 slot = pctrie_slot(index, node->pn_clev);
403 node = pctrie_node_load(&node->pn_child[slot], NULL,
408 * A new node is needed because the right insertion level is reached.
409 * Setup the new intermediate node and add the 2 children: the
410 * new object and the older edge or object.
412 parentp = (parent != NULL) ? &parent->pn_child[slot]:
413 (smr_pctnode_t *)&ptree->pt_root;
414 clev = pctrie_keydiff(newind, index);
415 parent = pctrie_node_get(ptree, allocfn, index, clev);
418 /* These writes are not yet visible due to ordering. */
419 pctrie_addnode(parent, index, clev, leaf, PCTRIE_UNSERIALIZED);
420 pctrie_addnode(parent, newind, clev, node, PCTRIE_UNSERIALIZED);
421 /* Synchronize to make the above visible. */
422 pctrie_node_store(parentp, parent, PCTRIE_LOCKED);
427 * Returns the value stored at the index. If the index is not present,
430 static __always_inline uint64_t *
431 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
432 enum pctrie_access access)
434 struct pctrie_node *node;
438 node = pctrie_root_load(ptree, smr, access);
440 if (pctrie_isleaf(node)) {
441 if ((m = pctrie_toval(node)) != NULL && *m == index)
445 if (pctrie_keybarr(node, index))
447 slot = pctrie_slot(index, node->pn_clev);
448 node = pctrie_node_load(&node->pn_child[slot], smr, access);
454 * Returns the value stored at the index, assuming access is externally
455 * synchronized by a lock.
457 * If the index is not present, NULL is returned.
460 pctrie_lookup(struct pctrie *ptree, uint64_t index)
462 return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
466 * Returns the value stored at the index without requiring an external lock.
468 * If the index is not present, NULL is returned.
471 pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
476 res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
482 * Returns the value with the least index that is greater than or equal to the
483 * specified index, or NULL if there are no such values.
485 * Requires that access be externally synchronized by a lock.
488 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
490 struct pctrie_node *node, *succ;
495 * Descend the trie as if performing an ordinary lookup for the
496 * specified value. However, unlike an ordinary lookup, as we descend
497 * the trie, we use "succ" to remember the last branching-off point,
498 * that is, the interior node under which the least value that is both
499 * outside our current path down the trie and greater than the specified
500 * index resides. (The node's popmap makes it fast and easy to
501 * recognize a branching-off point.) If our ordinary lookup fails to
502 * yield a value that is greater than or equal to the specified index,
503 * then we will exit this loop and perform a lookup starting from
504 * "succ". If "succ" is not NULL, then that lookup is guaranteed to
507 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
510 if (pctrie_isleaf(node)) {
511 if ((m = pctrie_toval(node)) != NULL && *m >= index)
515 if (pctrie_keybarr(node, index)) {
517 * If all values in this subtree are > index, then the
518 * least value in this subtree is the answer.
520 if (node->pn_owner > index)
524 slot = pctrie_slot(index, node->pn_clev);
527 * Just in case the next search step leads to a subtree of all
528 * values < index, check popmap to see if a next bigger step, to
529 * a subtree of all pages with values > index, is available. If
530 * so, remember to restart the search here.
532 if ((node->pn_popmap >> slot) > 1)
534 node = pctrie_node_load(&node->pn_child[slot], NULL,
539 * Restart the search from the last place visited in the subtree that
540 * included some values > index, if there was such a place.
546 * Take a step to the next bigger sibling of the node chosen
547 * last time. In that subtree, all values > index.
549 slot = pctrie_slot(index, succ->pn_clev) + 1;
550 KASSERT((succ->pn_popmap >> slot) != 0,
551 ("%s: no popmap siblings past slot %d in node %p",
552 __func__, slot, succ));
553 slot += ffs(succ->pn_popmap >> slot) - 1;
554 succ = pctrie_node_load(&succ->pn_child[slot], NULL,
559 * Find the value in the subtree rooted at "succ" with the least index.
561 while (!pctrie_isleaf(succ)) {
562 KASSERT(succ->pn_popmap != 0,
563 ("%s: no popmap children in node %p", __func__, succ));
564 slot = ffs(succ->pn_popmap) - 1;
565 succ = pctrie_node_load(&succ->pn_child[slot], NULL,
568 return (pctrie_toval(succ));
572 * Returns the value with the greatest index that is less than or equal to the
573 * specified index, or NULL if there are no such values.
575 * Requires that access be externally synchronized by a lock.
578 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
580 struct pctrie_node *node, *pred;
585 * Mirror the implementation of pctrie_lookup_ge, described above.
587 node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
590 if (pctrie_isleaf(node)) {
591 if ((m = pctrie_toval(node)) != NULL && *m <= index)
595 if (pctrie_keybarr(node, index)) {
596 if (node->pn_owner < index)
600 slot = pctrie_slot(index, node->pn_clev);
601 if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
603 node = pctrie_node_load(&node->pn_child[slot], NULL,
609 slot = pctrie_slot(index, pred->pn_clev);
610 KASSERT((pred->pn_popmap & ((1 << slot) - 1)) != 0,
611 ("%s: no popmap siblings before slot %d in node %p",
612 __func__, slot, pred));
613 slot = fls(pred->pn_popmap & ((1 << slot) - 1)) - 1;
614 pred = pctrie_node_load(&pred->pn_child[slot], NULL,
617 while (!pctrie_isleaf(pred)) {
618 KASSERT(pred->pn_popmap != 0,
619 ("%s: no popmap children in node %p", __func__, pred));
620 slot = fls(pred->pn_popmap) - 1;
621 pred = pctrie_node_load(&pred->pn_child[slot], NULL,
624 return (pctrie_toval(pred));
628 * Remove the specified index from the tree.
629 * Panics if the key is not present.
632 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
634 struct pctrie_node *child, *node, *parent;
639 child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
641 if (pctrie_isleaf(child))
645 slot = pctrie_slot(index, node->pn_clev);
646 child = pctrie_node_load(&node->pn_child[slot], NULL,
649 if ((m = pctrie_toval(child)) == NULL || *m != index)
650 panic("%s: key not found", __func__);
652 pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED);
655 KASSERT((node->pn_popmap & (1 << slot)) != 0,
656 ("%s: bad popmap slot %d in node %p",
657 __func__, slot, node));
658 node->pn_popmap ^= 1 << slot;
659 pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED);
660 if (!powerof2(node->pn_popmap))
662 KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
663 slot = ffs(node->pn_popmap) - 1;
664 child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED);
665 KASSERT(child != PCTRIE_NULL,
666 ("%s: bad popmap slot %d in node %p", __func__, slot, node));
668 pctrie_root_store(ptree, child, PCTRIE_LOCKED);
670 slot = pctrie_slot(index, parent->pn_clev);
672 pctrie_node_load(&parent->pn_child[slot], NULL,
673 PCTRIE_LOCKED), ("%s: invalid child value", __func__));
674 pctrie_node_store(&parent->pn_child[slot], child,
678 * The child is still valid and we can not zero the
679 * pointer until all SMR references are gone.
681 pctrie_node_put(ptree, node, freefn);
685 * Remove and free all the nodes from the tree.
686 * This function is recursive but there is a tight control on it as the
687 * maximum depth of the tree is fixed.
690 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
692 struct pctrie_node *root;
694 root = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
695 if (root == PCTRIE_NULL)
697 pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_UNSERIALIZED);
698 if (!pctrie_isleaf(root))
699 pctrie_reclaim_allnodes_int(ptree, root, freefn);
704 * Show details about the given node.
706 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
708 struct pctrie_node *node, *tmp;
714 node = (struct pctrie_node *)addr;
715 db_printf("node %p, owner %jx, children popmap %04x, level %u:\n",
716 (void *)node, (uintmax_t)node->pn_owner, node->pn_popmap,
718 for (popmap = node->pn_popmap; popmap != 0; popmap ^= 1 << slot) {
719 slot = ffs(popmap) - 1;
720 tmp = pctrie_node_load(&node->pn_child[slot], NULL,
721 PCTRIE_UNSERIALIZED);
722 db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
724 pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,