2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Path-compressed radix trie implementation.
33 * The implementation takes into account the following rationale:
34 * - Size of the nodes should be as small as possible but still big enough
35 * to avoid a large maximum depth for the trie. This is a balance
36 * between the necessity to not wire too much physical memory for the nodes
37 * and the necessity to avoid too much cache pollution during the trie
39 * - There is not a huge bias toward the number of lookup operations over
40 * the number of insert and remove operations. This basically implies
41 * that optimizations supposedly helping one operation but hurting the
42 * other might be carefully evaluated.
43 * - On average not many nodes are expected to be fully populated, hence
44 * level compression may just complicate things.
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
61 #define PCTRIE_MASK (PCTRIE_COUNT - 1)
62 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
64 /* Flag bits stored in node pointers. */
65 #define PCTRIE_ISLEAF 0x1
66 #define PCTRIE_FLAGS 0x1
67 #define PCTRIE_PAD PCTRIE_FLAGS
69 /* Returns one unit associated with specified level. */
70 #define PCTRIE_UNITLEVEL(lev) \
71 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
74 uint64_t pn_owner; /* Owner of record. */
75 uint16_t pn_count; /* Valid children. */
76 uint16_t pn_clev; /* Current level. */
77 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */
81 * Allocate a node. Pre-allocation should ensure that the request
82 * will always be satisfied.
84 static __inline struct pctrie_node *
85 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
86 uint16_t count, uint16_t clevel)
88 struct pctrie_node *node;
90 node = allocfn(ptree);
93 node->pn_owner = owner;
94 node->pn_count = count;
95 node->pn_clev = clevel;
104 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
105 pctrie_free_t freefn)
110 KASSERT(node->pn_count == 0,
111 ("pctrie_node_put: node %p has %d children", node,
113 for (slot = 0; slot < PCTRIE_COUNT; slot++)
114 KASSERT(node->pn_child[slot] == NULL,
115 ("pctrie_node_put: node %p has a child", node));
121 * Return the position in the array for a given level.
124 pctrie_slot(uint64_t index, uint16_t level)
127 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
130 /* Trims the key after the specified level. */
131 static __inline uint64_t
132 pctrie_trimkey(uint64_t index, uint16_t level)
138 ret >>= level * PCTRIE_WIDTH;
139 ret <<= level * PCTRIE_WIDTH;
145 * Get the root node for a tree.
147 static __inline struct pctrie_node *
148 pctrie_getroot(struct pctrie *ptree)
151 return ((struct pctrie_node *)ptree->pt_root);
155 * Set the root node for a tree.
158 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
161 ptree->pt_root = (uintptr_t)node;
165 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
167 static __inline boolean_t
168 pctrie_isleaf(struct pctrie_node *node)
171 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
175 * Returns the associated val extracted from node.
177 static __inline uint64_t *
178 pctrie_toval(struct pctrie_node *node)
181 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
185 * Adds the val as a child of the provided node.
188 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
193 slot = pctrie_slot(index, clev);
194 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
198 * Returns the slot where two keys differ.
199 * It cannot accept 2 equal keys.
201 static __inline uint16_t
202 pctrie_keydiff(uint64_t index1, uint64_t index2)
206 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
207 __func__, (uintmax_t)index1));
210 for (clev = PCTRIE_LIMIT;; clev--)
211 if (pctrie_slot(index1, clev) != 0)
216 * Returns TRUE if it can be determined that key does not belong to the
217 * specified node. Otherwise, returns FALSE.
219 static __inline boolean_t
220 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
223 if (node->pn_clev < PCTRIE_LIMIT) {
224 idx = pctrie_trimkey(idx, node->pn_clev + 1);
225 return (idx != node->pn_owner);
231 * Internal helper for pctrie_reclaim_allnodes().
232 * This function is recursive.
235 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
236 pctrie_free_t freefn)
240 KASSERT(node->pn_count <= PCTRIE_COUNT,
241 ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
242 for (slot = 0; node->pn_count != 0; slot++) {
243 if (node->pn_child[slot] == NULL)
245 if (!pctrie_isleaf(node->pn_child[slot]))
246 pctrie_reclaim_allnodes_int(ptree,
247 node->pn_child[slot], freefn);
248 node->pn_child[slot] = NULL;
251 pctrie_node_put(ptree, node, freefn);
255 * pctrie node zone initializer.
258 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
260 struct pctrie_node *node;
263 memset(node->pn_child, 0, sizeof(node->pn_child));
268 pctrie_node_size(void)
271 return (sizeof(struct pctrie_node));
275 * Inserts the key-value pair into the trie.
276 * Panics if the key already exists.
279 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
281 uint64_t index, newind;
283 struct pctrie_node *node, *tmp;
291 * The owner of record for root is not really important because it
292 * will never be used.
294 node = pctrie_getroot(ptree);
296 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
299 parentp = (void **)&ptree->pt_root;
301 if (pctrie_isleaf(node)) {
302 m = pctrie_toval(node);
304 panic("%s: key %jx is already present",
305 __func__, (uintmax_t)index);
306 clev = pctrie_keydiff(*m, index);
307 tmp = pctrie_node_get(ptree, allocfn,
308 pctrie_trimkey(index, clev + 1), 2, clev);
312 pctrie_addval(tmp, index, clev, val);
313 pctrie_addval(tmp, *m, clev, m);
315 } else if (pctrie_keybarr(node, index))
317 slot = pctrie_slot(index, node->pn_clev);
318 if (node->pn_child[slot] == NULL) {
320 pctrie_addval(node, index, node->pn_clev, val);
323 parentp = &node->pn_child[slot];
324 node = node->pn_child[slot];
328 * A new node is needed because the right insertion level is reached.
329 * Setup the new intermediate node and add the 2 children: the
330 * new object and the older edge.
332 newind = node->pn_owner;
333 clev = pctrie_keydiff(newind, index);
334 tmp = pctrie_node_get(ptree, allocfn,
335 pctrie_trimkey(index, clev + 1), 2, clev);
339 pctrie_addval(tmp, index, clev, val);
340 slot = pctrie_slot(newind, clev);
341 tmp->pn_child[slot] = node;
347 * Returns the value stored at the index. If the index is not present,
351 pctrie_lookup(struct pctrie *ptree, uint64_t index)
353 struct pctrie_node *node;
357 node = pctrie_getroot(ptree);
358 while (node != NULL) {
359 if (pctrie_isleaf(node)) {
360 m = pctrie_toval(node);
365 } else if (pctrie_keybarr(node, index))
367 slot = pctrie_slot(index, node->pn_clev);
368 node = node->pn_child[slot];
374 * Look up the nearest entry at a position bigger than or equal to index.
377 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
379 struct pctrie_node *stack[PCTRIE_LIMIT];
382 struct pctrie_node *child, *node;
388 node = pctrie_getroot(ptree);
391 else if (pctrie_isleaf(node)) {
392 m = pctrie_toval(node);
401 * If the keys differ before the current bisection node,
402 * then the search key might rollback to the earliest
403 * available bisection node or to the smallest key
404 * in the current node (if the owner is bigger than the
407 if (pctrie_keybarr(node, index)) {
408 if (index > node->pn_owner) {
410 KASSERT(++loops < 1000,
411 ("pctrie_lookup_ge: too many loops"));
414 * Pop nodes from the stack until either the
415 * stack is empty or a node that could have a
416 * matching descendant is found.
422 } while (pctrie_slot(index,
423 node->pn_clev) == (PCTRIE_COUNT - 1));
426 * The following computation cannot overflow
427 * because index's slot at the current level
428 * is less than PCTRIE_COUNT - 1.
430 index = pctrie_trimkey(index,
432 index += PCTRIE_UNITLEVEL(node->pn_clev);
434 index = node->pn_owner;
435 KASSERT(!pctrie_keybarr(node, index),
436 ("pctrie_lookup_ge: keybarr failed"));
438 slot = pctrie_slot(index, node->pn_clev);
439 child = node->pn_child[slot];
440 if (pctrie_isleaf(child)) {
441 m = pctrie_toval(child);
444 } else if (child != NULL)
448 * Look for an available edge or val within the current
451 if (slot < (PCTRIE_COUNT - 1)) {
452 inc = PCTRIE_UNITLEVEL(node->pn_clev);
453 index = pctrie_trimkey(index, node->pn_clev);
457 child = node->pn_child[slot];
458 if (pctrie_isleaf(child)) {
459 m = pctrie_toval(child);
462 } else if (child != NULL)
464 } while (slot < (PCTRIE_COUNT - 1));
466 KASSERT(child == NULL || pctrie_isleaf(child),
467 ("pctrie_lookup_ge: child is radix node"));
470 * If a value or edge bigger than the search slot is not found
471 * in the current node, ascend to the next higher-level node.
475 KASSERT(node->pn_clev > 0,
476 ("pctrie_lookup_ge: pushing leaf's parent"));
477 KASSERT(tos < PCTRIE_LIMIT,
478 ("pctrie_lookup_ge: stack overflow"));
485 * Look up the nearest entry at a position less than or equal to index.
488 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
490 struct pctrie_node *stack[PCTRIE_LIMIT];
493 struct pctrie_node *child, *node;
499 node = pctrie_getroot(ptree);
502 else if (pctrie_isleaf(node)) {
503 m = pctrie_toval(node);
512 * If the keys differ before the current bisection node,
513 * then the search key might rollback to the earliest
514 * available bisection node or to the largest key
515 * in the current node (if the owner is smaller than the
518 if (pctrie_keybarr(node, index)) {
519 if (index > node->pn_owner) {
520 index = node->pn_owner + PCTRIE_COUNT *
521 PCTRIE_UNITLEVEL(node->pn_clev);
524 KASSERT(++loops < 1000,
525 ("pctrie_lookup_le: too many loops"));
528 * Pop nodes from the stack until either the
529 * stack is empty or a node that could have a
530 * matching descendant is found.
536 } while (pctrie_slot(index,
537 node->pn_clev) == 0);
540 * The following computation cannot overflow
541 * because index's slot at the current level
544 index = pctrie_trimkey(index,
548 KASSERT(!pctrie_keybarr(node, index),
549 ("pctrie_lookup_le: keybarr failed"));
551 slot = pctrie_slot(index, node->pn_clev);
552 child = node->pn_child[slot];
553 if (pctrie_isleaf(child)) {
554 m = pctrie_toval(child);
557 } else if (child != NULL)
561 * Look for an available edge or value within the current
565 inc = PCTRIE_UNITLEVEL(node->pn_clev);
570 child = node->pn_child[slot];
571 if (pctrie_isleaf(child)) {
572 m = pctrie_toval(child);
575 } else if (child != NULL)
579 KASSERT(child == NULL || pctrie_isleaf(child),
580 ("pctrie_lookup_le: child is radix node"));
583 * If a value or edge smaller than the search slot is not found
584 * in the current node, ascend to the next higher-level node.
588 KASSERT(node->pn_clev > 0,
589 ("pctrie_lookup_le: pushing leaf's parent"));
590 KASSERT(tos < PCTRIE_LIMIT,
591 ("pctrie_lookup_le: stack overflow"));
598 * Remove the specified index from the tree.
599 * Panics if the key is not present.
602 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
604 struct pctrie_node *node, *parent;
608 node = pctrie_getroot(ptree);
609 if (pctrie_isleaf(node)) {
610 m = pctrie_toval(node);
612 panic("%s: invalid key found", __func__);
613 pctrie_setroot(ptree, NULL);
619 panic("pctrie_remove: impossible to locate the key");
620 slot = pctrie_slot(index, node->pn_clev);
621 if (pctrie_isleaf(node->pn_child[slot])) {
622 m = pctrie_toval(node->pn_child[slot]);
624 panic("%s: invalid key found", __func__);
625 node->pn_child[slot] = NULL;
627 if (node->pn_count > 1)
629 for (i = 0; i < PCTRIE_COUNT; i++)
630 if (node->pn_child[i] != NULL)
632 KASSERT(i != PCTRIE_COUNT,
633 ("%s: invalid node configuration", __func__));
635 pctrie_setroot(ptree, node->pn_child[i]);
637 slot = pctrie_slot(index, parent->pn_clev);
638 KASSERT(parent->pn_child[slot] == node,
639 ("%s: invalid child value", __func__));
640 parent->pn_child[slot] = node->pn_child[i];
643 node->pn_child[i] = NULL;
644 pctrie_node_put(ptree, node, freefn);
648 node = node->pn_child[slot];
653 * Remove and free all the nodes from the tree.
654 * This function is recursive but there is a tight control on it as the
655 * maximum depth of the tree is fixed.
658 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
660 struct pctrie_node *root;
662 root = pctrie_getroot(ptree);
665 pctrie_setroot(ptree, NULL);
666 if (!pctrie_isleaf(root))
667 pctrie_reclaim_allnodes_int(ptree, root, freefn);
672 * Show details about the given node.
674 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
676 struct pctrie_node *node;
681 node = (struct pctrie_node *)addr;
682 db_printf("node %p, owner %jx, children count %u, level %u:\n",
683 (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
685 for (i = 0; i < PCTRIE_COUNT; i++)
686 if (node->pn_child[i] != NULL)
687 db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
688 i, (void *)node->pn_child[i],
689 pctrie_isleaf(node->pn_child[i]) ?
690 pctrie_toval(node->pn_child[i]) : NULL,