/*
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Path-compressed radix trie implementation.
 *
 * The following code is not generalized into a general-purpose library
 * because there are way too many parameters embedded that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 * - The size of the nodes should be as small as possible but still big
 *   enough to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the nodes
 *   and the necessity to avoid too much cache pollution during the trie
 *   operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define	VM_RADIX_WIDTH	4
#else
#define	VM_RADIX_WIDTH	3
#endif

#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
#define	VM_RADIX_LIMIT							\
	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
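
/*
 * With a 64-bit vm_pindex_t and VM_RADIX_WIDTH == 4, for example, each node
 * has VM_RADIX_COUNT == 16 children, VM_RADIX_MASK == 0xf, and the deepest
 * node sits at level VM_RADIX_LIMIT == 64 / 4 - 1 == 15.
 */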

#if VM_RADIX_WIDTH == 3
typedef uint8_t rn_popmap_t;
#elif VM_RADIX_WIDTH == 4
typedef uint16_t rn_popmap_t;
#elif VM_RADIX_WIDTH == 5
typedef uint32_t rn_popmap_t;
#else
#error Unsupported width
#endif
_Static_assert(sizeof(rn_popmap_t) <= sizeof(int),
    "rn_popmap_t too wide");

/* Set of all flag bits stored in node pointers. */
#define	VM_RADIX_FLAGS	(VM_RADIX_ISLEAF)
#define	VM_RADIX_PAD	VM_RADIX_FLAGS

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
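
/*
 * The access mode names the synchronization discipline under which a child
 * pointer is read or written: SMR for lock-free readers inside an
 * smr_enter() section, LOCKED for writers holding the external lock, and
 * UNSERIALIZED for nodes that no concurrent reader can reach.
 */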

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
	vm_pindex_t	rn_owner;			/* Owner of record. */
	rn_popmap_t	rn_popmap;			/* Valid children. */
	uint8_t		rn_clev;			/* Level * WIDTH. */
	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Map index to an array position for the children of rnode.
 */
static __inline int
vm_radix_slot(struct vm_radix_node *rnode, vm_pindex_t index)
{
	return ((index >> rnode->rn_clev) & VM_RADIX_MASK);
}
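
/*
 * For example, with VM_RADIX_WIDTH == 4 a node with rn_clev == 4 indexes its
 * children by bits 4..7 of the pindex, so index 0x1234 maps to slot
 * (0x1234 >> 4) & 0xf == 3.
 */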

/*
 * Returns true if index does not belong to the specified rnode.  Otherwise,
 * sets the slot value and returns false.
 */
static __inline bool
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t index, int *slot)
{
	index = (index - rnode->rn_owner) >> rnode->rn_clev;
	if (index >= VM_RADIX_COUNT)
		return (true);
	*slot = index;
	return (false);
}
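
/*
 * Because rn_owner has its low rn_clev + VM_RADIX_WIDTH bits clear, the
 * single unsigned comparison above is equivalent to checking that index lies
 * in [rn_owner, rn_owner + (VM_RADIX_COUNT << rn_clev)), the range covered
 * by the node.
 */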

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t index, vm_pindex_t newind)
{
	struct vm_radix_node *rnode;

	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
	if (rnode == NULL)
		return (NULL);

	/*
	 * We want to clear the last child pointer after the final section
	 * has exited so lookup can not return false negatives.  It is done
	 * here because it will be cache-cold in the dtor callback.
	 */
	if (rnode->rn_popmap != 0) {
		vm_radix_node_store(&rnode->rn_child[ffs(rnode->rn_popmap) - 1],
		    VM_RADIX_NULL, UNSERIALIZED);
		rnode->rn_popmap = 0;
	}

	/*
	 * From the highest-order bit where the indexes differ,
	 * compute the highest level in the trie at which they differ.  Then,
	 * compute the least index of this subtrie.
	 */
	KASSERT(index != newind, ("%s: passing the same key value %jx",
	    __func__, (uintmax_t)index));
	_Static_assert(sizeof(long long) >= sizeof(vm_pindex_t),
	    "vm_pindex_t too wide");
	_Static_assert(sizeof(vm_pindex_t) * NBBY <=
	    (1 << (sizeof(rnode->rn_clev) * NBBY)), "rn_clev too narrow");
	rnode->rn_clev = rounddown(flsll(index ^ newind) - 1, VM_RADIX_WIDTH);
	rnode->rn_owner = VM_RADIX_COUNT;
	rnode->rn_owner = index & -(rnode->rn_owner << rnode->rn_clev);
	return (rnode);
}
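
/*
 * A worked example of the computation above, with VM_RADIX_WIDTH == 4: for
 * index 0x1234 and newind 0x1834, index ^ newind == 0x0a00 and flsll()
 * returns 12, so rn_clev = rounddown(11, 4) == 8.  rn_owner then clears the
 * low rn_clev + VM_RADIX_WIDTH == 12 bits of index, giving 0x1000, the least
 * index of the new subtrie.
 */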

/*
 * Free radix node.
 */
static void
vm_radix_node_put(struct vm_radix_node *rnode)
{
#ifdef INVARIANTS
	int slot;

	KASSERT(powerof2(rnode->rn_popmap),
	    ("vm_radix_node_put: rnode %p has too many children %04x", rnode,
	    rnode->rn_popmap));
	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
		if ((rnode->rn_popmap & (1 << slot)) != 0)
			continue;
		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
		    VM_RADIX_NULL,
		    ("vm_radix_node_put: rnode %p has a child", rnode));
	}
#endif
	uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{
	switch (access) {
	case UNSERIALIZED:
		return (smr_unserialized_load(p, true));
	case LOCKED:
		return (smr_serialized_load(p, true));
	case SMR:
		return (smr_entered_load(p, vm_radix_smr));
	}
	__assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{
	switch (access) {
	case UNSERIALIZED:
		smr_unserialized_store(p, v, true);
		break;
	case LOCKED:
		smr_serialized_store(p, v, true);
		break;
	case SMR:
		panic("vm_radix_node_store: Not supported in smr section.");
	}
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{
	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{
	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline bool
vm_radix_isleaf(struct vm_radix_node *rnode)
{
	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns page cast to radix node with leaf bit set.
 */
static __inline struct vm_radix_node *
vm_radix_toleaf(vm_page_t page)
{
	return ((struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF));
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{
	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}
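
/*
 * Child pointers thus encode two kinds of values in one word: an interior
 * node pointer has the VM_RADIX_ISLEAF bit clear, while a leaf is a
 * vm_page_t with that bit set.  This works because both structures are
 * sufficiently aligned that the low bit of a valid pointer is always zero;
 * the zone enforces VM_RADIX_PAD alignment for nodes.
 */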

/*
 * Make 'child' a child of 'rnode'.
 */
static __inline void
vm_radix_addnode(struct vm_radix_node *rnode, vm_pindex_t index,
    struct vm_radix_node *child, enum vm_radix_access access)
{
	int slot;

	slot = vm_radix_slot(rnode, index);
	vm_radix_node_store(&rnode->rn_child[slot], child, access);
	rnode->rn_popmap ^= 1 << slot;
	KASSERT((rnode->rn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
}

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
	struct vm_radix_node *child;
	int slot;

	while (rnode->rn_popmap != 0) {
		slot = ffs(rnode->rn_popmap) - 1;
		child = vm_radix_node_load(&rnode->rn_child[slot],
		    UNSERIALIZED);
		KASSERT(child != VM_RADIX_NULL,
		    ("%s: bad popmap slot %d in rnode %p",
		    __func__, slot, rnode));
		if (!vm_radix_isleaf(child))
			vm_radix_reclaim_allnodes_int(child);
		rnode->rn_popmap ^= 1 << slot;
		vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL,
		    UNSERIALIZED);
	}
	vm_radix_node_put(rnode);
}

/*
 * radix node zone initializer.
 */
static int
vm_radix_zone_init(void *mem, int size, int flags)
{
	struct vm_radix_node *rnode;

	rnode = mem;
	rnode->rn_popmap = 0;
	for (int i = 0; i < nitems(rnode->rn_child); i++)
		vm_radix_node_store(&rnode->rn_child[i], VM_RADIX_NULL,
		    UNSERIALIZED);
	return (0);
}
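
/*
 * The initializer above establishes the invariant that a node sitting free
 * in the zone has an empty popmap and all child pointers set to
 * VM_RADIX_NULL.  A node may be freed with its last child still set (see
 * vm_radix_node_put()); vm_radix_node_get() clears that child on reuse, so
 * a recycled node never needs to be rewritten in full.
 */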

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures not supporting direct
 * mapping as they will otherwise need to carve into the kernel maps for
 * every node allocation, resulting in deadlocks for consumers already
 * working with kernel maps.
 */
void
vm_radix_reserve_kva(void)
{
	/*
	 * Calculate the number of reserved nodes, discounting the pages that
	 * are needed to store them.
	 */
	if (!uma_zone_reserve_kva(vm_radix_node_zone,
	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
	    sizeof(struct vm_radix_node))))
		panic("%s: unable to reserve KVA", __func__);
}
#endif
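
/*
 * The reservation above solves n = v_page_count - n * sizeof(struct
 * vm_radix_node) / PAGE_SIZE for n: one node per physical page, minus the
 * pages that the reserved nodes themselves occupy.
 */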

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{
	vm_radix_node_zone = uma_zcreate("RADIX NODE",
	    sizeof(struct vm_radix_node), NULL, NULL, vm_radix_zone_init, NULL,
	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR);
	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
	vm_pindex_t index, newind;
	struct vm_radix_node *leaf, *parent, *rnode;
	smrnode_t *parentp;
	int slot;

	index = page->pindex;
	leaf = vm_radix_toleaf(page);

	/*
	 * The owner of record for root is not really important because it
	 * will never be used.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	parent = NULL;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			if (rnode == VM_RADIX_NULL) {
				if (parent == NULL)
					rtree->rt_root = leaf;
				else
					vm_radix_addnode(parent, index, leaf,
					    LOCKED);
				return (0);
			}
			newind = vm_radix_topage(rnode)->pindex;
			if (newind == index)
				panic("%s: key %jx is already present",
				    __func__, (uintmax_t)index);
			break;
		}
		if (vm_radix_keybarr(rnode, index, &slot)) {
			newind = rnode->rn_owner;
			break;
		}
		parent = rnode;
		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	}

	/*
	 * A new node is needed because the right insertion level is reached.
	 * Set up the new intermediate node and add the two children: the
	 * new object and the older edge or object.
	 */
	parentp = (parent != NULL) ? &parent->rn_child[slot] :
	    (smrnode_t *)&rtree->rt_root;
	parent = vm_radix_node_get(index, newind);
	if (parent == NULL)
		return (ENOMEM);
	/* These writes are not yet visible due to ordering. */
	vm_radix_addnode(parent, index, leaf, UNSERIALIZED);
	vm_radix_addnode(parent, newind, rnode, UNSERIALIZED);
	/* Serializing write to make the above visible. */
	vm_radix_node_store(parentp, parent, LOCKED);
	return (0);
}

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
	struct vm_radix_node *rnode;
	vm_page_t m;
	int slot;

	rnode = vm_radix_root_load(rtree, access);
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			if ((m = vm_radix_topage(rnode)) != NULL &&
			    m->pindex == index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index, &slot))
			break;
		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
	}
	return (NULL);
}
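
/*
 * Because _vm_radix_lookup() is __always_inline and each caller passes a
 * compile-time constant access mode, the compiler can specialize the loop
 * per caller and fold away the switch in vm_radix_node_load().
 */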

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{
	return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
	vm_page_t m;

	smr_enter(vm_radix_smr);
	m = _vm_radix_lookup(rtree, index, SMR);
	smr_exit(vm_radix_smr);

	return (m);
}
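
/*
 * The SMR section only guarantees that node memory read during the lookup is
 * not recycled while the section is entered; the returned page may still be
 * removed from the trie concurrently, so callers must revalidate it under
 * proper synchronization before use.
 */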

/*
 * Returns the page with the least pindex that is greater than or equal to the
 * specified pindex, or NULL if there are no such pages.
 *
 * Requires that access be externally synchronized by a lock.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *rnode, *succ;
	vm_page_t m;
	int slot;

	/*
	 * Descend the trie as if performing an ordinary lookup for the page
	 * with the specified pindex.  However, unlike an ordinary lookup, as we
	 * descend the trie, we use "succ" to remember the last branching-off
	 * point, that is, the interior node under which the page with the least
	 * pindex that is both outside our current path down the trie and
	 * greater than the specified pindex resides.  (The node's popmap makes
	 * it fast and easy to recognize a branching-off point.)  If our ordinary
	 * lookup fails to yield a page with a pindex that is greater than or
	 * equal to the specified pindex, then we will exit this loop and
	 * perform a lookup starting from "succ".  If "succ" is not NULL, then
	 * that lookup is guaranteed to succeed.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	succ = NULL;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			if ((m = vm_radix_topage(rnode)) != NULL &&
			    m->pindex >= index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index, &slot)) {
			/*
			 * If all pages in this subtree have pindex > index,
			 * then the page in this subtree with the least pindex
			 * is the answer.
			 */
			if (rnode->rn_owner > index)
				succ = rnode;
			break;
		}

		/*
		 * Just in case the next search step leads to a subtree of all
		 * pages with pindex < index, check popmap to see if a next
		 * bigger step, to a subtree of all pages with pindex > index,
		 * is available.  If so, remember to restart the search here.
		 */
		if ((rnode->rn_popmap >> slot) > 1)
			succ = rnode;
		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	}
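
	/*
	 * Note that (rn_popmap >> slot) > 1 above holds exactly when some
	 * slot greater than the search-path slot is populated; every page
	 * under such a sibling subtree has a pindex greater than the
	 * specified index.
	 */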

	/*
	 * Restart the search from the last place visited in the subtree that
	 * included some pages with pindex > index, if there was such a place.
	 */
	if (succ == NULL)
		return (NULL);
	if (succ != rnode) {
		/*
		 * Take a step to the next bigger sibling of the node chosen
		 * last time.  In that subtree, all pages have pindex > index.
		 */
		slot = vm_radix_slot(succ, index) + 1;
		KASSERT((succ->rn_popmap >> slot) != 0,
		    ("%s: no popmap siblings past slot %d in node %p",
		    __func__, slot, succ));
		slot += ffs(succ->rn_popmap >> slot) - 1;
		succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED);
	}

	/*
	 * Find the page in the subtree rooted at "succ" with the least pindex.
	 */
	while (!vm_radix_isleaf(succ)) {
		KASSERT(succ->rn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, succ));
		slot = ffs(succ->rn_popmap) - 1;
		succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED);
	}
	return (vm_radix_topage(succ));
}

/*
 * Returns the page with the greatest pindex that is less than or equal to the
 * specified pindex, or NULL if there are no such pages.
 *
 * Requires that access be externally synchronized by a lock.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *pred, *rnode;
	vm_page_t m;
	int slot;

	/*
	 * Mirror the implementation of vm_radix_lookup_ge, described above.
	 */
	rnode = vm_radix_root_load(rtree, LOCKED);
	pred = NULL;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			if ((m = vm_radix_topage(rnode)) != NULL &&
			    m->pindex <= index)
				return (m);
			break;
		}
		if (vm_radix_keybarr(rnode, index, &slot)) {
			if (rnode->rn_owner < index)
				pred = rnode;
			break;
		}
		if ((rnode->rn_popmap & ((1 << slot) - 1)) != 0)
			pred = rnode;
		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	}
	if (pred == NULL)
		return (NULL);
	if (pred != rnode) {
		slot = vm_radix_slot(pred, index);
		KASSERT((pred->rn_popmap & ((1 << slot) - 1)) != 0,
		    ("%s: no popmap siblings before slot %d in node %p",
		    __func__, slot, pred));
		slot = fls(pred->rn_popmap & ((1 << slot) - 1)) - 1;
		pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED);
	}
	while (!vm_radix_isleaf(pred)) {
		KASSERT(pred->rn_popmap != 0,
		    ("%s: no popmap children in node %p", __func__, pred));
		slot = fls(pred->rn_popmap) - 1;
		pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED);
	}
	return (vm_radix_topage(pred));
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
	struct vm_radix_node *child, *parent, *rnode;
	vm_page_t m;
	int slot;

	rnode = NULL;
	child = vm_radix_root_load(rtree, LOCKED);
	for (;;) {
		if (vm_radix_isleaf(child))
			break;
		parent = rnode;
		rnode = child;
		slot = vm_radix_slot(rnode, index);
		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	}
	if ((m = vm_radix_topage(child)) == NULL || m->pindex != index)
		return (NULL);
	if (rnode == NULL) {
		vm_radix_root_store(rtree, VM_RADIX_NULL, LOCKED);
		return (m);
	}
	KASSERT((rnode->rn_popmap & (1 << slot)) != 0,
	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
	rnode->rn_popmap ^= 1 << slot;
	vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL, LOCKED);
	if (!powerof2(rnode->rn_popmap))
		return (m);
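
	/*
	 * Exactly one child remains; replace the node with that child to
	 * restore path compression, then free the node.
	 */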
	KASSERT(rnode->rn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
	slot = ffs(rnode->rn_popmap) - 1;
	child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	KASSERT(child != VM_RADIX_NULL,
	    ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode));
	if (parent == NULL)
		vm_radix_root_store(rtree, child, LOCKED);
	else {
		slot = vm_radix_slot(parent, index);
		KASSERT(rnode ==
		    vm_radix_node_load(&parent->rn_child[slot], LOCKED),
		    ("%s: invalid child value", __func__));
		vm_radix_node_store(&parent->rn_child[slot], child, LOCKED);
	}
	/*
	 * The child is still valid and we can not zero the
	 * pointer until all smr references are gone.
	 */
	vm_radix_node_put(rnode);
	return (m);
}

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive but there is a tight control on it as the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
	struct vm_radix_node *root;

	root = vm_radix_root_load(rtree, LOCKED);
	if (root == VM_RADIX_NULL)
		return;
	vm_radix_root_store(rtree, VM_RADIX_NULL, UNSERIALIZED);
	if (!vm_radix_isleaf(root))
		vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
	struct vm_radix_node *leaf, *parent, *rnode;
	vm_page_t m;
	vm_pindex_t index;
	int slot;

	leaf = vm_radix_toleaf(newpage);
	index = newpage->pindex;
	rnode = vm_radix_root_load(rtree, LOCKED);
	parent = NULL;
	for (;;) {
		if (vm_radix_isleaf(rnode)) {
			if ((m = vm_radix_topage(rnode)) != NULL &&
			    m->pindex == index) {
				if (parent == NULL)
					rtree->rt_root = leaf;
				else
					vm_radix_node_store(
					    &parent->rn_child[slot], leaf,
					    LOCKED);
				return (m);
			}
			break;
		}
		if (vm_radix_keybarr(rnode, index, &slot))
			break;
		parent = rnode;
		rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
	}
	panic("%s: original replacing page not found", __func__);
}

/*
 * Wait for a radix node to become available for allocation.
 */
void
vm_radix_wait(void)
{
	uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
	struct vm_radix_node *rnode, *tmp;
	int slot;
	rn_popmap_t popmap;

	if (!have_addr)
		return;
	rnode = (struct vm_radix_node *)addr;
	db_printf("radixnode %p, owner %jx, children popmap %04x, level %u:\n",
	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_popmap,
	    rnode->rn_clev / VM_RADIX_WIDTH);
	for (popmap = rnode->rn_popmap; popmap != 0; popmap ^= 1 << slot) {
		slot = ffs(popmap) - 1;
		tmp = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
		db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
		    slot, (void *)tmp,
		    vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
		    rnode->rn_clev / VM_RADIX_WIDTH);
	}
}
#endif /* DDB */