/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 *
 * The following code is not generalized into a general-purpose library
 * because there are too many embedded parameters that should really be
 * decided by the library consumers.  At the same time, consumers of this
 * code must achieve the highest possible performance.
 *
 * The implementation takes the following rationale into account:
 * - The nodes should be as small as possible but still big enough to
 *   avoid a large maximum depth for the trie.  This is a balance between
 *   the need not to wire too much physical memory for the nodes and the
 *   need to avoid too much cache pollution during trie operations.
 * - There is no strong bias toward lookup operations over insert and
 *   remove operations, which implies that optimizations helping one
 *   operation at the expense of another must be evaluated carefully.
 * - On average, nodes are not expected to be fully populated, so level
 *   compression would likely just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define VM_RADIX_WIDTH  4
#else
#define VM_RADIX_WIDTH  3
#endif

#define VM_RADIX_COUNT  (1 << VM_RADIX_WIDTH)
#define VM_RADIX_MASK   (VM_RADIX_COUNT - 1)
#define VM_RADIX_LIMIT                                                  \
        (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)

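/*
 * For example, on LP64 (VM_RADIX_WIDTH == 4) with a 64-bit vm_pindex_t,
 * VM_RADIX_COUNT == 16, VM_RADIX_MASK == 0xf and VM_RADIX_LIMIT == 15,
 * so a key is consumed four bits at a time over at most 16 levels
 * (numbered 0 through VM_RADIX_LIMIT).
 */
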
/* Flag bits stored in node pointers. */
#define VM_RADIX_ISLEAF 0x1
#define VM_RADIX_FLAGS  0x1
#define VM_RADIX_PAD    VM_RADIX_FLAGS

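/*
 * Leaves are stored directly in child slots (and in the root pointer) as
 * vm_page pointers tagged with VM_RADIX_ISLEAF in the low bit; internal
 * nodes have that bit clear.  This relies on both structures being aligned
 * to more than one byte, which is presumably why VM_RADIX_PAD is passed as
 * the alignment mask when the node zone is created below.
 */
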
/* Returns one unit associated with specified level. */
#define VM_RADIX_UNITLEVEL(lev)                                         \
        ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
        vm_pindex_t     rn_owner;                       /* Owner of record. */
        uint16_t        rn_count;                       /* Valid children. */
        uint8_t         rn_clev;                        /* Current level. */
        int8_t          rn_last;                        /* zero last ptr. */
        smrnode_t       rn_child[VM_RADIX_COUNT];       /* Child nodes. */
};

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
        struct vm_radix_node *rnode;

        rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
        if (rnode == NULL)
                return (NULL);

        /*
         * We want to clear the last child pointer after the final section
         * has exited so lookup can not return false negatives.  It is done
         * here because it will be cache-cold in the dtor callback.
         */
        if (rnode->rn_last != 0) {
                vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
                    NULL, UNSERIALIZED);
                rnode->rn_last = 0;
        }
        rnode->rn_owner = owner;
        rnode->rn_count = count;
        rnode->rn_clev = clevel;
        return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
        int slot;

        KASSERT(rnode->rn_count == 0,
            ("vm_radix_node_put: rnode %p has %d children", rnode,
            rnode->rn_count));
        for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
                if (slot == last)
                        continue;
                KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
                    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
        }
#endif
        /* Off by one so a freshly zero'd node is not assigned to. */
        rnode->rn_last = last + 1;
        uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{

        return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Trims the key after the specified level. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
        vm_pindex_t ret;

        ret = index;
        if (level > 0) {
                ret >>= level * VM_RADIX_WIDTH;
                ret <<= level * VM_RADIX_WIDTH;
        }
        return (ret);
}

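/*
 * Worked example (assuming VM_RADIX_WIDTH == 4): for index 0x1234,
 * vm_radix_slot() yields 0x4, 0x3, 0x2 and 0x1 at levels 0 through 3,
 * vm_radix_trimkey(0x1234, 2) == 0x1200, and VM_RADIX_UNITLEVEL(2) == 0x100,
 * the distance between consecutive level-2 slots.
 */
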
/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                return (smr_unserialized_load(p, true));
        case LOCKED:
                return (smr_serialized_load(p, true));
        case SMR:
                return (smr_entered_load(p, vm_radix_smr));
        }
        __assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                smr_unserialized_store(p, v, true);
                break;
        case LOCKED:
                smr_serialized_store(p, v, true);
                break;
        case SMR:
                panic("vm_radix_node_store: Not supported in smr section.");
        }
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

        return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

        vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline boolean_t
vm_radix_isleaf(struct vm_radix_node *rnode)
{

        return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

        return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
        int slot;

        slot = vm_radix_slot(index, clev);
        vm_radix_node_store(&rnode->rn_child[slot],
            (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
}

/*
 * Returns the highest level at which two keys differ.
 * It cannot accept two equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{
        uint16_t clev;

        KASSERT(index1 != index2, ("%s: passing the same key value %jx",
            __func__, (uintmax_t)index1));

        index1 ^= index2;
        for (clev = VM_RADIX_LIMIT;; clev--)
                if (vm_radix_slot(index1, clev) != 0)
                        return (clev);
}

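/*
 * For example (with VM_RADIX_WIDTH == 4), vm_radix_keydiff(0x1200, 0x1300)
 * returns 2, because the two keys first differ in their level-2 slot, while
 * vm_radix_keydiff(0x1200, 0x1201) returns 0.
 */
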
/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline boolean_t
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

        if (rnode->rn_clev < VM_RADIX_LIMIT) {
                idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
                return (idx != rnode->rn_owner);
        }
        return (FALSE);
}

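/*
 * For example, a node with rn_clev == 2 and rn_owner == 0x5000 (with
 * VM_RADIX_WIDTH == 4) spans the indices 0x5000-0x5fff; vm_radix_keybarr()
 * returns FALSE for any index in that range and TRUE for any index outside
 * of it.
 */
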
/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
        struct vm_radix_node *child;
        int slot;

        KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
            ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
        for (slot = 0; rnode->rn_count != 0; slot++) {
                child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
                if (child == NULL)
                        continue;
                if (!vm_radix_isleaf(child))
                        vm_radix_reclaim_allnodes_int(child);
                vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
                rnode->rn_count--;
        }
        vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy node allocations.
 * This is mandatory on architectures that do not support a direct map,
 * since they would otherwise have to carve into the kernel maps for every
 * node allocation, resulting in deadlocks for consumers that are already
 * working with kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

        /*
         * Calculate the number of reserved nodes, discounting the pages that
         * are needed to store them: roughly one node is reserved per page of
         * managed memory, scaled down by the space the nodes themselves
         * consume.
         */
        if (!uma_zone_reserve_kva(vm_radix_node_zone,
            ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
            sizeof(struct vm_radix_node))))
                panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

        vm_radix_node_zone = uma_zcreate("RADIX NODE",
            sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
            VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
        vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
        vm_pindex_t index, newind;
        struct vm_radix_node *rnode, *tmp;
        smrnode_t *parentp;
        vm_page_t m;
        int slot;
        uint16_t clev;

        index = page->pindex;

        /*
         * The owner of record for root is not really important because it
         * will never be used.
         */
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
                return (0);
        }
        parentp = (smrnode_t *)&rtree->rt_root;
        for (;;) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                panic("%s: key %jx is already present",
                                    __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
                        tmp = vm_radix_node_get(vm_radix_trimkey(index,
                            clev + 1), 2, clev);
                        if (tmp == NULL)
                                return (ENOMEM);
                        /* These writes are not yet visible due to ordering. */
                        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
                        vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
                        /* Synchronize to make leaf visible. */
                        vm_radix_node_store(parentp, tmp, LOCKED);
                        return (0);
                } else if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                parentp = &rnode->rn_child[slot];
                tmp = vm_radix_node_load(parentp, LOCKED);
                if (tmp == NULL) {
                        rnode->rn_count++;
                        vm_radix_addpage(rnode, index, rnode->rn_clev, page,
                            LOCKED);
                        return (0);
                }
                rnode = tmp;
        }

        /*
         * A new node is needed because the right insertion level has been
         * reached.  Set up the new intermediate node and add the two
         * children: the new page and the older edge.
         */
        newind = rnode->rn_owner;
        clev = vm_radix_keydiff(newind, index);
        tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
        if (tmp == NULL)
                return (ENOMEM);
        slot = vm_radix_slot(newind, clev);
        /* These writes are not yet visible due to ordering. */
        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
        vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
        /* Serializing write to make the above visible. */
        vm_radix_node_store(parentp, tmp, LOCKED);

        return (0);
}

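/*
 * To illustrate (with VM_RADIX_WIDTH == 4): inserting a page at pindex 0x0
 * into an empty trie stores the tagged leaf directly in rt_root.  Inserting
 * a second page at pindex 0x5 then finds that leaf, computes
 * vm_radix_keydiff(0x0, 0x5) == 0, allocates a node with rn_owner == 0x0 and
 * rn_clev == 0, hangs the two leaves off slots 0 and 5, and only then
 * publishes the node with a serializing store so that lockless lookups never
 * observe a half-built subtree.
 */
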
/*
 * Returns TRUE if the specified radix tree contains a single leaf and FALSE
 * otherwise.
 */
boolean_t
vm_radix_is_singleton(struct vm_radix *rtree)
{
        struct vm_radix_node *rnode;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (FALSE);
        return (vm_radix_isleaf(rnode));
}

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
        struct vm_radix_node *rnode;
        vm_page_t m;
        int slot;

        rnode = vm_radix_root_load(rtree, access);
        while (rnode != NULL) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                return (m);
                        break;
                }
                if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
        }
        return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

        return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
        vm_page_t m;

        smr_enter(vm_radix_smr);
        m = _vm_radix_lookup(rtree, index, SMR);
        smr_exit(vm_radix_smr);

        return (m);
}

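/*
 * Note on lockless use: the SMR read section above only guarantees that the
 * node memory examined during the walk is not recycled while the section is
 * entered; the returned page may already have been removed from the trie by
 * a concurrent writer.  Lockless callers are therefore expected to
 * revalidate the result (for example, by re-checking the page's identity
 * after acquiring a reference or the object lock) before depending on it.
 */
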
/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex >= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the smallest key
                 * in the current node (if the owner is greater than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_ge: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is less than VM_RADIX_COUNT - 1.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                                index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else
                                index = rnode->rn_owner;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_ge: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex >= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot < (VM_RADIX_COUNT - 1)) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index = vm_radix_trimkey(index, rnode->rn_clev);
                        do {
                                index += inc;
                                slot++;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex >= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot < (VM_RADIX_COUNT - 1));
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_ge: child is radix node"));

                /*
                 * If a page or edge greater than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_ge: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_ge: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

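/*
 * For example (with VM_RADIX_WIDTH == 4), in a trie holding pages at 0x10
 * and 0x25 the root is a single node with rn_clev == 1.  A call to
 * vm_radix_lookup_ge(rtree, 0x11) probes slot 1 first and rejects the leaf
 * at 0x10, then advances the search index to 0x20, finds the leaf at 0x25
 * in slot 2, and returns that page.
 */
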
/*
 * Look up the nearest entry at a position less than or equal to index.
 */
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex <= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the largest key
                 * in the current node (if the owner is smaller than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
                                index = rnode->rn_owner + VM_RADIX_COUNT *
                                    VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_le: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == 0);

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is greater than 0.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                        }
                        index--;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_le: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex <= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot > 0) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index |= inc - 1;
                        do {
                                index -= inc;
                                slot--;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex <= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot > 0);
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_le: child is radix node"));

                /*
                 * If a page or edge smaller than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_le: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_le: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *rnode, *parent, *tmp;
        vm_page_t m;
        int i, slot;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        return (NULL);
                vm_radix_root_store(rtree, NULL, LOCKED);
                return (m);
        }
        parent = NULL;
        for (;;) {
                if (rnode == NULL)
                        return (NULL);
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex != index)
                                return (NULL);
                        vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
                        rnode->rn_count--;
                        if (rnode->rn_count > 1)
                                return (m);
                        for (i = 0; i < VM_RADIX_COUNT; i++)
                                if (vm_radix_node_load(&rnode->rn_child[i],
                                    LOCKED) != NULL)
                                        break;
                        KASSERT(i != VM_RADIX_COUNT,
                            ("%s: invalid node configuration", __func__));
                        tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
                        if (parent == NULL)
                                vm_radix_root_store(rtree, tmp, LOCKED);
                        else {
                                slot = vm_radix_slot(index, parent->rn_clev);
                                KASSERT(vm_radix_node_load(
                                    &parent->rn_child[slot], LOCKED) == rnode,
                                    ("%s: invalid child value", __func__));
                                vm_radix_node_store(&parent->rn_child[slot],
                                    tmp, LOCKED);
                        }
                        /*
                         * The child is still valid and we can not zero the
                         * pointer until all smr references are gone.
                         */
                        rnode->rn_count--;
                        vm_radix_node_put(rnode, i);
                        return (m);
                }
                parent = rnode;
                rnode = tmp;
        }
}

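/*
 * Continuing the insertion example above: removing the page at 0x5 from a
 * trie holding only 0x0 and 0x5 clears slot 5, leaves the node with a single
 * child, and re-points the root directly at the tagged leaf for 0x0.  The
 * emptied node is handed to vm_radix_node_put() with the surviving slot
 * recorded in rn_last, so that pointer is only cleared once the node is
 * reallocated, after all SMR readers have drained.
 */
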
/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive, but recursion is tightly bounded because the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
        struct vm_radix_node *root;

        root = vm_radix_root_load(rtree, LOCKED);
        if (root == NULL)
                return;
        vm_radix_root_store(rtree, NULL, UNSERIALIZED);
        if (!vm_radix_isleaf(root))
                vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
        struct vm_radix_node *rnode, *tmp;
        vm_page_t m;
        vm_pindex_t index;
        int slot;

        index = newpage->pindex;
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                panic("%s: replacing page on an empty trie", __func__);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        panic("%s: original replacing root key not found",
                            __func__);
                rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
                return (m);
        }
        for (;;) {
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex == index) {
                                vm_radix_node_store(&rnode->rn_child[slot],
                                    (struct vm_radix_node *)((uintptr_t)newpage |
                                    VM_RADIX_ISLEAF), LOCKED);
                                return (m);
                        } else
                                break;
                } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
                        break;
                rnode = tmp;
        }
        panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
        uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
        struct vm_radix_node *rnode, *tmp;
        int i;

        if (!have_addr)
                return;
        rnode = (struct vm_radix_node *)addr;
        db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
            (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
            rnode->rn_clev);
        for (i = 0; i < VM_RADIX_COUNT; i++) {
                tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
                if (tmp != NULL)
                        db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
                            i, (void *)tmp,
                            vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
                            rnode->rn_clev);
        }
}
#endif /* DDB */