1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31
32 /*
33  * Path-compressed radix trie implementation.
34  * The following code is not generalized into a general-purpose library
35  * because far too many embedded parameters would have to be decided by
36  * the library consumers.  At the same time, consumers of this code must
37  * achieve the highest possible performance.
38  *
39  * The implementation takes into account the following rationale:
40  * - Size of the nodes should be as small as possible but still big enough
41  *   to avoid a large maximum depth for the trie.  This is a balance
42  *   between the necessity to not wire too much physical memory for the nodes
43  *   and the necessity to avoid too much cache pollution during the trie
44  *   operations.
45  * - There is not a huge bias toward the number of lookup operations over
46  *   the number of insert and remove operations.  This implies that
47  *   optimizations that help one operation but hurt the other must be
48  *   carefully evaluated.
49  * - On average not many nodes are expected to be fully populated, hence
50  *   level compression may just complicate things.
51  */
52
53 #include <sys/cdefs.h>
54 #include "opt_ddb.h"
55
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/proc.h>
60 #include <sys/vmmeter.h>
61 #include <sys/smr.h>
62 #include <sys/smr_types.h>
63
64 #include <vm/uma.h>
65 #include <vm/vm.h>
66 #include <vm/vm_param.h>
67 #include <vm/vm_object.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_radix.h>
70
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74
75 /*
76  * These widths should allow the pointers to a node's children to fit within
77  * a single cache line.  The extra levels from a narrow width should not be
78  * a problem thanks to path compression.
79  */
80 #ifdef __LP64__
81 #define VM_RADIX_WIDTH  4
82 #else
83 #define VM_RADIX_WIDTH  3
84 #endif
85
86 #define VM_RADIX_COUNT  (1 << VM_RADIX_WIDTH)
87 #define VM_RADIX_MASK   (VM_RADIX_COUNT - 1)
88 #define VM_RADIX_LIMIT                                                  \
89         (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
90
91 /* Flag bits stored in node pointers. */
92 #define VM_RADIX_ISLEAF 0x1
93 #define VM_RADIX_FLAGS  0x1
94 #define VM_RADIX_PAD    VM_RADIX_FLAGS
95
96 /* Returns one unit associated with the specified level. */
97 #define VM_RADIX_UNITLEVEL(lev)                                         \
98         ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
99
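/*
 * Illustrative compile-time checks (a sketch added for exposition, not
 * required by the implementation): with the 64-bit parameters above, a
 * vm_pindex_t is consumed in 16 levels of 4 bits each, so the deepest
 * level is VM_RADIX_LIMIT == 15; level 1 of index 0x1234 selects slot 3,
 * and one "unit" at level 2 spans 0x100 indices.
 */
#ifdef __LP64__
CTASSERT(VM_RADIX_LIMIT == 15);
CTASSERT(((0x1234 >> (1 * VM_RADIX_WIDTH)) & VM_RADIX_MASK) == 0x3);
CTASSERT(VM_RADIX_UNITLEVEL(2) == 0x100);
#endif
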
100 enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
101
102 struct vm_radix_node;
103 typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;
104
105 struct vm_radix_node {
106         vm_pindex_t     rn_owner;                       /* Owner of record. */
107         uint16_t        rn_count;                       /* Valid children. */
108         uint8_t         rn_clev;                        /* Current level. */
109         int8_t          rn_last;                        /* Last slot to clear, plus 1. */
110         smrnode_t       rn_child[VM_RADIX_COUNT];       /* Child nodes. */
111 };
112
113 static uma_zone_t vm_radix_node_zone;
114 static smr_t vm_radix_smr;
115
116 static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
117     enum vm_radix_access access);
118
119 /*
120  * Allocate a radix node.
121  */
122 static struct vm_radix_node *
123 vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
124 {
125         struct vm_radix_node *rnode;
126
127         rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
128         if (rnode == NULL)
129                 return (NULL);
130
131         /*
132          * We want to clear the last child pointer only after the final SMR
133          * section has exited, so that a lookup cannot return a false negative.
134          * It is done here because the node would be cache-cold in the dtor callback.
135          */
136         if (rnode->rn_last != 0) {
137                 vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
138                     NULL, UNSERIALIZED);
139                 rnode->rn_last = 0;
140         }
141         rnode->rn_owner = owner;
142         rnode->rn_count = count;
143         rnode->rn_clev = clevel;
144         return (rnode);
145 }
146
147 /*
148  * Free radix node.
149  */
150 static __inline void
151 vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
152 {
153 #ifdef INVARIANTS
154         int slot;
155
156         KASSERT(rnode->rn_count == 0,
157             ("vm_radix_node_put: rnode %p has %d children", rnode,
158             rnode->rn_count));
159         for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
160                 if (slot == last)
161                         continue;
162                 KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
163                     NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
164         }
165 #endif
166         /* Offset by one so that a freshly zeroed node has nothing to clear. */
167         rnode->rn_last = last + 1;
168         uma_zfree_smr(vm_radix_node_zone, rnode);
169 }
170
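/*
 * Note on the rn_last handshake between vm_radix_node_put() and
 * vm_radix_node_get() (descriptive only): when a node is freed after its
 * single remaining child has been promoted into the parent, that child
 * pointer cannot be cleared immediately, because concurrent SMR readers may
 * still reach the child through the stale node.  vm_radix_node_put() records
 * the slot in rn_last (offset by one) and vm_radix_node_get() clears it when
 * the node is reallocated, which can only happen after the SMR grace period
 * has expired.
 */
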
171 /*
172  * Return the position in the array for a given level.
173  */
174 static __inline int
175 vm_radix_slot(vm_pindex_t index, uint16_t level)
176 {
177
178         return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
179 }
180
181 /* Trims the key after the specified level. */
182 static __inline vm_pindex_t
183 vm_radix_trimkey(vm_pindex_t index, uint16_t level)
184 {
185         vm_pindex_t ret;
186
187         ret = index;
188         if (level > 0) {
189                 ret >>= level * VM_RADIX_WIDTH;
190                 ret <<= level * VM_RADIX_WIDTH;
191         }
192         return (ret);
193 }
194
195 /*
196  * Fetch a node pointer from a slot in another node.
197  */
198 static __inline struct vm_radix_node *
199 vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
200 {
201
202         switch (access) {
203         case UNSERIALIZED:
204                 return (smr_unserialized_load(p, true));
205         case LOCKED:
206                 return (smr_serialized_load(p, true));
207         case SMR:
208                 return (smr_entered_load(p, vm_radix_smr));
209         }
210         __assert_unreachable();
211 }
212
213 static __inline void
214 vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
215     enum vm_radix_access access)
216 {
217
218         switch (access) {
219         case UNSERIALIZED:
220                 smr_unserialized_store(p, v, true);
221                 break;
222         case LOCKED:
223                 smr_serialized_store(p, v, true);
224                 break;
225         case SMR:
226                 panic("vm_radix_node_store: Not supported in smr section.");
227         }
228 }
229
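/*
 * A short summary of the access modes used above (descriptive only, derived
 * from how the callers in this file use them):
 * - SMR: the caller is inside an smr_enter()/smr_exit() read section, as in
 *   vm_radix_lookup_unlocked(); loads are allowed, stores are not.
 * - LOCKED: the caller holds the lock that serializes writers to this trie
 *   (for page tries, typically the owning object's lock), so serialized
 *   loads and stores are both permitted.
 * - UNSERIALIZED: no concurrent access is possible, e.g. while constructing
 *   a node before it is published or while reclaiming a whole tree.
 */
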
230 /*
231  * Get the root node for a radix tree.
232  */
233 static __inline struct vm_radix_node *
234 vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
235 {
236
237         return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
238 }
239
240 /*
241  * Set the root node for a radix tree.
242  */
243 static __inline void
244 vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
245     enum vm_radix_access access)
246 {
247
248         vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
249 }
250
251 /*
252  * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
253  */
254 static __inline boolean_t
255 vm_radix_isleaf(struct vm_radix_node *rnode)
256 {
257
258         return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
259 }
260
261 /*
262  * Returns the associated page extracted from rnode.
263  */
264 static __inline vm_page_t
265 vm_radix_topage(struct vm_radix_node *rnode)
266 {
267
268         return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
269 }
270
271 /*
272  * Adds the page as a child of the provided node.
273  */
274 static __inline void
275 vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
276     vm_page_t page, enum vm_radix_access access)
277 {
278         int slot;
279
280         slot = vm_radix_slot(index, clev);
281         vm_radix_node_store(&rnode->rn_child[slot],
282             (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
283 }
284
285 /*
286  * Returns the highest level at which two keys differ.
287  * It must not be called with two equal keys.
288  */
289 static __inline uint16_t
290 vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
291 {
292         uint16_t clev;
293
294         KASSERT(index1 != index2, ("%s: passing the same key value %jx",
295             __func__, (uintmax_t)index1));
296
297         index1 ^= index2;
298         for (clev = VM_RADIX_LIMIT;; clev--)
299                 if (vm_radix_slot(index1, clev) != 0)
300                         return (clev);
301 }
302
303 /*
304  * Returns TRUE if it can be determined that key does not belong to the
305  * specified rnode.  Otherwise, returns FALSE.
306  */
307 static __inline boolean_t
308 vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
309 {
310
311         if (rnode->rn_clev < VM_RADIX_LIMIT) {
312                 idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
313                 return (idx != rnode->rn_owner);
314         }
315         return (FALSE);
316 }
317
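/*
 * Worked example of the key barrier (illustrative, 64-bit parameters
 * assumed): a node with rn_clev == 1 and rn_owner == 0x1200 covers the
 * indices 0x1200-0x12ff.  For index 0x1234 the trimmed key is 0x1200, which
 * matches the owner, so vm_radix_keybarr() returns FALSE; for index 0x1334
 * the trimmed key is 0x1300, so it returns TRUE and the search must stop or
 * back up.
 */
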
318 /*
319  * Internal helper for vm_radix_reclaim_allnodes().
320  * This function is recursive.
321  */
322 static void
323 vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
324 {
325         struct vm_radix_node *child;
326         int slot;
327
328         KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
329             ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
330         for (slot = 0; rnode->rn_count != 0; slot++) {
331                 child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
332                 if (child == NULL)
333                         continue;
334                 if (!vm_radix_isleaf(child))
335                         vm_radix_reclaim_allnodes_int(child);
336                 vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
337                 rnode->rn_count--;
338         }
339         vm_radix_node_put(rnode, -1);
340 }
341
342 #ifndef UMA_MD_SMALL_ALLOC
343 void vm_radix_reserve_kva(void);
344 /*
345  * Reserve the KVA necessary to satisfy the node allocation.
346  * This is mandatory on architectures that do not support a direct
347  * map, as they would otherwise have to carve into the kernel maps for
348  * every node allocation, resulting in deadlocks for consumers already
349  * working with the kernel maps.
350  */
351 void
352 vm_radix_reserve_kva(void)
353 {
354
355         /*
356          * Calculate the number of reserved nodes, discounting the pages that
357          * are needed to store them.
358          */
359         if (!uma_zone_reserve_kva(vm_radix_node_zone,
360             ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
361             sizeof(struct vm_radix_node))))
362                 panic("%s: unable to reserve KVA", __func__);
363 }
364 #endif
365
366 /*
367  * Initialize the UMA slab zone.
368  */
369 void
370 vm_radix_zinit(void)
371 {
372
373         vm_radix_node_zone = uma_zcreate("RADIX NODE",
374             sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
375             VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
376         vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
377 }
378
379 /*
380  * Inserts the key-value pair into the trie.
381  * Panics if the key already exists.
382  */
383 int
384 vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
385 {
386         vm_pindex_t index, newind;
387         struct vm_radix_node *rnode, *tmp;
388         smrnode_t *parentp;
389         vm_page_t m;
390         int slot;
391         uint16_t clev;
392
393         index = page->pindex;
394
395         /*
396          * The owner of record for root is not really important because it
397          * will never be used.
398          */
399         rnode = vm_radix_root_load(rtree, LOCKED);
400         if (rnode == NULL) {
401                 rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
402                 return (0);
403         }
404         parentp = (smrnode_t *)&rtree->rt_root;
405         for (;;) {
406                 if (vm_radix_isleaf(rnode)) {
407                         m = vm_radix_topage(rnode);
408                         if (m->pindex == index)
409                                 panic("%s: key %jx is already present",
410                                     __func__, (uintmax_t)index);
411                         clev = vm_radix_keydiff(m->pindex, index);
412                         tmp = vm_radix_node_get(vm_radix_trimkey(index,
413                             clev + 1), 2, clev);
414                         if (tmp == NULL)
415                                 return (ENOMEM);
416                         /* These writes are not yet visible due to ordering. */
417                         vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
418                         vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
419                         /* Synchronize to make leaf visible. */
420                         vm_radix_node_store(parentp, tmp, LOCKED);
421                         return (0);
422                 } else if (vm_radix_keybarr(rnode, index))
423                         break;
424                 slot = vm_radix_slot(index, rnode->rn_clev);
425                 parentp = &rnode->rn_child[slot];
426                 tmp = vm_radix_node_load(parentp, LOCKED);
427                 if (tmp == NULL) {
428                         rnode->rn_count++;
429                         vm_radix_addpage(rnode, index, rnode->rn_clev, page,
430                             LOCKED);
431                         return (0);
432                 }
433                 rnode = tmp;
434         }
435
436         /*
437          * A new node is needed because the correct insertion level has been
438          * reached.  Set up the new intermediate node and add the two children:
439          * the new page and the old edge.
440          */
441         newind = rnode->rn_owner;
442         clev = vm_radix_keydiff(newind, index);
443         tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
444         if (tmp == NULL)
445                 return (ENOMEM);
446         slot = vm_radix_slot(newind, clev);
447         /* These writes are not yet visible due to ordering. */
448         vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
449         vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
450         /* Serializing write to make the above visible. */
451         vm_radix_node_store(parentp, tmp, LOCKED);
452
453         return (0);
454 }
455
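/*
 * A minimal caller sketch (hypothetical helper, not part of this file): the
 * trie is keyed by page->pindex, so the pindex must be set before insertion,
 * and the lock serializing writers to this trie must be held.  On ENOMEM a
 * caller can drop its locks, wait for a node with vm_radix_wait() and retry.
 */
#if 0
static int
example_insert(struct vm_radix *rtree, vm_page_t m)
{
        int error;

        error = vm_radix_insert(rtree, m);
        if (error == ENOMEM) {
                /* Drop locks as appropriate before sleeping. */
                vm_radix_wait();
        }
        return (error);
}
#endif
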
456 /*
457  * Returns TRUE if the specified radix tree contains a single leaf and FALSE
458  * otherwise.
459  */
460 boolean_t
461 vm_radix_is_singleton(struct vm_radix *rtree)
462 {
463         struct vm_radix_node *rnode;
464
465         rnode = vm_radix_root_load(rtree, LOCKED);
466         if (rnode == NULL)
467                 return (FALSE);
468         return (vm_radix_isleaf(rnode));
469 }
470
471 /*
472  * Returns the value stored at the index.  If the index is not present,
473  * NULL is returned.
474  */
475 static __always_inline vm_page_t
476 _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
477     enum vm_radix_access access)
478 {
479         struct vm_radix_node *rnode;
480         vm_page_t m;
481         int slot;
482
483         rnode = vm_radix_root_load(rtree, access);
484         while (rnode != NULL) {
485                 if (vm_radix_isleaf(rnode)) {
486                         m = vm_radix_topage(rnode);
487                         if (m->pindex == index)
488                                 return (m);
489                         break;
490                 }
491                 if (vm_radix_keybarr(rnode, index))
492                         break;
493                 slot = vm_radix_slot(index, rnode->rn_clev);
494                 rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
495         }
496         return (NULL);
497 }
498
499 /*
500  * Returns the value stored at the index assuming there is an external lock.
501  *
502  * If the index is not present, NULL is returned.
503  */
504 vm_page_t
505 vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
506 {
507
508         return _vm_radix_lookup(rtree, index, LOCKED);
509 }
510
511 /*
512  * Returns the value stored at the index without requiring an external lock.
513  *
514  * If the index is not present, NULL is returned.
515  */
516 vm_page_t
517 vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
518 {
519         vm_page_t m;
520
521         smr_enter(vm_radix_smr);
522         m = _vm_radix_lookup(rtree, index, SMR);
523         smr_exit(vm_radix_smr);
524
525         return (m);
526 }
527
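/*
 * A hedged usage sketch (hypothetical caller): the SMR section entered by
 * vm_radix_lookup_unlocked() only guarantees that the nodes traversed are
 * not reused during the lookup.  The returned page may be removed from the
 * trie concurrently, so callers typically re-validate its identity once the
 * page has been busied or otherwise stabilized.
 */
#if 0
static vm_page_t
example_lookup_unlocked(struct vm_radix *rtree, vm_object_t object,
    vm_pindex_t pindex)
{
        vm_page_t m;

        m = vm_radix_lookup_unlocked(rtree, pindex);
        if (m != NULL && vm_page_tryxbusy(m) != 0) {
                /* Re-check identity now that the page cannot change. */
                if (m->object == object && m->pindex == pindex)
                        return (m);
                vm_page_xunbusy(m);
        }
        return (NULL);
}
#endif
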
528 /*
529  * Look up the nearest entry at a position greater than or equal to index.
530  */
531 vm_page_t
532 vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
533 {
534         struct vm_radix_node *stack[VM_RADIX_LIMIT];
535         vm_pindex_t inc;
536         vm_page_t m;
537         struct vm_radix_node *child, *rnode;
538 #ifdef INVARIANTS
539         int loops = 0;
540 #endif
541         int slot, tos;
542
543         rnode = vm_radix_root_load(rtree, LOCKED);
544         if (rnode == NULL)
545                 return (NULL);
546         else if (vm_radix_isleaf(rnode)) {
547                 m = vm_radix_topage(rnode);
548                 if (m->pindex >= index)
549                         return (m);
550                 else
551                         return (NULL);
552         }
553         tos = 0;
554         for (;;) {
555                 /*
556                  * If the keys differ before the current bisection node,
557                  * then the search key is adjusted: either it ascends to
558                  * the nearest ancestor that may still contain a match, or
559                  * it jumps to the smallest key in the current node (when
560                  * the owner is greater than the search key).
561                  */
562                 if (vm_radix_keybarr(rnode, index)) {
563                         if (index > rnode->rn_owner) {
564 ascend:
565                                 KASSERT(++loops < 1000,
566                                     ("vm_radix_lookup_ge: too many loops"));
567
568                                 /*
569                                  * Pop nodes from the stack until either the
570                                  * stack is empty or a node that could have a
571                                  * matching descendant is found.
572                                  */
573                                 do {
574                                         if (tos == 0)
575                                                 return (NULL);
576                                         rnode = stack[--tos];
577                                 } while (vm_radix_slot(index,
578                                     rnode->rn_clev) == (VM_RADIX_COUNT - 1));
579
580                                 /*
581                                  * The following computation cannot overflow
582                                  * because index's slot at the current level
583                                  * is less than VM_RADIX_COUNT - 1.
584                                  */
585                                 index = vm_radix_trimkey(index,
586                                     rnode->rn_clev);
587                                 index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
588                         } else
589                                 index = rnode->rn_owner;
590                         KASSERT(!vm_radix_keybarr(rnode, index),
591                             ("vm_radix_lookup_ge: keybarr failed"));
592                 }
593                 slot = vm_radix_slot(index, rnode->rn_clev);
594                 child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
595                 if (vm_radix_isleaf(child)) {
596                         m = vm_radix_topage(child);
597                         if (m->pindex >= index)
598                                 return (m);
599                 } else if (child != NULL)
600                         goto descend;
601
602                 /*
603                  * Look for an available edge or page within the current
604                  * bisection node.
605                  */
606                 if (slot < (VM_RADIX_COUNT - 1)) {
607                         inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
608                         index = vm_radix_trimkey(index, rnode->rn_clev);
609                         do {
610                                 index += inc;
611                                 slot++;
612                                 child = vm_radix_node_load(&rnode->rn_child[slot],
613                                     LOCKED);
614                                 if (vm_radix_isleaf(child)) {
615                                         m = vm_radix_topage(child);
616                                         if (m->pindex >= index)
617                                                 return (m);
618                                 } else if (child != NULL)
619                                         goto descend;
620                         } while (slot < (VM_RADIX_COUNT - 1));
621                 }
622                 KASSERT(child == NULL || vm_radix_isleaf(child),
623                     ("vm_radix_lookup_ge: child is radix node"));
624
625                 /*
626                  * If a page or edge greater than the search slot is not found
627                  * in the current node, ascend to the next higher-level node.
628                  */
629                 goto ascend;
630 descend:
631                 KASSERT(rnode->rn_clev > 0,
632                     ("vm_radix_lookup_ge: pushing leaf's parent"));
633                 KASSERT(tos < VM_RADIX_LIMIT,
634                     ("vm_radix_lookup_ge: stack overflow"));
635                 stack[tos++] = rnode;
636                 rnode = child;
637         }
638 }
639
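/*
 * An iteration sketch (hypothetical helper): because vm_radix_lookup_ge()
 * returns the nearest entry at or above a position, a range of the trie can
 * be walked in ascending pindex order by restarting the search just past the
 * last page found.  The lock serializing LOCKED accesses must be held across
 * the walk.
 */
#if 0
static void
example_foreach_ge(struct vm_radix *rtree, vm_pindex_t start, vm_pindex_t end)
{
        vm_page_t m;

        for (m = vm_radix_lookup_ge(rtree, start);
            m != NULL && m->pindex < end;
            m = vm_radix_lookup_ge(rtree, m->pindex + 1)) {
                /* Operate on m here. */
        }
}
#endif
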
640 /*
641  * Look up the nearest entry at a position less than or equal to index.
642  */
643 vm_page_t
644 vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
645 {
646         struct vm_radix_node *stack[VM_RADIX_LIMIT];
647         vm_pindex_t inc;
648         vm_page_t m;
649         struct vm_radix_node *child, *rnode;
650 #ifdef INVARIANTS
651         int loops = 0;
652 #endif
653         int slot, tos;
654
655         rnode = vm_radix_root_load(rtree, LOCKED);
656         if (rnode == NULL)
657                 return (NULL);
658         else if (vm_radix_isleaf(rnode)) {
659                 m = vm_radix_topage(rnode);
660                 if (m->pindex <= index)
661                         return (m);
662                 else
663                         return (NULL);
664         }
665         tos = 0;
666         for (;;) {
667                 /*
668                  * then the search key is adjusted: either it rolls back to
669                  * the nearest ancestor that may still contain a match, or
670                  * it jumps to the largest key in the current node (when
671                  * the owner is smaller than the search key).
672                  * search key).
673                  */
674                 if (vm_radix_keybarr(rnode, index)) {
675                         if (index > rnode->rn_owner) {
676                                 index = rnode->rn_owner + VM_RADIX_COUNT *
677                                     VM_RADIX_UNITLEVEL(rnode->rn_clev);
678                         } else {
679 ascend:
680                                 KASSERT(++loops < 1000,
681                                     ("vm_radix_lookup_le: too many loops"));
682
683                                 /*
684                                  * Pop nodes from the stack until either the
685                                  * stack is empty or a node that could have a
686                                  * matching descendant is found.
687                                  */
688                                 do {
689                                         if (tos == 0)
690                                                 return (NULL);
691                                         rnode = stack[--tos];
692                                 } while (vm_radix_slot(index,
693                                     rnode->rn_clev) == 0);
694
695                                 /*
696                                  * The following computation cannot overflow
697                                  * because index's slot at the current level
698                                  * is greater than 0.
699                                  */
700                                 index = vm_radix_trimkey(index,
701                                     rnode->rn_clev);
702                         }
703                         index--;
704                         KASSERT(!vm_radix_keybarr(rnode, index),
705                             ("vm_radix_lookup_le: keybarr failed"));
706                 }
707                 slot = vm_radix_slot(index, rnode->rn_clev);
708                 child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
709                 if (vm_radix_isleaf(child)) {
710                         m = vm_radix_topage(child);
711                         if (m->pindex <= index)
712                                 return (m);
713                 } else if (child != NULL)
714                         goto descend;
715
716                 /*
717                  * Look for an available edge or page within the current
718                  * bisection node.
719                  */
720                 if (slot > 0) {
721                         inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
722                         index |= inc - 1;
723                         do {
724                                 index -= inc;
725                                 slot--;
726                                 child = vm_radix_node_load(&rnode->rn_child[slot],
727                                     LOCKED);
728                                 if (vm_radix_isleaf(child)) {
729                                         m = vm_radix_topage(child);
730                                         if (m->pindex <= index)
731                                                 return (m);
732                                 } else if (child != NULL)
733                                         goto descend;
734                         } while (slot > 0);
735                 }
736                 KASSERT(child == NULL || vm_radix_isleaf(child),
737                     ("vm_radix_lookup_le: child is radix node"));
738
739                 /*
740                  * If a page or edge smaller than the search slot is not found
741                  * in the current node, ascend to the next higher-level node.
742                  */
743                 goto ascend;
744 descend:
745                 KASSERT(rnode->rn_clev > 0,
746                     ("vm_radix_lookup_le: pushing leaf's parent"));
747                 KASSERT(tos < VM_RADIX_LIMIT,
748                     ("vm_radix_lookup_le: stack overflow"));
749                 stack[tos++] = rnode;
750                 rnode = child;
751         }
752 }
753
754 /*
755  * Remove the specified index from the trie, and return the value stored at
756  * that index.  If the index is not present, return NULL.
757  */
758 vm_page_t
759 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
760 {
761         struct vm_radix_node *rnode, *parent, *tmp;
762         vm_page_t m;
763         int i, slot;
764
765         rnode = vm_radix_root_load(rtree, LOCKED);
766         if (vm_radix_isleaf(rnode)) {
767                 m = vm_radix_topage(rnode);
768                 if (m->pindex != index)
769                         return (NULL);
770                 vm_radix_root_store(rtree, NULL, LOCKED);
771                 return (m);
772         }
773         parent = NULL;
774         for (;;) {
775                 if (rnode == NULL)
776                         return (NULL);
777                 slot = vm_radix_slot(index, rnode->rn_clev);
778                 tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
779                 if (vm_radix_isleaf(tmp)) {
780                         m = vm_radix_topage(tmp);
781                         if (m->pindex != index)
782                                 return (NULL);
783                         vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
784                         rnode->rn_count--;
785                         if (rnode->rn_count > 1)
786                                 return (m);
787                         for (i = 0; i < VM_RADIX_COUNT; i++)
788                                 if (vm_radix_node_load(&rnode->rn_child[i],
789                                     LOCKED) != NULL)
790                                         break;
791                         KASSERT(i != VM_RADIX_COUNT,
792                             ("%s: invalid node configuration", __func__));
793                         tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
794                         if (parent == NULL)
795                                 vm_radix_root_store(rtree, tmp, LOCKED);
796                         else {
797                                 slot = vm_radix_slot(index, parent->rn_clev);
798                                 KASSERT(vm_radix_node_load(
799                                     &parent->rn_child[slot], LOCKED) == rnode,
800                                     ("%s: invalid child value", __func__));
801                                 vm_radix_node_store(&parent->rn_child[slot],
802                                     tmp, LOCKED);
803                         }
804                         /*
805                          * The child is still valid and we cannot zero the
806                          * pointer until all SMR references are gone.
807                          */
808                         rnode->rn_count--;
809                         vm_radix_node_put(rnode, i);
810                         return (m);
811                 }
812                 parent = rnode;
813                 rnode = tmp;
814         }
815 }
816
817 /*
818  * Remove and free all the nodes from the radix tree.
819  * This function is recursive, but the recursion is tightly bounded because
820  * the maximum depth of the tree is fixed.
821  */
822 void
823 vm_radix_reclaim_allnodes(struct vm_radix *rtree)
824 {
825         struct vm_radix_node *root;
826
827         root = vm_radix_root_load(rtree, LOCKED);
828         if (root == NULL)
829                 return;
830         vm_radix_root_store(rtree, NULL, UNSERIALIZED);
831         if (!vm_radix_isleaf(root))
832                 vm_radix_reclaim_allnodes_int(root);
833 }
834
835 /*
836  * Replace an existing page in the trie with another one.
837  * Panics if there is not an old page in the trie at the new page's index.
838  */
839 vm_page_t
840 vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
841 {
842         struct vm_radix_node *rnode, *tmp;
843         vm_page_t m;
844         vm_pindex_t index;
845         int slot;
846
847         index = newpage->pindex;
848         rnode = vm_radix_root_load(rtree, LOCKED);
849         if (rnode == NULL)
850                 panic("%s: replacing page on an empty trie", __func__);
851         if (vm_radix_isleaf(rnode)) {
852                 m = vm_radix_topage(rnode);
853                 if (m->pindex != index)
854                         panic("%s: original replacing root key not found",
855                             __func__);
856                 rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
857                 return (m);
858         }
859         for (;;) {
860                 slot = vm_radix_slot(index, rnode->rn_clev);
861                 tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
862                 if (vm_radix_isleaf(tmp)) {
863                         m = vm_radix_topage(tmp);
864                         if (m->pindex == index) {
865                                 vm_radix_node_store(&rnode->rn_child[slot],
866                                     (struct vm_radix_node *)((uintptr_t)newpage |
867                                     VM_RADIX_ISLEAF), LOCKED);
868                                 return (m);
869                         } else
870                                 break;
871                 } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
872                         break;
873                 rnode = tmp;
874         }
875         panic("%s: original replacing page not found", __func__);
876 }
877
878 void
879 vm_radix_wait(void)
880 {
881         uma_zwait(vm_radix_node_zone);
882 }
883
884 #ifdef DDB
885 /*
886  * Show details about the given radix node.
887  */
888 DB_SHOW_COMMAND(radixnode, db_show_radixnode)
889 {
890         struct vm_radix_node *rnode, *tmp;
891         int i;
892
893         if (!have_addr)
894                 return;
895         rnode = (struct vm_radix_node *)addr;
896         db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
897             (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
898             rnode->rn_clev);
899         for (i = 0; i < VM_RADIX_COUNT; i++) {
900                 tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
901                 if (tmp != NULL)
902                         db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
903                             i, (void *)tmp,
904                             vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
905                             rnode->rn_clev);
906         }
907 }
908 #endif /* DDB */