/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 * The following code is not generalized into a general purpose library
 * because there are way too many parameters embedded that should really
 * be decided by the library consumers.  At the same time, consumers
 * of this code must achieve the highest possible performance.
 *
 * The implementation takes into account the following rationale:
 * - Size of the nodes should be as small as possible but still big enough
 *   to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the nodes
 *   and the necessity to avoid too much cache pollution during the trie
 *   operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/smr.h>
#include <sys/smr_types.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_radix.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * These widths should allow the pointers to a node's children to fit within
 * a single cache line.  The extra levels from a narrow width should not be
 * a problem thanks to path compression.
 */
#ifdef __LP64__
#define VM_RADIX_WIDTH  4
#else
#define VM_RADIX_WIDTH  3
#endif

#define VM_RADIX_COUNT  (1 << VM_RADIX_WIDTH)
#define VM_RADIX_MASK   (VM_RADIX_COUNT - 1)
#define VM_RADIX_LIMIT                                                  \
        (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)

/* Flag bits stored in node pointers. */
#define VM_RADIX_ISLEAF 0x1
#define VM_RADIX_FLAGS  0x1
#define VM_RADIX_PAD    VM_RADIX_FLAGS

/* Returns one unit associated with the specified level. */
#define VM_RADIX_UNITLEVEL(lev)                                         \
        ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
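
/*
 * For illustration only (assuming LP64, where vm_pindex_t is 64 bits wide):
 * VM_RADIX_WIDTH is 4, so a node has VM_RADIX_COUNT == 16 children,
 * VM_RADIX_MASK == 0xf, and VM_RADIX_LIMIT == 64 / 4 - 1 == 15, the deepest
 * possible level.  VM_RADIX_UNITLEVEL(2) == 1 << 8 == 0x100 is the key
 * distance between adjacent slots of a level-2 node.  Leaves are vm_page_t
 * pointers tagged with VM_RADIX_ISLEAF in the low bit, which is available
 * because struct vm_page pointers are more strictly aligned than that.
 */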

enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
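
/*
 * SMR denotes a lockless reader running within an smr_enter()/smr_exit()
 * section, LOCKED denotes a writer holding the external lock that serializes
 * modifications to the trie, and UNSERIALIZED denotes code that runs when no
 * concurrent access is possible, such as node initialization and wholesale
 * reclamation.
 */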

struct vm_radix_node;
typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;

struct vm_radix_node {
        vm_pindex_t     rn_owner;                       /* Owner of record. */
        uint16_t        rn_count;                       /* Valid children. */
        uint8_t         rn_clev;                        /* Current level. */
        int8_t          rn_last;                        /* zero last ptr. */
        smrnode_t       rn_child[VM_RADIX_COUNT];       /* Child nodes. */
};
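
/*
 * For illustration only (LP64): the scalar fields above occupy 12 bytes and
 * are normally padded to 16 so that the VM_RADIX_COUNT (16) eight-byte child
 * pointers that follow are naturally aligned, giving a node size of roughly
 * 144 bytes; the child pointer array is the part that the VM_RADIX_WIDTH
 * comment above tries to keep within a single cache line.
 */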

static uma_zone_t vm_radix_node_zone;
static smr_t vm_radix_smr;

static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access);

/*
 * Allocate a radix node.
 */
static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
{
        struct vm_radix_node *rnode;

        rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
        if (rnode == NULL)
                return (NULL);

        /*
         * We want to clear the last child pointer after the final section
         * has exited so lookup can not return false negatives.  It is done
         * here because it will be cache-cold in the dtor callback.
         */
        if (rnode->rn_last != 0) {
                vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
                    NULL, UNSERIALIZED);
                rnode->rn_last = 0;
        }
        rnode->rn_owner = owner;
        rnode->rn_count = count;
        rnode->rn_clev = clevel;
        return (rnode);
}

/*
 * Free radix node.
 */
static __inline void
vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
{
#ifdef INVARIANTS
        int slot;

        KASSERT(rnode->rn_count == 0,
            ("vm_radix_node_put: rnode %p has %d children", rnode,
            rnode->rn_count));
        for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
                if (slot == last)
                        continue;
                KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
                    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
        }
#endif
        /* Off by one so a freshly zero'd node is not assigned to. */
        rnode->rn_last = last + 1;
        uma_zfree_smr(vm_radix_node_zone, rnode);
}

/*
 * Return the position in the array for a given level.
 */
static __inline int
vm_radix_slot(vm_pindex_t index, uint16_t level)
{

        return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
}

/* Trims the key after the specified level. */
static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index, uint16_t level)
{
        vm_pindex_t ret;

        ret = index;
        if (level > 0) {
                ret >>= level * VM_RADIX_WIDTH;
                ret <<= level * VM_RADIX_WIDTH;
        }
        return (ret);
}
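
/*
 * Worked example, for illustration only (VM_RADIX_WIDTH == 4): for the key
 * 0x1234, vm_radix_slot(0x1234, 0) == 0x4 and vm_radix_slot(0x1234, 1) ==
 * 0x3, while vm_radix_trimkey(0x1234, 1) == 0x1230 and
 * vm_radix_trimkey(0x1234, 2) == 0x1200.
 */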

/*
 * Fetch a node pointer from a slot in another node.
 */
static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                return (smr_unserialized_load(p, true));
        case LOCKED:
                return (smr_serialized_load(p, true));
        case SMR:
                return (smr_entered_load(p, vm_radix_smr));
        }
        __assert_unreachable();
}

static __inline void
vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
    enum vm_radix_access access)
{

        switch (access) {
        case UNSERIALIZED:
                smr_unserialized_store(p, v, true);
                break;
        case LOCKED:
                smr_serialized_store(p, v, true);
                break;
        case SMR:
                panic("vm_radix_node_store: Not supported in smr section.");
        }
}

/*
 * Get the root node for a radix tree.
 */
static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
{

        return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
}

/*
 * Set the root node for a radix tree.
 */
static __inline void
vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
    enum vm_radix_access access)
{

        vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
}

/*
 * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
 */
static __inline boolean_t
vm_radix_isleaf(struct vm_radix_node *rnode)
{

        return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
}

/*
 * Returns the associated page extracted from rnode.
 */
static __inline vm_page_t
vm_radix_topage(struct vm_radix_node *rnode)
{

        return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
}

/*
 * Adds the page as a child of the provided node.
 */
static __inline void
vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
    vm_page_t page, enum vm_radix_access access)
{
        int slot;

        slot = vm_radix_slot(index, clev);
        vm_radix_node_store(&rnode->rn_child[slot],
            (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
}

/*
 * Returns the level where two keys differ.
 * It cannot accept two equal keys.
 */
static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
{
        uint16_t clev;

        KASSERT(index1 != index2, ("%s: passing the same key value %jx",
            __func__, (uintmax_t)index1));

        index1 ^= index2;
        for (clev = VM_RADIX_LIMIT;; clev--)
                if (vm_radix_slot(index1, clev) != 0)
                        return (clev);
}
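
/*
 * Worked example, for illustration only (VM_RADIX_WIDTH == 4):
 * vm_radix_keydiff(0x1200, 0x1234) computes the XOR 0x0034, whose slots at
 * levels 2 and above are all zero and whose slot at level 1 is 0x3, so the
 * function returns 1, the most significant level at which the keys differ.
 */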

/*
 * Returns TRUE if it can be determined that key does not belong to the
 * specified rnode.  Otherwise, returns FALSE.
 */
static __inline boolean_t
vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
{

        if (rnode->rn_clev < VM_RADIX_LIMIT) {
                idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
                return (idx != rnode->rn_owner);
        }
        return (FALSE);
}
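
/*
 * Worked example, for illustration only (VM_RADIX_WIDTH == 4): a node with
 * rn_owner == 0x1200 and rn_clev == 1 spans the keys 0x1200 through 0x12ff.
 * vm_radix_keybarr() returns FALSE for the key 0x1234, which may lie in that
 * node's subtree, and TRUE for the key 0x1334, which cannot.
 */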

/*
 * Internal helper for vm_radix_reclaim_allnodes().
 * This function is recursive.
 */
static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
{
        struct vm_radix_node *child;
        int slot;

        KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
            ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
        for (slot = 0; rnode->rn_count != 0; slot++) {
                child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
                if (child == NULL)
                        continue;
                if (!vm_radix_isleaf(child))
                        vm_radix_reclaim_allnodes_int(child);
                vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
                rnode->rn_count--;
        }
        vm_radix_node_put(rnode, -1);
}

#ifndef UMA_MD_SMALL_ALLOC
void vm_radix_reserve_kva(void);
/*
 * Reserve the KVA necessary to satisfy the node allocation.
 * This is mandatory on architectures that do not support a direct mapping,
 * as they would otherwise need to carve into the kernel maps for every node
 * allocation, resulting in deadlocks for consumers already working with
 * kernel maps.
 */
void
vm_radix_reserve_kva(void)
{

        /*
         * Calculate the number of reserved nodes, discounting the pages that
         * are needed to store them.
         */
        if (!uma_zone_reserve_kva(vm_radix_node_zone,
            ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
            sizeof(struct vm_radix_node))))
                panic("%s: unable to reserve KVA", __func__);
}
#endif

/*
 * Initialize the UMA slab zone.
 */
void
vm_radix_zinit(void)
{

        vm_radix_node_zone = uma_zcreate("RADIX NODE",
            sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
            VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
        vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
}

/*
 * Inserts the key-value pair into the trie.
 * Panics if the key already exists.
 */
int
vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
{
        vm_pindex_t index, newind;
        struct vm_radix_node *rnode, *tmp;
        smrnode_t *parentp;
        vm_page_t m;
        int slot;
        uint16_t clev;

        index = page->pindex;

        /*
         * The owner of record for root is not really important because it
         * will never be used.
         */
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
                return (0);
        }
        parentp = (smrnode_t *)&rtree->rt_root;
        for (;;) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                panic("%s: key %jx is already present",
                                    __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
                        tmp = vm_radix_node_get(vm_radix_trimkey(index,
                            clev + 1), 2, clev);
                        if (tmp == NULL)
                                return (ENOMEM);
                        /* These writes are not yet visible due to ordering. */
                        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
                        vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
                        /* Synchronize to make leaf visible. */
                        vm_radix_node_store(parentp, tmp, LOCKED);
                        return (0);
                } else if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                parentp = &rnode->rn_child[slot];
                tmp = vm_radix_node_load(parentp, LOCKED);
                if (tmp == NULL) {
                        rnode->rn_count++;
                        vm_radix_addpage(rnode, index, rnode->rn_clev, page,
                            LOCKED);
                        return (0);
                }
                rnode = tmp;
        }

        /*
         * A new node is needed because the right insertion level is reached.
         * Set up the new intermediate node and add the two children: the
         * new object and the older edge.
         */
        newind = rnode->rn_owner;
        clev = vm_radix_keydiff(newind, index);
        tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
        if (tmp == NULL)
                return (ENOMEM);
        slot = vm_radix_slot(newind, clev);
        /* These writes are not yet visible due to ordering. */
        vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
        vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
        /* Serializing write to make the above visible. */
        vm_radix_node_store(parentp, tmp, LOCKED);

        return (0);
}
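
/*
 * Worked example, for illustration only (VM_RADIX_WIDTH == 4): inserting
 * pindex 0x1234 into a trie whose root is a single leaf for pindex 0x1200
 * finds vm_radix_keydiff() == 1, so a new node with rn_clev == 1 and
 * rn_owner == 0x1200 is created holding the new leaf at slot 3 and the old
 * leaf at slot 0, and is then published as the new root.
 */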

/*
 * Returns TRUE if the specified radix tree contains a single leaf and FALSE
 * otherwise.
 */
boolean_t
vm_radix_is_singleton(struct vm_radix *rtree)
{
        struct vm_radix_node *rnode;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (FALSE);
        return (vm_radix_isleaf(rnode));
}

/*
 * Returns the value stored at the index.  If the index is not present,
 * NULL is returned.
 */
static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
    enum vm_radix_access access)
{
        struct vm_radix_node *rnode;
        vm_page_t m;
        int slot;

        rnode = vm_radix_root_load(rtree, access);
        while (rnode != NULL) {
                if (vm_radix_isleaf(rnode)) {
                        m = vm_radix_topage(rnode);
                        if (m->pindex == index)
                                return (m);
                        break;
                }
                if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
        }
        return (NULL);
}

/*
 * Returns the value stored at the index assuming there is an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
{

        return (_vm_radix_lookup(rtree, index, LOCKED));
}

/*
 * Returns the value stored at the index without requiring an external lock.
 *
 * If the index is not present, NULL is returned.
 */
vm_page_t
vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
{
        vm_page_t m;

        smr_enter(vm_radix_smr);
        m = _vm_radix_lookup(rtree, index, SMR);
        smr_exit(vm_radix_smr);

        return (m);
}
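
/*
 * Consumer sketch, for illustration only; the object field and lock macro
 * names below belong to the consumer (see vm_object.h and vm_page.c) and are
 * not defined in this file:
 *
 *      VM_OBJECT_WLOCK(object);
 *      if (vm_radix_insert(&object->rtree, m) != 0)
 *              (handle ENOMEM, e.g. sleep in vm_radix_wait() and retry);
 *      VM_OBJECT_WUNLOCK(object);
 *
 *      m = vm_radix_lookup_unlocked(&object->rtree, pindex);
 *
 * Insertions and removals are serialized by the external lock (the LOCKED
 * access mode), while the unlocked lookup relies on SMR for safety.
 */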

/*
 * Look up the nearest entry at a position greater than or equal to index.
 */
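/*
 * For instance, in a trie holding pages at pindex 5 and pindex 9,
 * vm_radix_lookup_ge(rtree, 6) returns the page at 9 and
 * vm_radix_lookup_ge(rtree, 10) returns NULL.
 */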
vm_page_t
vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex >= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the smallest key
                 * in the current node (if the owner is greater than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_ge: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == (VM_RADIX_COUNT - 1));

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is less than VM_RADIX_COUNT - 1.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                                index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else
                                index = rnode->rn_owner;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_ge: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex >= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot < (VM_RADIX_COUNT - 1)) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index = vm_radix_trimkey(index, rnode->rn_clev);
                        do {
                                index += inc;
                                slot++;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex >= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot < (VM_RADIX_COUNT - 1));
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_ge: child is radix node"));

                /*
                 * If a page or edge greater than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_ge: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_ge: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Look up the nearest entry at a position less than or equal to index.
 */
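/*
 * For instance, in a trie holding pages at pindex 5 and pindex 9,
 * vm_radix_lookup_le(rtree, 8) returns the page at 5 and
 * vm_radix_lookup_le(rtree, 4) returns NULL.
 */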
vm_page_t
vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *stack[VM_RADIX_LIMIT];
        vm_pindex_t inc;
        vm_page_t m;
        struct vm_radix_node *child, *rnode;
#ifdef INVARIANTS
        int loops = 0;
#endif
        int slot, tos;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                return (NULL);
        else if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex <= index)
                        return (m);
                else
                        return (NULL);
        }
        tos = 0;
        for (;;) {
                /*
                 * If the keys differ before the current bisection node,
                 * then the search key might roll back to the earliest
                 * available bisection node or to the largest key
                 * in the current node (if the owner is smaller than the
                 * search key).
                 */
                if (vm_radix_keybarr(rnode, index)) {
                        if (index > rnode->rn_owner) {
                                index = rnode->rn_owner + VM_RADIX_COUNT *
                                    VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        } else {
ascend:
                                KASSERT(++loops < 1000,
                                    ("vm_radix_lookup_le: too many loops"));

                                /*
                                 * Pop nodes from the stack until either the
                                 * stack is empty or a node that could have a
                                 * matching descendant is found.
                                 */
                                do {
                                        if (tos == 0)
                                                return (NULL);
                                        rnode = stack[--tos];
                                } while (vm_radix_slot(index,
                                    rnode->rn_clev) == 0);

                                /*
                                 * The following computation cannot overflow
                                 * because index's slot at the current level
                                 * is greater than 0.
                                 */
                                index = vm_radix_trimkey(index,
                                    rnode->rn_clev);
                        }
                        index--;
                        KASSERT(!vm_radix_keybarr(rnode, index),
                            ("vm_radix_lookup_le: keybarr failed"));
                }
                slot = vm_radix_slot(index, rnode->rn_clev);
                child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(child)) {
                        m = vm_radix_topage(child);
                        if (m->pindex <= index)
                                return (m);
                } else if (child != NULL)
                        goto descend;

                /*
                 * Look for an available edge or page within the current
                 * bisection node.
                 */
                if (slot > 0) {
                        inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
                        index |= inc - 1;
                        do {
                                index -= inc;
                                slot--;
                                child = vm_radix_node_load(&rnode->rn_child[slot],
                                    LOCKED);
                                if (vm_radix_isleaf(child)) {
                                        m = vm_radix_topage(child);
                                        if (m->pindex <= index)
                                                return (m);
                                } else if (child != NULL)
                                        goto descend;
                        } while (slot > 0);
                }
                KASSERT(child == NULL || vm_radix_isleaf(child),
                    ("vm_radix_lookup_le: child is radix node"));

                /*
                 * If a page or edge smaller than the search slot is not found
                 * in the current node, ascend to the next higher-level node.
                 */
                goto ascend;
descend:
                KASSERT(rnode->rn_clev > 0,
                    ("vm_radix_lookup_le: pushing leaf's parent"));
                KASSERT(tos < VM_RADIX_LIMIT,
                    ("vm_radix_lookup_le: stack overflow"));
                stack[tos++] = rnode;
                rnode = child;
        }
}

/*
 * Remove the specified index from the trie, and return the value stored at
 * that index.  If the index is not present, return NULL.
 */
vm_page_t
vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
{
        struct vm_radix_node *rnode, *parent, *tmp;
        vm_page_t m;
        int i, slot;

        rnode = vm_radix_root_load(rtree, LOCKED);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        return (NULL);
                vm_radix_root_store(rtree, NULL, LOCKED);
                return (m);
        }
        parent = NULL;
        for (;;) {
                if (rnode == NULL)
                        return (NULL);
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex != index)
                                return (NULL);
                        vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
                        rnode->rn_count--;
                        if (rnode->rn_count > 1)
                                return (m);
                        for (i = 0; i < VM_RADIX_COUNT; i++)
                                if (vm_radix_node_load(&rnode->rn_child[i],
                                    LOCKED) != NULL)
                                        break;
                        KASSERT(i != VM_RADIX_COUNT,
                            ("%s: invalid node configuration", __func__));
                        tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
                        if (parent == NULL)
                                vm_radix_root_store(rtree, tmp, LOCKED);
                        else {
                                slot = vm_radix_slot(index, parent->rn_clev);
                                KASSERT(vm_radix_node_load(
                                    &parent->rn_child[slot], LOCKED) == rnode,
                                    ("%s: invalid child value", __func__));
                                vm_radix_node_store(&parent->rn_child[slot],
                                    tmp, LOCKED);
                        }
                        /*
                         * The child is still valid and we can not zero the
                         * pointer until all smr references are gone.
                         */
                        rnode->rn_count--;
                        vm_radix_node_put(rnode, i);
                        return (m);
                }
                parent = rnode;
                rnode = tmp;
        }
}

/*
 * Remove and free all the nodes from the radix tree.
 * This function is recursive, but the recursion depth is bounded because the
 * maximum depth of the tree is fixed.
 */
void
vm_radix_reclaim_allnodes(struct vm_radix *rtree)
{
        struct vm_radix_node *root;

        root = vm_radix_root_load(rtree, LOCKED);
        if (root == NULL)
                return;
        vm_radix_root_store(rtree, NULL, UNSERIALIZED);
        if (!vm_radix_isleaf(root))
                vm_radix_reclaim_allnodes_int(root);
}

/*
 * Replace an existing page in the trie with another one.
 * Panics if there is not an old page in the trie at the new page's index.
 */
vm_page_t
vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
{
        struct vm_radix_node *rnode, *tmp;
        vm_page_t m;
        vm_pindex_t index;
        int slot;

        index = newpage->pindex;
        rnode = vm_radix_root_load(rtree, LOCKED);
        if (rnode == NULL)
                panic("%s: replacing page on an empty trie", __func__);
        if (vm_radix_isleaf(rnode)) {
                m = vm_radix_topage(rnode);
                if (m->pindex != index)
                        panic("%s: original replacing root key not found",
                            __func__);
                rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
                return (m);
        }
        for (;;) {
                slot = vm_radix_slot(index, rnode->rn_clev);
                tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
                if (vm_radix_isleaf(tmp)) {
                        m = vm_radix_topage(tmp);
                        if (m->pindex == index) {
                                vm_radix_node_store(&rnode->rn_child[slot],
                                    (struct vm_radix_node *)((uintptr_t)newpage |
                                    VM_RADIX_ISLEAF), LOCKED);
                                return (m);
                        } else
                                break;
                } else if (tmp == NULL || vm_radix_keybarr(tmp, index))
                        break;
                rnode = tmp;
        }
        panic("%s: original replacing page not found", __func__);
}

void
vm_radix_wait(void)
{
        uma_zwait(vm_radix_node_zone);
}

#ifdef DDB
/*
 * Show details about the given radix node.
 */
DB_SHOW_COMMAND(radixnode, db_show_radixnode)
{
        struct vm_radix_node *rnode, *tmp;
        int i;

        if (!have_addr)
                return;
        rnode = (struct vm_radix_node *)addr;
        db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
            (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
            rnode->rn_clev);
        for (i = 0; i < VM_RADIX_COUNT; i++) {
                tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
                if (tmp != NULL)
                        db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
                            i, (void *)tmp,
                            vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL,
                            rnode->rn_clev);
        }
}
#endif /* DDB */