sys/vm/vm_reserv.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2002-2006 Rice University
   5  * Copyright (c) 2007-2011 Alan L. Cox <alc@cs.rice.edu>
   6  * All rights reserved.
   7  *
   8  * This software was developed for the FreeBSD Project by Alan L. Cox,
   9  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  27  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  30  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 /*
  35  *      Superpage reservation management module
  36  *
  37  * Any external functions defined by this module are only to be used by the
  38  * virtual memory system.
  39  */
  40
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 #include "opt_vm.h"
  45
  46 #include <sys/param.h>
  47 #include <sys/kernel.h>
  48 #include <sys/lock.h>
  49 #include <sys/malloc.h>
  50 #include <sys/mutex.h>
  51 #include <sys/queue.h>
  52 #include <sys/rwlock.h>
  53 #include <sys/sbuf.h>
  54 #include <sys/sysctl.h>
  55 #include <sys/systm.h>
  56 #include <sys/vmmeter.h>
  57
  58 #include <vm/vm.h>
  59 #include <vm/vm_param.h>
  60 #include <vm/vm_object.h>
  61 #include <vm/vm_page.h>
  62 #include <vm/vm_pageout.h>
  63 #include <vm/vm_phys.h>
  64 #include <vm/vm_pagequeue.h>
  65 #include <vm/vm_radix.h>
  66 #include <vm/vm_reserv.h>
  67
  68 /*
  69  * The reservation system supports the speculative allocation of large physical
  70  * pages ("superpages").  Speculative allocation enables the fully automatic
  71  * utilization of superpages by the virtual memory system.  In other words, no
  72  * programmatic directives are required to use superpages.
  73  */
  74
  75 #if VM_NRESERVLEVEL > 0
  76
   77 /*
   78  * The number of small pages that are contained in a level 0 reservation.
       * Because it is computed as a power of two, VM_LEVEL_0_NPAGES - 1 serves
       * as a bit mask for an index within a reservation.
   79  */
   80 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   81
   82 /*
   83  * The number of bits by which a physical address is shifted to obtain the
   84  * reservation number (the index used with vm_reserv_array)
   85  */
   86 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   87
   88 /*
   89  * The size of a level 0 reservation in bytes
   90  */
   91 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
   92
   93 /*
   94  * Computes the index of the small page underlying the given (object, pindex)
   95  * within the reservation's array of small pages.  The object's "pg_color" is
       * added to "pindex" and the sum is reduced modulo VM_LEVEL_0_NPAGES.
   96  */
   97 #define VM_RESERV_INDEX(object, pindex) \
   98     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  99
  100 /*
  101  * The type of a population map entry; each entry is one word of a
       * reservation's bit vector of in-use pages.
  102  */
  103 typedef u_long          popmap_t;
  104
  105 /*
  106  * The number of bits in a population map entry
  107  */
  108 #define NBPOPMAP        (NBBY * sizeof(popmap_t))
  109
  110 /*
  111  * The number of population map entries in a reservation: one bit per small
       * page, rounded up to a whole number of entries.
  112  */
  113 #define NPOPMAP         howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
 114
 115 /*
 116  * Clear a bit in the population map.
 117  */
 118 static __inline void
 119 popmap_clear(popmap_t popmap[], int i)
 120 {
 121
 122         popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
 123 }
 124
 125 /*
 126  * Set a bit in the population map.
 127  */
 128 static __inline void
 129 popmap_set(popmap_t popmap[], int i)
 130 {
 131
 132         popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
 133 }
 134
 135 /*
 136  * Is a bit in the population map clear?
 137  */
 138 static __inline boolean_t
 139 popmap_is_clear(popmap_t popmap[], int i)
 140 {
 141
 142         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
 143 }
 144
 145 /*
 146  * Is a bit in the population map set?
 147  */
 148 static __inline boolean_t
 149 popmap_is_set(popmap_t popmap[], int i)
 150 {
 151
 152         return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
 153 }
 154
  155 /*
  156  * The reservation structure
  157  *
  158  * A reservation structure is constructed whenever a large physical page is
  159  * speculatively allocated to an object.  The reservation provides the small
  160  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  161  * within that object.  The reservation's "popcnt" tracks the number of these
  162  * small physical pages that are in use at any given time.  When and if the
  163  * reservation is not fully utilized, it appears in the queue of partially
  164  * populated reservations.  The reservation always appears on the containing
  165  * object's list of reservations.
  166  *
  167  * A partially populated reservation can be broken and reclaimed at any time.
  168  *
       * Key to the locking annotations on the members below:
  169  * f - vm_domain_free_lock
  170  * o - vm_reserv_object_lock
  171  * c - constant after boot
  172  */
  173 struct vm_reserv {
  174         TAILQ_ENTRY(vm_reserv) partpopq;        /* (f) per-domain queue. */
  175         LIST_ENTRY(vm_reserv) objq;             /* (o, f) object queue */
  176         vm_object_t     object;                 /* (o, f) containing object */
  177         vm_pindex_t     pindex;                 /* (o, f) offset in object */
  178         vm_page_t       pages;                  /* (c) first page; NULL if invalid */
  179         int             domain;                 /* (c) NUMA domain. */
  180         int             popcnt;                 /* (f) # of pages in use */
  181         char            inpartpopq;             /* (f) TRUE while on partpopq */
  182         popmap_t        popmap[NPOPMAP];        /* (f) bit vector, used pages */
  183 };
 184
  185 /*
  186  * The reservation array
  187  *
  188  * This array is analogous in function to vm_page_array.  It differs in the
  189  * respect that it may contain a greater number of useful reservation
  190  * structures than there are (physical) superpages.  These "invalid"
  191  * reservation structures exist to trade off space for time in the
  192  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  193  * distinguishable from "valid" reservation structures by inspecting the
  194  * reservation's "pages" field.  Invalid reservation structures have a NULL
  195  * "pages" field.
  196  *
  197  * vm_reserv_from_page() maps a small (physical) page to an element of this
  198  * array by computing a physical reservation number from the page's physical
  199  * address.  The physical reservation number is used as the array index.
  200  *
  201  * An "active" reservation is a valid reservation structure that has a non-NULL
  202  * "object" field and a non-zero "popcnt" field.  In other words, every active
  203  * reservation belongs to a particular object.  Moreover, every active
  204  * reservation has an entry in the containing object's list of reservations.
  205  */
  206 static vm_reserv_t vm_reserv_array;
 207
  208 /*
  209  * The partially populated reservation queues
  210  *
  211  * These queues enable the fast recovery of an unused free small page from a
  212  * partially populated reservation.  The reservation at the head of a queue
  213  * is the least recently changed, partially populated reservation.
  214  *
  215  * There is one queue per memory domain.  Access to a domain's queue is
       * synchronized by that domain's free page queue lock.
  216  */
  217 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop[MAXMEMDOM];
 218
 219 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 220
 221 static long vm_reserv_broken;
 222 SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
 223     &vm_reserv_broken, 0, "Cumulative number of broken reservations");
 224
 225 static long vm_reserv_freed;
 226 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
 227     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
 228
 229 static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
 230
 231 SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
 232     sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
 233
 234 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
 235
 236 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
 237     sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
 238
 239 static long vm_reserv_reclaimed;
 240 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
 241     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
 242
 243 /*
 244  * The object lock pool is used to synchronize the rvq.  We can not use a
 245  * pool mutex because it is required before malloc works.
 246  *
 247  * The "hash" function could be made faster without divide and modulo.
 248  */
 249 #define VM_RESERV_OBJ_LOCK_COUNT        MAXCPU
 250
 251 struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT];
 252
 253 #define vm_reserv_object_lock_idx(object)                       \
 254             (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT)
 255 #define vm_reserv_object_lock_ptr(object)                       \
 256             &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))]
 257 #define vm_reserv_object_lock(object)                           \
 258             mtx_lock(vm_reserv_object_lock_ptr((object)))
 259 #define vm_reserv_object_unlock(object)                         \
 260             mtx_unlock(vm_reserv_object_lock_ptr((object)))
 261
 262 static void             vm_reserv_break(vm_reserv_t rv, vm_page_t m);
 263 static void             vm_reserv_depopulate(vm_reserv_t rv, int index);
 264 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
 265 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
 266                             vm_pindex_t pindex);
 267 static void             vm_reserv_populate(vm_reserv_t rv, int index);
 268 static void             vm_reserv_reclaim(vm_reserv_t rv);
 269
 270 /*
 271  * Returns the current number of full reservations.
 272  *
 273  * Since the number of full reservations is computed without acquiring the
 274  * free page queue lock, the returned value may be inexact.
 275  */
 276 static int
 277 sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
 278 {
 279         vm_paddr_t paddr;
 280         struct vm_phys_seg *seg;
 281         vm_reserv_t rv;
 282         int fullpop, segind;
 283
 284         fullpop = 0;
 285         for (segind = 0; segind < vm_phys_nsegs; segind++) {
 286                 seg = &vm_phys_segs[segind];
 287                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
 288                 while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
 289                         rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
 290                         fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
 291                         paddr += VM_LEVEL_0_SIZE;
 292                 }
 293         }
 294         return (sysctl_handle_int(oidp, &fullpop, 0, req));
 295 }
 296
 297 /*
 298  * Describes the current state of the partially populated reservation queue.
 299  */
 300 static int
 301 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 302 {
 303         struct sbuf sbuf;
 304         vm_reserv_t rv;
 305         int counter, error, domain, level, unused_pages;
 306
 307         error = sysctl_wire_old_buffer(req, 0);
 308         if (error != 0)
 309                 return (error);
 310         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 311         sbuf_printf(&sbuf, "\nDOMAIN    LEVEL     SIZE  NUMBER\n\n");
 312         for (domain = 0; domain < vm_ndomains; domain++) {
 313                 for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
 314                         counter = 0;
 315                         unused_pages = 0;
 316                         vm_domain_free_lock(VM_DOMAIN(domain));
 317                         TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
 318                                 counter++;
 319                                 unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 320                         }
 321                         vm_domain_free_unlock(VM_DOMAIN(domain));
 322                         sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n",
 323                             domain, level,
 324                             unused_pages * ((int)PAGE_SIZE / 1024), counter);
 325                 }
 326         }
 327         error = sbuf_finish(&sbuf);
 328         sbuf_delete(&sbuf);
 329         return (error);
 330 }
 331
 332 /*
 333  * Remove a reservation from the object's objq.
 334  */
 335 static void
 336 vm_reserv_remove(vm_reserv_t rv)
 337 {
 338         vm_object_t object;
 339
 340         KASSERT(rv->object != NULL,
 341             ("vm_reserv_remove: reserv %p is free", rv));
 342         KASSERT(!rv->inpartpopq,
 343             ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv));
 344         object = rv->object;
 345         vm_reserv_object_lock(object);
 346         LIST_REMOVE(rv, objq);
 347         rv->object = NULL;
 348         vm_reserv_object_unlock(object);
 349 }
 350
 351 /*
 352  * Insert a new reservation into the object's objq.
 353  */
 354 static void
 355 vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
 356 {
 357         int i;
 358
 359         KASSERT(rv->object == NULL,
 360             ("vm_reserv_insert: reserv %p isn't free", rv));
 361         KASSERT(rv->popcnt == 0,
 362             ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv));
 363         KASSERT(!rv->inpartpopq,
 364             ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv));
 365         for (i = 0; i < NPOPMAP; i++)
 366                 KASSERT(rv->popmap[i] == 0,
 367                     ("vm_reserv_insert: reserv %p's popmap is corrupted", rv));
 368         vm_reserv_object_lock(object);
 369         rv->pindex = pindex;
 370         rv->object = object;
 371         LIST_INSERT_HEAD(&object->rvq, rv, objq);
 372         vm_reserv_object_unlock(object);
 373 }
 374
 375 /*
 376  * Reduces the given reservation's population count.  If the population count
 377  * becomes zero, the reservation is destroyed.  Additionally, moves the
 378  * reservation to the tail of the partially populated reservation queue if the
 379  * population count is non-zero.
 380  *
 381  * The free page queue lock must be held.
 382  */
 383 static void
 384 vm_reserv_depopulate(vm_reserv_t rv, int index)
 385 {
 386
 387         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
 388         KASSERT(rv->object != NULL,
 389             ("vm_reserv_depopulate: reserv %p is free", rv));
 390         KASSERT(popmap_is_set(rv->popmap, index),
 391             ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
 392             index));
 393         KASSERT(rv->popcnt > 0,
 394             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 395         KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
 396             ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
 397             rv, rv->domain));
 398         if (rv->inpartpopq) {
 399                 TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 400                 rv->inpartpopq = FALSE;
 401         } else {
 402                 KASSERT(rv->pages->psind == 1,
 403                     ("vm_reserv_depopulate: reserv %p is already demoted",
 404                     rv));
 405                 rv->pages->psind = 0;
 406         }
 407         popmap_clear(rv->popmap, index);
 408         rv->popcnt--;
 409         if (rv->popcnt == 0) {
 410                 vm_reserv_remove(rv);
 411                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
 412                 vm_reserv_freed++;
 413         } else {
 414                 rv->inpartpopq = TRUE;
 415                 TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
 416         }
 417 }
 418
 419 /*
 420  * Returns the reservation to which the given page might belong.
 421  */
 422 static __inline vm_reserv_t
 423 vm_reserv_from_page(vm_page_t m)
 424 {
 425
 426         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
 427 }
 428
 429 /*
 430  * Returns an existing reservation or NULL and initialized successor pointer.
 431  */
 432 static vm_reserv_t
 433 vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex,
 434     vm_page_t mpred, vm_page_t *msuccp)
 435 {
 436         vm_reserv_t rv;
 437         vm_page_t msucc;
 438
 439         msucc = NULL;
 440         if (mpred != NULL) {
 441                 KASSERT(mpred->object == object,
 442                     ("vm_reserv_from_object: object doesn't contain mpred"));
 443                 KASSERT(mpred->pindex < pindex,
 444                     ("vm_reserv_from_object: mpred doesn't precede pindex"));
 445                 rv = vm_reserv_from_page(mpred);
 446                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 447                         goto found;
 448                 msucc = TAILQ_NEXT(mpred, listq);
 449         } else
 450                 msucc = TAILQ_FIRST(&object->memq);
 451         if (msucc != NULL) {
 452                 KASSERT(msucc->pindex > pindex,
 453                     ("vm_reserv_from_object: msucc doesn't succeed pindex"));
 454                 rv = vm_reserv_from_page(msucc);
 455                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 456                         goto found;
 457         }
 458         rv = NULL;
 459
 460 found:
 461         *msuccp = msucc;
 462
 463         return (rv);
 464 }
 465
 466 /*
 467  * Returns TRUE if the given reservation contains the given page index and
 468  * FALSE otherwise.
 469  */
 470 static __inline boolean_t
 471 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
 472 {
 473
 474         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
 475 }
 476
 477 /*
 478  * Increases the given reservation's population count.  Moves the reservation
 479  * to the tail of the partially populated reservation queue.
 480  *
 481  * The free page queue must be locked.
 482  */
 483 static void
 484 vm_reserv_populate(vm_reserv_t rv, int index)
 485 {
 486
 487         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
 488         KASSERT(rv->object != NULL,
 489             ("vm_reserv_populate: reserv %p is free", rv));
 490         KASSERT(popmap_is_clear(rv->popmap, index),
 491             ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
 492             index));
 493         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 494             ("vm_reserv_populate: reserv %p is already full", rv));
 495         KASSERT(rv->pages->psind == 0,
 496             ("vm_reserv_populate: reserv %p is already promoted", rv));
 497         KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
 498             ("vm_reserv_populate: reserv %p's domain is corrupted %d",
 499             rv, rv->domain));
 500         if (rv->inpartpopq) {
 501                 TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
 502                 rv->inpartpopq = FALSE;
 503         }
 504         popmap_set(rv->popmap, index);
 505         rv->popcnt++;
 506         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 507                 rv->inpartpopq = TRUE;
 508                 TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq);
 509         } else
 510                 rv->pages->psind = 1;
 511 }
 512
 513 /*
 514  * Allocates a contiguous set of physical pages of the given size "npages"
 515  * from existing or newly created reservations.  All of the physical pages
 516  * must be at or above the given physical address "low" and below the given
 517  * physical address "high".  The given value "alignment" determines the
 518  * alignment of the first physical page in the set.  If the given value
 519  * "boundary" is non-zero, then the set of physical pages cannot cross any
 520  * physical address boundary that is a multiple of that value.  Both
 521  * "alignment" and "boundary" must be a power of two.
 522  *
 523  * The page "mpred" must immediately precede the offset "pindex" within the
 524  * specified object.
 525  *
 526  * The object and free page queue must be locked.
 527  */
 528 vm_page_t
 529 vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex,
 530     int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
 531     u_long alignment, vm_paddr_t boundary, vm_page_t mpred)
 532 {
 533         struct vm_domain *vmd;
 534         vm_paddr_t pa, size;
 535         vm_page_t m, msucc;
 536         vm_reserv_t rv;
 537         int i, index;
 538
 539         VM_OBJECT_ASSERT_WLOCKED(object);
 540         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 541
 542         /*
 543          * Is a reservation fundamentally impossible?
 544          */
 545         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 546             pindex + npages > object->size || object->resident_page_count == 0)
 547                 return (NULL);
 548
 549         /*
 550          * All reservations of a particular size have the same alignment.
 551          * Assuming that the first page is allocated from a reservation, the
 552          * least significant bits of its physical address can be determined
 553          * from its offset from the beginning of the reservation and the size
 554          * of the reservation.
 555          *
 556          * Could the specified index within a reservation of the smallest
 557          * possible size satisfy the alignment and boundary requirements?
 558          */
 559         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
 560         if ((pa & (alignment - 1)) != 0)
 561                 return (NULL);
 562         size = npages << PAGE_SHIFT;
 563         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 564                 return (NULL);
 565
 566         /*
 567          * Look for an existing reservation.
 568          */
 569         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
 570         if (rv == NULL)
 571                 return (NULL);
 572         KASSERT(object != kernel_object || rv->domain == domain,
 573             ("vm_reserv_extend_contig: Domain mismatch from reservation."));
 574         index = VM_RESERV_INDEX(object, pindex);
 575         /* Does the allocation fit within the reservation? */
 576         if (index + npages > VM_LEVEL_0_NPAGES)
 577                 return (NULL);
 578         domain = rv->domain;
 579         vmd = VM_DOMAIN(domain);
 580         vm_domain_free_lock(vmd);
 581         if (rv->object != object || !vm_domain_available(vmd, req, npages)) {
 582                 m = NULL;
 583                 goto out;
 584         }
 585         m = &rv->pages[index];
 586         pa = VM_PAGE_TO_PHYS(m);
 587         if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
 588             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
 589                 m = NULL;
 590                 goto out;
 591         }
 592         /* Handle vm_page_rename(m, new_object, ...). */
 593         for (i = 0; i < npages; i++) {
 594                 if (popmap_is_set(rv->popmap, index + i)) {
 595                         m = NULL;
 596                         goto out;
 597                 }
 598         }
 599         for (i = 0; i < npages; i++)
 600                 vm_reserv_populate(rv, index + i);
 601         vm_domain_freecnt_adj(vmd, -npages);
 602 out:
 603         vm_domain_free_unlock(vmd);
 604         return (m);
 605 }
 606
 607 /*
 608  * Allocates a contiguous set of physical pages of the given size "npages"
 609  * from existing or newly created reservations.  All of the physical pages
 610  * must be at or above the given physical address "low" and below the given
 611  * physical address "high".  The given value "alignment" determines the
 612  * alignment of the first physical page in the set.  If the given value
 613  * "boundary" is non-zero, then the set of physical pages cannot cross any
 614  * physical address boundary that is a multiple of that value.  Both
 615  * "alignment" and "boundary" must be a power of two.
 616  *
 617  * The page "mpred" must immediately precede the offset "pindex" within the
 618  * specified object.
 619  *
 620  * The object and free page queue must be locked.
 621  */
 622 vm_page_t
 623 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain,
 624     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
 625     vm_paddr_t boundary, vm_page_t mpred)
 626 {
 627         vm_paddr_t pa, size;
 628         vm_page_t m, m_ret, msucc;
 629         vm_pindex_t first, leftcap, rightcap;
 630         vm_reserv_t rv;
 631         u_long allocpages, maxpages, minpages;
 632         int i, index, n;
 633
 634         vm_domain_free_assert_locked(VM_DOMAIN(domain));
 635         VM_OBJECT_ASSERT_WLOCKED(object);
 636         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 637
 638         /*
 639          * Is a reservation fundamentally impossible?
 640          */
 641         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 642             pindex + npages > object->size)
 643                 return (NULL);
 644
 645         /*
 646          * All reservations of a particular size have the same alignment.
 647          * Assuming that the first page is allocated from a reservation, the
 648          * least significant bits of its physical address can be determined
 649          * from its offset from the beginning of the reservation and the size
 650          * of the reservation.
 651          *
 652          * Could the specified index within a reservation of the smallest
 653          * possible size satisfy the alignment and boundary requirements?
 654          */
 655         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
 656         if ((pa & (alignment - 1)) != 0)
 657                 return (NULL);
 658         size = npages << PAGE_SHIFT;
 659         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 660                 return (NULL);
 661
 662         /*
 663          * Callers should've extended an existing reservation prior to
 664          * calling this function.  If a reservation exists it is
 665          * incompatible with the allocation.
 666          */
 667         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
 668         if (rv != NULL)
 669                 return (NULL);
 670
 671         /*
 672          * Could at least one reservation fit between the first index to the
 673          * left that can be used ("leftcap") and the first index to the right
 674          * that cannot be used ("rightcap")?
 675          *
 676          * We must synchronize with the reserv object lock to protect the
 677          * pindex/object of the resulting reservations against rename while
 678          * we are inspecting.
 679          */
 680         first = pindex - VM_RESERV_INDEX(object, pindex);
 681         minpages = VM_RESERV_INDEX(object, pindex) + npages;
 682         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
 683         allocpages = maxpages;
 684         vm_reserv_object_lock(object);
 685         if (mpred != NULL) {
 686                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 687                         leftcap = mpred->pindex + 1;
 688                 else
 689                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 690                 if (leftcap > first) {
 691                         vm_reserv_object_unlock(object);
 692                         return (NULL);
 693                 }
 694         }
 695         if (msucc != NULL) {
 696                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 697                         rightcap = msucc->pindex;
 698                 else
 699                         rightcap = rv->pindex;
 700                 if (first + maxpages > rightcap) {
 701                         if (maxpages == VM_LEVEL_0_NPAGES) {
 702                                 vm_reserv_object_unlock(object);
 703                                 return (NULL);
 704                         }
 705
 706                         /*
 707                          * At least one reservation will fit between "leftcap"
 708                          * and "rightcap".  However, a reservation for the
 709                          * last of the requested pages will not fit.  Reduce
 710                          * the size of the upcoming allocation accordingly.
 711                          */
 712                         allocpages = minpages;
 713                 }
 714         }
 715         vm_reserv_object_unlock(object);
 716
 717         /*
 718          * Would the last new reservation extend past the end of the object?
 719          */
 720         if (first + maxpages > object->size) {
 721                 /*
 722                  * Don't allocate the last new reservation if the object is a
 723                  * vnode or backed by another object that is a vnode.
 724                  */
 725                 if (object->type == OBJT_VNODE ||
 726                     (object->backing_object != NULL &&
 727                     object->backing_object->type == OBJT_VNODE)) {
 728                         if (maxpages == VM_LEVEL_0_NPAGES)
 729                                 return (NULL);
 730                         allocpages = minpages;
 731                 }
 732                 /* Speculate that the object may grow. */
 733         }
 734
 735         /*
 736          * Allocate the physical pages.  The alignment and boundary specified
 737          * for this allocation may be different from the alignment and
 738          * boundary specified for the requested pages.  For instance, the
 739          * specified index may not be the first page within the first new
 740          * reservation.
 741          */
 742         m = vm_phys_alloc_contig(domain, allocpages, low, high, ulmax(alignment,
 743             VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
 744         if (m == NULL)
 745                 return (NULL);
 746         KASSERT(vm_phys_domain(m) == domain,
 747             ("vm_reserv_alloc_contig: Page domain does not match requested."));
 748
 749         /*
 750          * The allocated physical pages always begin at a reservation
 751          * boundary, but they do not always end at a reservation boundary.
 752          * Initialize every reservation that is completely covered by the
 753          * allocated physical pages.
 754          */
 755         m_ret = NULL;
 756         index = VM_RESERV_INDEX(object, pindex);
 757         do {
 758                 rv = vm_reserv_from_page(m);
 759                 KASSERT(rv->pages == m,
 760                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
 761                     rv));
 762                 vm_reserv_insert(rv, object, first);
 763                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 764                 for (i = 0; i < n; i++)
 765                         vm_reserv_populate(rv, index + i);
 766                 npages -= n;
 767                 if (m_ret == NULL) {
 768                         m_ret = &rv->pages[index];
 769                         index = 0;
 770                 }
 771                 m += VM_LEVEL_0_NPAGES;
 772                 first += VM_LEVEL_0_NPAGES;
 773                 allocpages -= VM_LEVEL_0_NPAGES;
 774         } while (allocpages >= VM_LEVEL_0_NPAGES);
 775         return (m_ret);
 776 }
 777
 778 /*
 779  * Attempts to extend an existing reservation and allocate the page to the
 780  * object.
 781  *
 782  * The page "mpred" must immediately precede the offset "pindex" within the
 783  * specified object.
 784  *
 785  * The object must be locked.
 786  */
 787 vm_page_t
 788 vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain,
 789     vm_page_t mpred)
 790 {
 791         struct vm_domain *vmd;
 792         vm_page_t m, msucc;
 793         vm_reserv_t rv;
 794         int index, free_count;
 795
 796         VM_OBJECT_ASSERT_WLOCKED(object);
 797
 798         /*
 799          * Could a reservation currently exist?
 800          */
 801         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 802             pindex >= object->size || object->resident_page_count == 0)
 803                 return (NULL);
 804
 805         /*
 806          * Look for an existing reservation.
 807          */
 808         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
 809         if (rv == NULL)
 810                 return (NULL);
 811
 812         KASSERT(object != kernel_object || rv->domain == domain,
 813             ("vm_reserv_extend: Domain mismatch from reservation."));
 814         domain = rv->domain;
 815         vmd = VM_DOMAIN(domain);
 816         index = VM_RESERV_INDEX(object, pindex);
 817         m = &rv->pages[index];
 818         vm_domain_free_lock(vmd);
 819         if (vm_domain_available(vmd, req, 1) == 0 ||
 820             /* Handle reclaim race. */
 821             rv->object != object ||
 822             /* Handle vm_page_rename(m, new_object, ...). */
 823             popmap_is_set(rv->popmap, index))
 824                 m = NULL;
 825         if (m != NULL) {
 826                 vm_reserv_populate(rv, index);
 827                 free_count = vm_domain_freecnt_adj(vmd, -1);
 828         } else
 829                 free_count = vmd->vmd_free_count;
 830         vm_domain_free_unlock(vmd);
 831
 832         if (vm_paging_needed(vmd, free_count))
 833                 pagedaemon_wakeup(domain);
 834
 835         return (m);
 836 }
 837
 838 /*
 839  * Allocates a page from an existing reservation.
 840  *
 841  * The page "mpred" must immediately precede the offset "pindex" within the
 842  * specified object.
 843  *
 844  * The object and free page queue must be locked.
 845  */
 846 vm_page_t
 847 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain,
 848     vm_page_t mpred)
 849 {
 850         vm_page_t m, msucc;
 851         vm_pindex_t first, leftcap, rightcap;
 852         vm_reserv_t rv;
 853         int index;
 854
 855         vm_domain_free_assert_locked(VM_DOMAIN(domain));
 856         VM_OBJECT_ASSERT_WLOCKED(object);
 857
 858         /*
 859          * Is a reservation fundamentally impossible?
 860          */
 861         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 862             pindex >= object->size)
 863                 return (NULL);
 864
 865         /*
 866          * Callers should've extended an existing reservation prior to
 867          * calling this function.  If a reservation exists it is
 868          * incompatible with the allocation.
 869          */
 870         rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
 871         if (rv != NULL)
 872                 return (NULL);
 873
 874         /*
 875          * Could a reservation fit between the first index to the left that
 876          * can be used and the first index to the right that cannot be used?
 877          *
 878          * We must synchronize with the reserv object lock to protect the
 879          * pindex/object of the resulting reservations against rename while
 880          * we are inspecting.
 881          */
 882         first = pindex - VM_RESERV_INDEX(object, pindex);
 883         vm_reserv_object_lock(object);
 884         if (mpred != NULL) {
 885                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 886                         leftcap = mpred->pindex + 1;
 887                 else
 888                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 889                 if (leftcap > first) {
 890                         vm_reserv_object_unlock(object);
 891                         return (NULL);
 892                 }
 893         }
 894         if (msucc != NULL) {
 895                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 896                         rightcap = msucc->pindex;
 897                 else
 898                         rightcap = rv->pindex;
 899                 if (first + VM_LEVEL_0_NPAGES > rightcap) {
 900                         vm_reserv_object_unlock(object);
 901                         return (NULL);
 902                 }
 903         }
 904         vm_reserv_object_unlock(object);
 905
 906         /*
 907          * Would a new reservation extend past the end of the object?
 908          */
 909         if (first + VM_LEVEL_0_NPAGES > object->size) {
 910                 /*
 911                  * Don't allocate a new reservation if the object is a vnode or
 912                  * backed by another object that is a vnode.
 913                  */
 914                 if (object->type == OBJT_VNODE ||
 915                     (object->backing_object != NULL &&
 916                     object->backing_object->type == OBJT_VNODE))
 917                         return (NULL);
 918                 /* Speculate that the object may grow. */
 919         }
 920
 921         /*
 922          * Allocate and populate the new reservation.
 923          */
 924         m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
 925         if (m == NULL)
 926                 return (NULL);
 927         rv = vm_reserv_from_page(m);
 928         KASSERT(rv->pages == m,
 929             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 930         vm_reserv_insert(rv, object, first);
 931         index = VM_RESERV_INDEX(object, pindex);
 932         vm_reserv_populate(rv, index);
 933         return (&rv->pages[index]);
 934 }
 935
 936 /*
 937  * Breaks the given reservation.  Except for the specified free page, all free
 938  * pages in the reservation are returned to the physical memory allocator.
 939  * The reservation's population count and map are reset to their initial
 940  * state.
 941  *
 942  * The given reservation must not be in the partially populated reservation
 943  * queue.  The free page queue lock must be held.
 944  */
 945 static void
 946 vm_reserv_break(vm_reserv_t rv, vm_page_t m)
 947 {
 948         int begin_zeroes, hi, i, lo;
 949
 950         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
 951         vm_reserv_remove(rv);
 952         if (m != NULL) {
 953                 /*
 954                  * Since the reservation is being broken, there is no harm in
 955                  * abusing the population map to stop "m" from being returned
 956                  * to the physical memory allocator.
 957                  */
 958                 i = m - rv->pages;
 959                 KASSERT(popmap_is_clear(rv->popmap, i),
 960                     ("vm_reserv_break: reserv %p's popmap is corrupted", rv));
 961                 popmap_set(rv->popmap, i);
 962                 rv->popcnt++;
 963         }
 964         i = hi = 0;
 965         do {
 966                 /* Find the next 0 bit.  Any previous 0 bits are < "hi". */
 967                 lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
 968                 if (lo == 0) {
 969                         /* Redundantly clears bits < "hi". */
 970                         rv->popmap[i] = 0;
 971                         rv->popcnt -= NBPOPMAP - hi;
 972                         while (++i < NPOPMAP) {
 973                                 lo = ffsl(~rv->popmap[i]);
 974                                 if (lo == 0) {
 975                                         rv->popmap[i] = 0;
 976                                         rv->popcnt -= NBPOPMAP;
 977                                 } else
 978                                         break;
 979                         }
 980                         if (i == NPOPMAP)
 981                                 break;
 982                         hi = 0;
 983                 }
 984                 KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
 985                 /* Convert from ffsl() to ordinary bit numbering. */
 986                 lo--;
 987                 if (lo > 0) {
 988                         /* Redundantly clears bits < "hi". */
 989                         rv->popmap[i] &= ~((1UL << lo) - 1);
 990                         rv->popcnt -= lo - hi;
 991                 }
 992                 begin_zeroes = NBPOPMAP * i + lo;
 993                 /* Find the next 1 bit. */
 994                 do
 995                         hi = ffsl(rv->popmap[i]);
 996                 while (hi == 0 && ++i < NPOPMAP);
 997                 if (i != NPOPMAP)
 998                         /* Convert from ffsl() to ordinary bit numbering. */
 999                         hi--;
1000                 vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
1001                     hi - begin_zeroes);
1002         } while (i < NPOPMAP);
1003         KASSERT(rv->popcnt == 0,
1004             ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
1005         vm_reserv_broken++;
1006 }
1007
1008 /*
1009  * Breaks all reservations belonging to the given object.
1010  */
1011 void
1012 vm_reserv_break_all(vm_object_t object)
1013 {
1014         vm_reserv_t rv;
1015         struct vm_domain *vmd;
1016
1017         /*
1018          * This access of object->rvq is unsynchronized so that the
1019          * object rvq lock can nest after the domain_free lock.  We
1020          * must check for races in the results.  However, the object
1021          * lock prevents new additions, so we are guaranteed that when
1022          * it returns NULL the object is properly empty.
1023          */
1024         vmd = NULL;
1025         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
1026                 if (vmd != VM_DOMAIN(rv->domain)) {
1027                         if (vmd != NULL)
1028                                 vm_domain_free_unlock(vmd);
1029                         vmd = VM_DOMAIN(rv->domain);
1030                         vm_domain_free_lock(vmd);
1031                 }
1032                 /* Reclaim race. */
1033                 if (rv->object != object)
1034                         continue;
1035                 KASSERT(rv->object == object,
1036                     ("vm_reserv_break_all: reserv %p is corrupted", rv));
1037                 if (rv->inpartpopq) {
1038                         TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
1039                         rv->inpartpopq = FALSE;
1040                 }
1041                 vm_reserv_break(rv, NULL);
1042         }
1043         if (vmd != NULL)
1044                 vm_domain_free_unlock(vmd);
1045 }
1046
1047 /*
1048  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
1049  * page is freed and FALSE otherwise.
1050  *
1051  * The free page queue lock must be held.
1052  */
1053 boolean_t
1054 vm_reserv_free_page(vm_page_t m)
1055 {
1056         vm_reserv_t rv;
1057
1058         rv = vm_reserv_from_page(m);
1059         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
1060         if (rv->object == NULL)
1061                 return (FALSE);
1062         vm_reserv_depopulate(rv, m - rv->pages);
1063         return (TRUE);
1064 }
1065
1066 /*
1067  * Initializes the reservation management system.  Specifically, initializes
1068  * the reservation array.
1069  *
1070  * Requires that vm_page_array and first_page are initialized!
1071  */
1072 void
1073 vm_reserv_init(void)
1074 {
1075         vm_paddr_t paddr;
1076         struct vm_phys_seg *seg;
1077         int i, segind;
1078
1079         /*
1080          * Initialize the reservation array.  Specifically, initialize the
1081          * "pages" field for every element that has an underlying superpage.
1082          */
1083         for (segind = 0; segind < vm_phys_nsegs; segind++) {
1084                 seg = &vm_phys_segs[segind];
1085                 paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
1086                 while (paddr + VM_LEVEL_0_SIZE <= seg->end) {
1087                         vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
1088                             PHYS_TO_VM_PAGE(paddr);
1089                         vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].domain =
1090                             seg->domain;
1091                         paddr += VM_LEVEL_0_SIZE;
1092                 }
1093         }
1094         for (i = 0; i < MAXMEMDOM; i++)
1095                 TAILQ_INIT(&vm_rvq_partpop[i]);
1096 }
1097
1098 /*
1099  * Returns true if the given page belongs to a reservation and that page is
1100  * free.  Otherwise, returns false.
1101  */
1102 bool
1103 vm_reserv_is_page_free(vm_page_t m)
1104 {
1105         vm_reserv_t rv;
1106
1107         rv = vm_reserv_from_page(m);
1108         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
1109         if (rv->object == NULL)
1110                 return (false);
1111         return (popmap_is_clear(rv->popmap, m - rv->pages));
1112 }
1113
1114 /*
1115  * If the given page belongs to a reservation, returns the level of that
1116  * reservation.  Otherwise, returns -1.
1117  */
1118 int
1119 vm_reserv_level(vm_page_t m)
1120 {
1121         vm_reserv_t rv;
1122
1123         rv = vm_reserv_from_page(m);
1124         return (rv->object != NULL ? 0 : -1);
1125 }
1126
1127 /*
1128  * Returns a reservation level if the given page belongs to a fully populated
1129  * reservation and -1 otherwise.
1130  */
1131 int
1132 vm_reserv_level_iffullpop(vm_page_t m)
1133 {
1134         vm_reserv_t rv;
1135
1136         rv = vm_reserv_from_page(m);
1137         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
1138 }
1139
1140 /*
1141  * Breaks the given partially populated reservation, releasing its free pages
1142  * to the physical memory allocator.
1143  *
1144  * The free page queue lock must be held.
1145  */
1146 static void
1147 vm_reserv_reclaim(vm_reserv_t rv)
1148 {
1149
1150         vm_domain_free_assert_locked(VM_DOMAIN(rv->domain));
1151         KASSERT(rv->inpartpopq,
1152             ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
1153         KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains,
1154             ("vm_reserv_reclaim: reserv %p's domain is corrupted %d",
1155             rv, rv->domain));
1156         TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq);
1157         rv->inpartpopq = FALSE;
1158         vm_reserv_break(rv, NULL);
1159         vm_reserv_reclaimed++;
1160 }
1161
1162 /*
1163  * Breaks the reservation at the head of the partially populated reservation
1164  * queue, releasing its free pages to the physical memory allocator.  Returns
1165  * TRUE if a reservation is broken and FALSE otherwise.
1166  *
1167  * The free page queue lock must be held.
1168  */
1169 boolean_t
1170 vm_reserv_reclaim_inactive(int domain)
1171 {
1172         vm_reserv_t rv;
1173
1174         vm_domain_free_assert_locked(VM_DOMAIN(domain));
1175         if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) {
1176                 vm_reserv_reclaim(rv);
1177                 return (TRUE);
1178         }
1179         return (FALSE);
1180 }
1181
1182 /*
1183  * Searches the partially populated reservation queue for the least recently
1184  * changed reservation with free pages that satisfy the given request for
1185  * contiguous physical memory.  If a satisfactory reservation is found, it is
1186  * broken.  Returns TRUE if a reservation is broken and FALSE otherwise.
1187  *
1188  * The free page queue lock must be held.
1189  */
1190 boolean_t
1191 vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low,
1192     vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1193 {
1194         vm_paddr_t pa, size;
1195         vm_reserv_t rv;
1196         int hi, i, lo, low_index, next_free;
1197
1198         vm_domain_free_assert_locked(VM_DOMAIN(domain));
1199         if (npages > VM_LEVEL_0_NPAGES - 1)
1200                 return (FALSE);
1201         size = npages << PAGE_SHIFT;
1202         TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) {
1203                 pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
1204                 if (pa + PAGE_SIZE - size < low) {
1205                         /* This entire reservation is too low; go to next. */
1206                         continue;
1207                 }
1208                 pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
1209                 if (pa + size > high) {
1210                         /* This entire reservation is too high; go to next. */
1211                         continue;
1212                 }
1213                 if (pa < low) {
1214                         /* Start the search for free pages at "low". */
1215                         low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
1216                         i = low_index / NBPOPMAP;
1217                         hi = low_index % NBPOPMAP;
1218                 } else
1219                         i = hi = 0;
1220                 do {
1221                         /* Find the next free page. */
1222                         lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
1223                         while (lo == 0 && ++i < NPOPMAP)
1224                                 lo = ffsl(~rv->popmap[i]);
1225                         if (i == NPOPMAP)
1226                                 break;
1227                         /* Convert from ffsl() to ordinary bit numbering. */
1228                         lo--;
1229                         next_free = NBPOPMAP * i + lo;
1230                         pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
1231                         KASSERT(pa >= low,
1232                             ("vm_reserv_reclaim_contig: pa is too low"));
1233                         if (pa + size > high) {
1234                                 /* The rest of this reservation is too high. */
1235                                 break;
1236                         } else if ((pa & (alignment - 1)) != 0 ||
1237                             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
1238                                 /*
1239                                  * The current page doesn't meet the alignment
1240                                  * and/or boundary requirements.  Continue
1241                                  * searching this reservation until the rest
1242                                  * of its free pages are either excluded or
1243                                  * exhausted.
1244                                  */
1245                                 hi = lo + 1;
1246                                 if (hi >= NBPOPMAP) {
1247                                         hi = 0;
1248                                         i++;
1249                                 }
1250                                 continue;
1251                         }
1252                         /* Find the next used page. */
1253                         hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
1254                         while (hi == 0 && ++i < NPOPMAP) {
1255                                 if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
1256                                     size) {
1257                                         vm_reserv_reclaim(rv);
1258                                         return (TRUE);
1259                                 }
1260                                 hi = ffsl(rv->popmap[i]);
1261                         }
1262                         /* Convert from ffsl() to ordinary bit numbering. */
1263                         if (i != NPOPMAP)
1264                                 hi--;
1265                         if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
1266                             size) {
1267                                 vm_reserv_reclaim(rv);
1268                                 return (TRUE);
1269                         }
1270                 } while (i < NPOPMAP);
1271         }
1272         return (FALSE);
1273 }
1274
1275 /*
1276  * Transfers the reservation underlying the given page to a new object.
1277  *
1278  * The object must be locked.
1279  */
1280 void
1281 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
1282     vm_pindex_t old_object_offset)
1283 {
1284         vm_reserv_t rv;
1285
1286         VM_OBJECT_ASSERT_WLOCKED(new_object);
1287         rv = vm_reserv_from_page(m);
1288         if (rv->object == old_object) {
1289                 vm_domain_free_lock(VM_DOMAIN(rv->domain));
1290                 if (rv->object == old_object) {
1291                         vm_reserv_object_lock(old_object);
1292                         rv->object = NULL;
1293                         LIST_REMOVE(rv, objq);
1294                         vm_reserv_object_unlock(old_object);
1295                         vm_reserv_object_lock(new_object);
1296                         rv->object = new_object;
1297                         rv->pindex -= old_object_offset;
1298                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
1299                         vm_reserv_object_unlock(new_object);
1300                 }
1301                 vm_domain_free_unlock(VM_DOMAIN(rv->domain));
1302         }
1303 }
1304
1305 /*
1306  * Returns the size (in bytes) of a reservation of the specified level.
1307  */
1308 int
1309 vm_reserv_size(int level)
1310 {
1311
1312         switch (level) {
1313         case 0:
1314                 return (VM_LEVEL_0_SIZE);
1315         case -1:
1316                 return (PAGE_SIZE);
1317         default:
1318                 return (0);
1319         }
1320 }
1321
1322 /*
1323  * Allocates the virtual and physical memory required by the reservation
1324  * management system's data structures, in particular, the reservation array.
1325  */
1326 vm_paddr_t
1327 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
1328 {
1329         vm_paddr_t new_end;
1330         size_t size;
1331         int i;
1332
1333         /*
1334          * Calculate the size (in bytes) of the reservation array.  Round up
1335          * from "high_water" because every small page is mapped to an element
1336          * in the reservation array based on its physical address.  Thus, the
1337          * number of elements in the reservation array can be greater than the
1338          * number of superpages.
1339          */
1340         size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
1341
1342         /*
1343          * Allocate and map the physical memory for the reservation array.  The
1344          * next available virtual address is returned by reference.
1345          */
1346         new_end = end - round_page(size);
1347         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
1348             VM_PROT_READ | VM_PROT_WRITE);
1349         bzero(vm_reserv_array, size);
1350
1351         for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++)
1352                 mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL,
1353                     MTX_DEF);
1354
1355         /*
1356          * Return the next available physical address.
1357          */
1358         return (new_end);
1359 }
1360
1361 /*
1362  * Returns the superpage containing the given page.
1363  */
1364 vm_page_t
1365 vm_reserv_to_superpage(vm_page_t m)
1366 {
1367         vm_reserv_t rv;
1368
1369         VM_OBJECT_ASSERT_LOCKED(m->object);
1370         rv = vm_reserv_from_page(m);
1371         return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
1372             rv->pages : NULL);
1373 }
1374
1375 #endif  /* VM_NRESERVLEVEL > 0 */