sys/vm/vm_reserv.c

   1 /*-
   2  * Copyright (c) 2002-2006 Rice University
   3  * Copyright (c) 2007-2008 Alan L. Cox <alc@cs.rice.edu>
   4  * All rights reserved.
   5  *
   6  * This software was developed for the FreeBSD Project by Alan L. Cox,
   7  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  28  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  *      Superpage reservation management module
  34  *
  35  * Any external functions defined by this module are only to be used by the
  36  * virtual memory system.
  37  */
  38
  39 #include <sys/cdefs.h>
  40 __FBSDID("$FreeBSD$");
  41
  42 #include "opt_vm.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/kernel.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mutex.h>
  49 #include <sys/queue.h>
  50 #include <sys/sbuf.h>
  51 #include <sys/sysctl.h>
  52 #include <sys/systm.h>
  53
  54 #include <vm/vm.h>
  55 #include <vm/vm_param.h>
  56 #include <vm/vm_object.h>
  57 #include <vm/vm_page.h>
  58 #include <vm/vm_phys.h>
  59 #include <vm/vm_reserv.h>
  60
  61 /*
  62  * The reservation system supports the speculative allocation of large physical
  63  * pages ("superpages").  Speculative allocation enables the fully-automatic
  64  * utilization of superpages by the virtual memory system.  In other words, no
  65  * programmatic directives are required to use superpages.
  66  */
  67
  68 #if VM_NRESERVLEVEL > 0
  69
  70 /*
  71  * The number of small pages that are contained in a level 0 reservation
      * (a power of two, so VM_LEVEL_0_NPAGES - 1 can be used as an index mask)
  72  */
  73 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
  74
  75 /*
  76  * The number of bits by which a physical address is shifted to obtain the
  77  * reservation number
  78  */
  79 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
  80
  81 /*
  82  * The size of a level 0 reservation in bytes
      * (VM_LEVEL_0_NPAGES small pages of 1 << PAGE_SHIFT bytes each)
  83  */
  84 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
  85
  86 /*
  87  * Computes the index of the small page underlying the given (object, pindex)
  88  * within the reservation's array of small pages.
      * The object's pg_color is added to pindex before the sum is reduced
      * modulo VM_LEVEL_0_NPAGES.  Each argument is evaluated exactly once.
  89  */
  90 #define VM_RESERV_INDEX(object, pindex) \
  91     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  92
  93 /*
  94  * The reservation structure
  95  *
  96  * A reservation structure is constructed whenever a large physical page is
  97  * speculatively allocated to an object.  The reservation provides the small
  98  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  99  * within that object.  The reservation's "popcnt" tracks the number of these
 100  * small physical pages that are in use at any given time.  When and if the
 101  * reservation is not fully utilized, it appears in the queue of partially-
 102  * populated reservations.  The reservation always appears on the containing
 103  * object's list of reservations.
 104  *
 105  * A partially-populated reservation can be broken and reclaimed at any time.
 106  */
 107 struct vm_reserv {
 108         TAILQ_ENTRY(vm_reserv) partpopq;        /* link in vm_rvq_partpop */
 109         LIST_ENTRY(vm_reserv) objq;             /* link in object's rvq list */
 110         vm_object_t     object;                 /* containing object, NULL if free */
 111         vm_pindex_t     pindex;                 /* offset within object */
 112         vm_page_t       pages;                  /* first page of a superpage */
 113         int             popcnt;                 /* # of pages in use */
 114         char            inpartpopq;             /* TRUE iff on vm_rvq_partpop */
 115 };
 116
 117 /*
 118  * The reservation array
 119  *
 120  * This array is analogous in function to vm_page_array.  It differs in the
 121  * respect that it may contain a greater number of useful reservation
 122  * structures than there are (physical) superpages.  These "invalid"
 123  * reservation structures exist to trade-off space for time in the
 124  * implementation of vm_reserv_from_page().  Invalid reservation structures are
 125  * distinguishable from "valid" reservation structures by inspecting the
 126  * reservation's "pages" field.  Invalid reservation structures have a NULL
 127  * "pages" field.
 128  *
 129  * vm_reserv_from_page() maps a small (physical) page to an element of this
 130  * array by computing a physical reservation number from the page's physical
 131  * address.  The physical reservation number is used as the array index.
 132  *
 133  * An "active" reservation is a valid reservation structure that has a non-NULL
 134  * "object" field and a non-zero "popcnt" field.  In other words, every active
 135  * reservation belongs to a particular object.  Moreover, every active
 136  * reservation has an entry in the containing object's list of reservations.
 137  */
 138 static vm_reserv_t vm_reserv_array;
 139
 140 /*
 141  * The partially-populated reservation queue
 142  *
 143  * This queue enables the fast recovery of an unused cached or free small page
 144  * from a partially-populated reservation.  The reservation at the head of
 145  * this queue is the least-recently-changed, partially-populated reservation.
 146  *
 147  * Access to this queue is synchronized by the free page queue lock.
 148  */
 149 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
 150                             TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);
 151
     /* Parent node for the reservation statistics declared below. */
 152 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
 153
     /* Incremented whenever an active reservation is broken up, e.g., by
      * vm_reserv_break_all() or vm_reserv_reactivate_page(). */
 154 static long vm_reserv_broken;
 155 SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
 156     &vm_reserv_broken, 0, "Cumulative number of broken reservations");
 157
     /* Incremented by vm_reserv_depopulate() when a reservation's population
      * count reaches zero and its superpage is freed as a whole. */
 158 static long vm_reserv_freed;
 159 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
 160     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
 161
     /* Text report on vm_rvq_partpop; the handler is defined further below. */
 162 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
 163
 164 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
 165     sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
 166
     /* Incremented by vm_reserv_reclaim() for each reservation it breaks. */
 167 static long vm_reserv_reclaimed;
 168 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
 169     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
 170
     /* Forward declarations of the file-local helpers. */
 171 static void             vm_reserv_depopulate(vm_reserv_t rv);
 172 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
 173 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
 174                             vm_pindex_t pindex);
 175 static void             vm_reserv_populate(vm_reserv_t rv);
 176 static void             vm_reserv_reclaim(vm_reserv_t rv);
 177
 178 /*
 179  * Describes the current state of the partially-populated reservation queue.
 180  */
 181 static int
 182 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 183 {
 184         struct sbuf sbuf;
 185         vm_reserv_t rv;
 186         int counter, error, level, unused_pages;
 187
 188         error = sysctl_wire_old_buffer(req, 0);
 189         if (error != 0)
 190                 return (error);
 191         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 192         sbuf_printf(&sbuf, "\nLEVEL     SIZE  NUMBER\n\n");
 193         for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
 194                 counter = 0;
 195                 unused_pages = 0;
 196                 mtx_lock(&vm_page_queue_free_mtx);
 197                 TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) {
 198                         counter++;
 199                         unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 200                 }
 201                 mtx_unlock(&vm_page_queue_free_mtx);
 202                 sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level,
 203                     unused_pages * ((int)PAGE_SIZE / 1024), counter);
 204         }
 205         error = sbuf_finish(&sbuf);
 206         sbuf_delete(&sbuf);
 207         return (error);
 208 }
 209
 210 /*
 211  * Reduces the given reservation's population count.  If the population count
 212  * becomes zero, the reservation is destroyed.  Additionally, moves the
 213  * reservation to the tail of the partially-populated reservations queue if the
 214  * population count is non-zero.
 215  *
 216  * The free page queue lock must be held.
 217  */
 218 static void
 219 vm_reserv_depopulate(vm_reserv_t rv)
 220 {
 221
 222         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 223         KASSERT(rv->object != NULL,
 224             ("vm_reserv_depopulate: reserv %p is free", rv));
 225         KASSERT(rv->popcnt > 0,
 226             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 227         if (rv->inpartpopq) {
 228                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 229                 rv->inpartpopq = FALSE;
 230         }
 231         rv->popcnt--;
 232         if (rv->popcnt == 0) {
 233                 LIST_REMOVE(rv, objq);
 234                 rv->object = NULL;
 235                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
 236                 vm_reserv_freed++;
 237         } else {
 238                 rv->inpartpopq = TRUE;
 239                 TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 240         }
 241 }
 242
 243 /*
 244  * Returns the reservation to which the given page might belong.
 245  */
 246 static __inline vm_reserv_t
 247 vm_reserv_from_page(vm_page_t m)
 248 {
 249
 250         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
 251 }
 252
 253 /*
 254  * Returns TRUE if the given reservation contains the given page index and
 255  * FALSE otherwise.
 256  */
 257 static __inline boolean_t
 258 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
 259 {
 260
 261         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
 262 }
 263
 264 /*
 265  * Increases the given reservation's population count.  Moves the reservation
 266  * to the tail of the partially-populated reservation queue.
 267  *
 268  * The free page queue must be locked.
 269  */
 270 static void
 271 vm_reserv_populate(vm_reserv_t rv)
 272 {
 273
 274         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 275         KASSERT(rv->object != NULL,
 276             ("vm_reserv_populate: reserv %p is free", rv));
 277         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 278             ("vm_reserv_populate: reserv %p is already full", rv));
 279         if (rv->inpartpopq) {
 280                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 281                 rv->inpartpopq = FALSE;
 282         }
 283         rv->popcnt++;
 284         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 285                 rv->inpartpopq = TRUE;
 286                 TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 287         }
 288 }
 289
 290 /*
 291  * Allocates a contiguous set of physical pages of the given size "npages"
 292  * from an existing or newly-created reservation.  All of the physical pages
 293  * must be at or above the given physical address "low" and below the given
 294  * physical address "high".  The given value "alignment" determines the
 295  * alignment of the first physical page in the set.  If the given value
 296  * "boundary" is non-zero, then the set of physical pages cannot cross any
 297  * physical address boundary that is a multiple of that value.  Both
 298  * "alignment" and "boundary" must be a power of two.
 299  *
 300  * The object and free page queue must be locked.
 301  */
 302 vm_page_t
 303 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
 304     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 305 {
 306         vm_paddr_t pa, size;
 307         vm_page_t m, m_ret, mpred, msucc;
 308         vm_pindex_t first, leftcap, rightcap;
 309         vm_reserv_t rv;
 310         u_long allocpages, maxpages, minpages;
 311         int i, index, n;
 312
 313         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 314         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 315         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 316
 317         /*
 318          * Is a reservation fundamentally impossible?
 319          */
 320         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 321             pindex + npages > object->size)
 322                 return (NULL);
 323
 324         /*
 325          * All reservations of a particular size have the same alignment.
 326          * Assuming that the first page is allocated from a reservation, the
 327          * least significant bits of its physical address can be determined
 328          * from its offset from the beginning of the reservation and the size
 329          * of the reservation.
 330          *
 331          * Could the specified index within a reservation of the smallest
 332          * possible size satisfy the alignment and boundary requirements?
 333          */
 334         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
 335         if ((pa & (alignment - 1)) != 0)
 336                 return (NULL);
 337         size = npages << PAGE_SHIFT;
 338         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 339                 return (NULL);
 340
 341         /*
 342          * Look for an existing reservation.
 343          */
 344         mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK);
 345         if (mpred != NULL) {
 346                 KASSERT(mpred->pindex != pindex,
 347                     ("vm_reserv_alloc_contig: pindex already allocated"));
 348                 rv = vm_reserv_from_page(mpred);
 349                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 350                         goto found;
 351         }
 352         msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK);
 353         if (msucc != NULL) {
 354                 KASSERT(msucc->pindex != pindex,
 355                     ("vm_reserv_alloc_page: pindex already allocated"));
 356                 rv = vm_reserv_from_page(msucc);
 357                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 358                         goto found;
 359         }
 360
 361         /*
 362          * Could at least one reservation fit between the first index to the
 363          * left that can be used and the first index to the right that cannot
 364          * be used?
 365          */
 366         first = pindex - VM_RESERV_INDEX(object, pindex);
 367         if (mpred != NULL) {
 368                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 369                         leftcap = mpred->pindex + 1;
 370                 else
 371                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 372                 if (leftcap > first)
 373                         return (NULL);
 374         }
 375         minpages = VM_RESERV_INDEX(object, pindex) + npages;
 376         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
 377         allocpages = maxpages;
 378         if (msucc != NULL) {
 379                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 380                         rightcap = msucc->pindex;
 381                 else
 382                         rightcap = rv->pindex;
 383                 if (first + maxpages > rightcap) {
 384                         if (maxpages == VM_LEVEL_0_NPAGES)
 385                                 return (NULL);
 386                         allocpages = minpages;
 387                 }
 388         }
 389
 390         /*
 391          * Would the last new reservation extend past the end of the object?
 392          */
 393         if (first + maxpages > object->size) {
 394                 /*
 395                  * Don't allocate the last new reservation if the object is a
 396                  * vnode or backed by another object that is a vnode.
 397                  */
 398                 if (object->type == OBJT_VNODE ||
 399                     (object->backing_object != NULL &&
 400                     object->backing_object->type == OBJT_VNODE)) {
 401                         if (maxpages == VM_LEVEL_0_NPAGES)
 402                                 return (NULL);
 403                         allocpages = minpages;
 404                 }
 405                 /* Speculate that the object may grow. */
 406         }
 407
 408         /*
 409          * Allocate and populate the new reservations.  The alignment and
 410          * boundary specified for this allocation may be different from the
 411          * alignment and boundary specified for the requested pages.  For
 412          * instance, the specified index may not be the first page within the
 413          * first new reservation.
 414          */
 415         m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
 416             VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
 417         if (m == NULL)
 418                 return (NULL);
 419         m_ret = NULL;
 420         index = VM_RESERV_INDEX(object, pindex);
 421         do {
 422                 rv = vm_reserv_from_page(m);
 423                 KASSERT(rv->pages == m,
 424                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
 425                     rv));
 426                 KASSERT(rv->object == NULL,
 427                     ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
 428                 LIST_INSERT_HEAD(&object->rvq, rv, objq);
 429                 rv->object = object;
 430                 rv->pindex = first;
 431                 KASSERT(rv->popcnt == 0,
 432                     ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
 433                     rv));
 434                 KASSERT(!rv->inpartpopq,
 435                     ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
 436                     rv));
 437                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 438                 for (i = 0; i < n; i++)
 439                         vm_reserv_populate(rv);
 440                 npages -= n;
 441                 if (m_ret == NULL) {
 442                         m_ret = &rv->pages[index];
 443                         index = 0;
 444                 }
 445                 m += VM_LEVEL_0_NPAGES;
 446                 first += VM_LEVEL_0_NPAGES;
 447                 allocpages -= VM_LEVEL_0_NPAGES;
 448         } while (allocpages > 0);
 449         return (m_ret);
 450
 451         /*
 452          * Found a matching reservation.
 453          */
 454 found:
 455         index = VM_RESERV_INDEX(object, pindex);
 456         /* Does the allocation fit within the reservation? */
 457         if (index + npages > VM_LEVEL_0_NPAGES)
 458                 return (NULL);
 459         m = &rv->pages[index];
 460         pa = VM_PAGE_TO_PHYS(m);
 461         if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
 462             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 463                 return (NULL);
 464         /* Handle vm_page_rename(m, new_object, ...). */
 465         for (i = 0; i < npages; i++)
 466                 if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
 467                         return (NULL);
 468         for (i = 0; i < npages; i++)
 469                 vm_reserv_populate(rv);
 470         return (m);
 471 }
 472
 473 /*
 474  * Allocates a page from an existing or newly-created reservation.
 475  *
 476  * The object and free page queue must be locked.
 477  */
 478 vm_page_t
 479 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
 480 {
 481         vm_page_t m, mpred, msucc;
 482         vm_pindex_t first, leftcap, rightcap;
 483         vm_reserv_t rv;
 484
 485         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 486         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 487
 488         /*
 489          * Is a reservation fundamentally impossible?
 490          */
 491         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 492             pindex >= object->size)
 493                 return (NULL);
 494
 495         /*
 496          * Look for an existing reservation.
 497          */
 498         mpred = vm_radix_lookup_le(&object->rtree, pindex, VM_RADIX_BLACK);
 499         if (mpred != NULL) {
 500                 KASSERT(mpred->pindex != pindex,
 501                     ("vm_reserv_alloc_page: pindex already allocated"));
 502                 rv = vm_reserv_from_page(mpred);
 503                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 504                         goto found;
 505         }
 506         msucc = vm_radix_lookup_ge(&object->rtree, pindex, VM_RADIX_BLACK);
 507         if (msucc != NULL) {
 508                 KASSERT(msucc->pindex != pindex,
 509                     ("vm_reserv_alloc_page: pindex already allocated"));
 510                 rv = vm_reserv_from_page(msucc);
 511                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 512                         goto found;
 513         }
 514
 515         /*
 516          * Could a reservation fit between the first index to the left that
 517          * can be used and the first index to the right that cannot be used?
 518          */
 519         first = pindex - VM_RESERV_INDEX(object, pindex);
 520         if (mpred != NULL) {
 521                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 522                         leftcap = mpred->pindex + 1;
 523                 else
 524                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 525                 if (leftcap > first)
 526                         return (NULL);
 527         }
 528         if (msucc != NULL) {
 529                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 530                         rightcap = msucc->pindex;
 531                 else
 532                         rightcap = rv->pindex;
 533                 if (first + VM_LEVEL_0_NPAGES > rightcap)
 534                         return (NULL);
 535         }
 536
 537         /*
 538          * Would a new reservation extend past the end of the object?
 539          */
 540         if (first + VM_LEVEL_0_NPAGES > object->size) {
 541                 /*
 542                  * Don't allocate a new reservation if the object is a vnode or
 543                  * backed by another object that is a vnode.
 544                  */
 545                 if (object->type == OBJT_VNODE ||
 546                     (object->backing_object != NULL &&
 547                     object->backing_object->type == OBJT_VNODE))
 548                         return (NULL);
 549                 /* Speculate that the object may grow. */
 550         }
 551
 552         /*
 553          * Allocate and populate the new reservation.
 554          */
 555         m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
 556         if (m == NULL)
 557                 return (NULL);
 558         rv = vm_reserv_from_page(m);
 559         KASSERT(rv->pages == m,
 560             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 561         KASSERT(rv->object == NULL,
 562             ("vm_reserv_alloc_page: reserv %p isn't free", rv));
 563         LIST_INSERT_HEAD(&object->rvq, rv, objq);
 564         rv->object = object;
 565         rv->pindex = first;
 566         KASSERT(rv->popcnt == 0,
 567             ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
 568         KASSERT(!rv->inpartpopq,
 569             ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
 570         vm_reserv_populate(rv);
 571         return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
 572
 573         /*
 574          * Found a matching reservation.
 575          */
 576 found:
 577         m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
 578         /* Handle vm_page_rename(m, new_object, ...). */
 579         if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
 580                 return (NULL);
 581         vm_reserv_populate(rv);
 582         return (m);
 583 }
 584
 585 /*
 586  * Breaks all reservations belonging to the given object.
 587  */
 588 void
 589 vm_reserv_break_all(vm_object_t object)
 590 {
 591         vm_reserv_t rv;
 592         int i;
 593
 594         mtx_lock(&vm_page_queue_free_mtx);
 595         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
 596                 KASSERT(rv->object == object,
 597                     ("vm_reserv_break_all: reserv %p is corrupted", rv));
 598                 if (rv->inpartpopq) {
 599                         TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 600                         rv->inpartpopq = FALSE;
 601                 }
 602                 LIST_REMOVE(rv, objq);
 603                 rv->object = NULL;
 604                 for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 605                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 606                                 vm_phys_free_pages(&rv->pages[i], 0);
 607                         else
 608                                 rv->popcnt--;
 609                 }
 610                 KASSERT(rv->popcnt == 0,
 611                     ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
 612                     rv));
 613                 vm_reserv_broken++;
 614         }
 615         mtx_unlock(&vm_page_queue_free_mtx);
 616 }
 617
 618 /*
 619  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 620  * page is freed and FALSE otherwise.
 621  *
 622  * The free page queue lock must be held.
 623  */
 624 boolean_t
 625 vm_reserv_free_page(vm_page_t m)
 626 {
 627         vm_reserv_t rv;
 628
 629         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 630         rv = vm_reserv_from_page(m);
 631         if (rv->object == NULL)
 632                 return (FALSE);
 633         if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE)
 634                 vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages,
 635                     VM_LEVEL_0_ORDER);
 636         vm_reserv_depopulate(rv);
 637         return (TRUE);
 638 }
 639
 640 /*
 641  * Initializes the reservation management system.  Specifically, initializes
 642  * the reservation array.
 643  *
 644  * Requires that vm_page_array and first_page are initialized!
 645  */
 646 void
 647 vm_reserv_init(void)
 648 {
 649         vm_paddr_t paddr;
 650         int i;
 651
 652         /*
 653          * Initialize the reservation array.  Specifically, initialize the
 654          * "pages" field for every element that has an underlying superpage.
 655          */
 656         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 657                 paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
 658                 while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
 659                         vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
 660                             PHYS_TO_VM_PAGE(paddr);
 661                         paddr += VM_LEVEL_0_SIZE;
 662                 }
 663         }
 664 }
 665
 666 /*
 667  * Returns a reservation level if the given page belongs to a fully-populated
 668  * reservation and -1 otherwise.
 669  */
 670 int
 671 vm_reserv_level_iffullpop(vm_page_t m)
 672 {
 673         vm_reserv_t rv;
 674
 675         rv = vm_reserv_from_page(m);
 676         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 677 }
 678
 679 /*
 680  * Prepare for the reactivation of a cached page.
 681  *
 682  * First, suppose that the given page "m" was allocated individually, i.e., not
 683  * as part of a reservation, and cached.  Then, suppose a reservation
 684  * containing "m" is allocated by the same object.  Although "m" and the
 685  * reservation belong to the same object, "m"'s pindex may not match the
 686  * reservation's.
 687  *
 688  * The free page queue must be locked.
 689  */
 690 boolean_t
 691 vm_reserv_reactivate_page(vm_page_t m)
 692 {
 693         vm_reserv_t rv;
 694         int i, m_index;
 695
 696         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 697         rv = vm_reserv_from_page(m);
 698         if (rv->object == NULL)
 699                 return (FALSE);
 700         KASSERT((m->flags & PG_CACHED) != 0,
 701             ("vm_reserv_uncache_page: page %p is not cached", m));
 702         if (m->object == rv->object &&
 703             m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
 704                 vm_reserv_populate(rv);
 705         else {
 706                 KASSERT(rv->inpartpopq,
 707                     ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE",
 708                     rv));
 709                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 710                 rv->inpartpopq = FALSE;
 711                 LIST_REMOVE(rv, objq);
 712                 rv->object = NULL;
 713                 /* Don't vm_phys_free_pages(m, 0). */
 714                 m_index = m - rv->pages;
 715                 for (i = 0; i < m_index; i++) {
 716                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 717                                 vm_phys_free_pages(&rv->pages[i], 0);
 718                         else
 719                                 rv->popcnt--;
 720                 }
 721                 for (i++; i < VM_LEVEL_0_NPAGES; i++) {
 722                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 723                                 vm_phys_free_pages(&rv->pages[i], 0);
 724                         else
 725                                 rv->popcnt--;
 726                 }
 727                 KASSERT(rv->popcnt == 0,
 728                     ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted",
 729                     rv));
 730                 vm_reserv_broken++;
 731         }
 732         return (TRUE);
 733 }
 734
 735 /*
 736  * Breaks the given partially-populated reservation, releasing its cached and
 737  * free pages to the physical memory allocator.
 738  *
 739  * The free page queue lock must be held.
 740  */
 741 static void
 742 vm_reserv_reclaim(vm_reserv_t rv)
 743 {
 744         int i;
 745
 746         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 747         KASSERT(rv->inpartpopq,
 748             ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
 749         TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 750         rv->inpartpopq = FALSE;
 751         KASSERT(rv->object != NULL,
 752             ("vm_reserv_reclaim: reserv %p is free", rv));
 753         LIST_REMOVE(rv, objq);
 754         rv->object = NULL;
 755         for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 756                 if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 757                         vm_phys_free_pages(&rv->pages[i], 0);
 758                 else
 759                         rv->popcnt--;
 760         }
 761         KASSERT(rv->popcnt == 0,
 762             ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
 763         vm_reserv_reclaimed++;
 764 }
 765
 766 /*
 767  * Breaks the reservation at the head of the partially-populated reservation
 768  * queue, releasing its cached and free pages to the physical memory
 769  * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
 770  *
 771  * The free page queue lock must be held.
 772  */
 773 boolean_t
 774 vm_reserv_reclaim_inactive(void)
 775 {
 776         vm_reserv_t rv;
 777
 778         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 779         if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) {
 780                 vm_reserv_reclaim(rv);
 781                 return (TRUE);
 782         }
 783         return (FALSE);
 784 }
 785
 786 /*
 787  * Searches the partially-populated reservation queue for the least recently
 788  * active reservation with unused pages, i.e., cached or free, that satisfy the
 789  * given request for contiguous physical memory.  If a satisfactory reservation
 790  * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
 791  * otherwise.
 792  *
 793  * The free page queue lock must be held.
 794  */
 795 boolean_t
 796 vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
 797     u_long alignment, vm_paddr_t boundary)
 798 {
 799         vm_paddr_t pa, pa_length, size;
 800         vm_reserv_t rv;
 801         int i;
 802
 803         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 804         if (npages > VM_LEVEL_0_NPAGES - 1)
 805                 return (FALSE);
 806         size = npages << PAGE_SHIFT;
 807         TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 808                 pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 809                 if (pa + PAGE_SIZE - size < low) {
 810                         /* this entire reservation is too low; go to next */
 811                         continue;
 812                 }
 813                 pa_length = 0;
 814                 for (i = 0; i < VM_LEVEL_0_NPAGES; i++)
 815                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
 816                                 pa_length += PAGE_SIZE;
 817                                 if (pa_length == PAGE_SIZE) {
 818                                         pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
 819                                         if (pa + size > high) {
 820                                                 /* skip to next reservation */
 821                                                 break;
 822                                         } else if (pa < low ||
 823                                             (pa & (alignment - 1)) != 0 ||
 824                                             ((pa ^ (pa + size - 1)) &
 825                                             ~(boundary - 1)) != 0)
 826                                                 pa_length = 0;
 827                                 }
 828                                 if (pa_length >= size) {
 829                                         vm_reserv_reclaim(rv);
 830                                         return (TRUE);
 831                                 }
 832                         } else
 833                                 pa_length = 0;
 834         }
 835         return (FALSE);
 836 }
 837
 838 /*
 839  * Transfers the reservation underlying the given page to a new object.
 840  *
 841  * The object must be locked.
 842  */
 843 void
 844 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 845     vm_pindex_t old_object_offset)
 846 {
 847         vm_reserv_t rv;
 848
 849         VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
 850         rv = vm_reserv_from_page(m);
 851         if (rv->object == old_object) {
 852                 mtx_lock(&vm_page_queue_free_mtx);
 853                 if (rv->object == old_object) {
 854                         LIST_REMOVE(rv, objq);
 855                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 856                         rv->object = new_object;
 857                         rv->pindex -= old_object_offset;
 858                 }
 859                 mtx_unlock(&vm_page_queue_free_mtx);
 860         }
 861 }
 862
 863 /*
 864  * Allocates the virtual and physical memory required by the reservation
 865  * management system's data structures, in particular, the reservation array.
 866  */
 867 vm_paddr_t
 868 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
 869 {
 870         vm_paddr_t new_end;
 871         size_t size;
 872
 873         /*
 874          * Calculate the size (in bytes) of the reservation array.  Round up
 875          * from "high_water" because every small page is mapped to an element
 876          * in the reservation array based on its physical address.  Thus, the
 877          * number of elements in the reservation array can be greater than the
 878          * number of superpages.
 879          */
 880         size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
 881
 882         /*
 883          * Allocate and map the physical memory for the reservation array.  The
 884          * next available virtual address is returned by reference.
 885          */
 886         new_end = end - round_page(size);
 887         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 888             VM_PROT_READ | VM_PROT_WRITE);
 889         bzero(vm_reserv_array, size);
 890
 891         /*
 892          * Return the next available physical address.
 893          */
 894         return (new_end);
 895 }
 896
 897 #endif  /* VM_NRESERVLEVEL > 0 */