sys/vm/vm_reserv.c

   1 /*-
   2  * Copyright (c) 2002-2006 Rice University
   3  * Copyright (c) 2007-2008 Alan L. Cox <alc@cs.rice.edu>
   4  * All rights reserved.
   5  *
   6  * This software was developed for the FreeBSD Project by Alan L. Cox,
   7  * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
  22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  28  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*
  33  *      Superpage reservation management module
  34  *
  35  * Any external functions defined by this module are only to be used by the
  36  * virtual memory system.
  37  */
  38
  39 #include <sys/cdefs.h>
  40 __FBSDID("$FreeBSD$");
  41
  42 #include "opt_vm.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/kernel.h>
  46 #include <sys/lock.h>
  47 #include <sys/malloc.h>
  48 #include <sys/mutex.h>
  49 #include <sys/queue.h>
  50 #include <sys/sbuf.h>
  51 #include <sys/sysctl.h>
  52 #include <sys/systm.h>
  53
  54 #include <vm/vm.h>
  55 #include <vm/vm_param.h>
  56 #include <vm/vm_object.h>
  57 #include <vm/vm_page.h>
  58 #include <vm/vm_phys.h>
  59 #include <vm/vm_reserv.h>
  60
  61 /*
  62  * The reservation system supports the speculative allocation of large physical
  63  * pages ("superpages").  Speculative allocation enables the fully-automatic
  64  * utilization of superpages by the virtual memory system.  In other words, no
  65  * programmatic directives are required to use superpages.
  66  */
  67
  68 #if VM_NRESERVLEVEL > 0
  69
   70 /*
   71  * The number of small pages that are contained in a level 0 reservation.
       * Because it is formed by shifting 1, the value is always a power of two,
       * which is what allows (VM_LEVEL_0_NPAGES - 1) to be used as a bit mask.
   72  */
   73 #define VM_LEVEL_0_NPAGES       (1 << VM_LEVEL_0_ORDER)
   74
   75 /*
   76  * The number of bits by which a physical address is shifted to obtain the
   77  * reservation number: the level 0 order plus the small-page shift.
   78  */
   79 #define VM_LEVEL_0_SHIFT        (VM_LEVEL_0_ORDER + PAGE_SHIFT)
   80
   81 /*
   82  * The size of a level 0 reservation in bytes.  The shifted constant is a
       * plain int, so VM_LEVEL_0_SHIFT must be smaller than the width of int.
   83  */
   84 #define VM_LEVEL_0_SIZE         (1 << VM_LEVEL_0_SHIFT)
   85
   86 /*
   87  * Computes the index of the small page underlying the given (object, pindex)
   88  * within the reservation's array of small pages.  The object's "pg_color" is
       * added to "pindex" and the sum is reduced modulo VM_LEVEL_0_NPAGES by
       * masking.  Each macro argument is evaluated exactly once.
   89  */
   90 #define VM_RESERV_INDEX(object, pindex) \
   91     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
  92
   93 /*
   94  * The reservation structure
   95  *
   96  * A reservation structure is constructed whenever a large physical page is
   97  * speculatively allocated to an object.  The reservation provides the small
   98  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
   99  * within that object.  The reservation's "popcnt" tracks the number of these
  100  * small physical pages that are in use at any given time.  When and if the
  101  * reservation is not fully utilized, it appears in the queue of partially-
  102  * populated reservations.  The reservation always appears on the containing
  103  * object's list of reservations.
  104  *
  105  * A partially-populated reservation can be broken and reclaimed at any time.
  106  */
  107 struct vm_reserv {
  108         TAILQ_ENTRY(vm_reserv) partpopq;        /* partially-populated queue link */
  109         LIST_ENTRY(vm_reserv) objq;             /* object's reservation list link */
  110         vm_object_t     object;                 /* containing object */
  111         vm_pindex_t     pindex;                 /* offset within object */
  112         vm_page_t       pages;                  /* first page of a superpage */
  113         int             popcnt;                 /* # of pages in use */
  114         char            inpartpopq;             /* TRUE iff linked via partpopq */
  115 };
 116
  117 /*
  118  * The reservation array
  119  *
  120  * This array is analogous in function to vm_page_array.  It differs in the
  121  * respect that it may contain a greater number of useful reservation
  122  * structures than there are (physical) superpages.  These "invalid"
  123  * reservation structures exist to trade-off space for time in the
  124  * implementation of vm_reserv_from_page().  Invalid reservation structures are
  125  * distinguishable from "valid" reservation structures by inspecting the
  126  * reservation's "pages" field.  Invalid reservation structures have a NULL
  127  * "pages" field.
  128  *
  129  * vm_reserv_from_page() maps a small (physical) page to an element of this
  130  * array by computing a physical reservation number from the page's physical
  131  * address.  The physical reservation number is used as the array index.
  132  *
  133  * An "active" reservation is a valid reservation structure that has a non-NULL
  134  * "object" field and a non-zero "popcnt" field.  In other words, every active
  135  * reservation belongs to a particular object.  Moreover, every active
  136  * reservation has an entry in the containing object's list of reservations.
  137  */
  138 static vm_reserv_t vm_reserv_array;
 139
  140 /*
  141  * The partially-populated reservation queue
  142  *
  143  * This queue enables the fast recovery of an unused cached or free small page
  144  * from a partially-populated reservation.  The reservation at the head of
  145  * this queue is the least-recently-changed, partially-populated reservation,
       * because every population change re-inserts the reservation at the tail.
  146  *
  147  * Access to this queue is synchronized by the free page queue lock
       * (vm_page_queue_free_mtx).
  148  */
  149 static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop =
  150                             TAILQ_HEAD_INITIALIZER(vm_rvq_partpop);
 151
      /* Read-only sysctl node "reserv", parented at _vm, holding the entries below. */
  152 static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info");
  153
      /*
       * Incremented once per reservation broken by vm_reserv_break_all() or by
       * vm_reserv_reactivate_page().
       */
  154 static long vm_reserv_broken;
  155 SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD,
  156     &vm_reserv_broken, 0, "Cumulative number of broken reservations");
  157
      /*
       * Incremented by vm_reserv_depopulate() whenever a reservation's population
       * count drops to zero and its pages are returned to the physical allocator.
       */
  158 static long vm_reserv_freed;
  159 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
  160     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
  161
      /* Handler for the string-valued "partpopq" entry registered just below. */
  162 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
  163
  164 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
  165     sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
  166
      /* Incremented once per reservation broken by vm_reserv_reclaim(). */
  167 static long vm_reserv_reclaimed;
  168 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
  169     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
  170
      /* Forward declarations of the file-local helpers defined later in this file. */
  171 static void             vm_reserv_depopulate(vm_reserv_t rv);
  172 static vm_reserv_t      vm_reserv_from_page(vm_page_t m);
  173 static boolean_t        vm_reserv_has_pindex(vm_reserv_t rv,
  174                             vm_pindex_t pindex);
  175 static void             vm_reserv_populate(vm_reserv_t rv);
  176 static void             vm_reserv_reclaim(vm_reserv_t rv);
 177
 178 /*
 179  * Describes the current state of the partially-populated reservation queue.
 180  */
 181 static int
 182 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 183 {
 184         struct sbuf sbuf;
 185         vm_reserv_t rv;
 186         int counter, error, level, unused_pages;
 187
 188         error = sysctl_wire_old_buffer(req, 0);
 189         if (error != 0)
 190                 return (error);
 191         sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 192         sbuf_printf(&sbuf, "\nLEVEL     SIZE  NUMBER\n\n");
 193         for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) {
 194                 counter = 0;
 195                 unused_pages = 0;
 196                 mtx_lock(&vm_page_queue_free_mtx);
 197                 TAILQ_FOREACH(rv, &vm_rvq_partpop/*[level]*/, partpopq) {
 198                         counter++;
 199                         unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt;
 200                 }
 201                 mtx_unlock(&vm_page_queue_free_mtx);
 202                 sbuf_printf(&sbuf, "%5d: %6dK, %6d\n", level,
 203                     unused_pages * ((int)PAGE_SIZE / 1024), counter);
 204         }
 205         error = sbuf_finish(&sbuf);
 206         sbuf_delete(&sbuf);
 207         return (error);
 208 }
 209
 210 /*
 211  * Reduces the given reservation's population count.  If the population count
 212  * becomes zero, the reservation is destroyed.  Additionally, moves the
 213  * reservation to the tail of the partially-populated reservations queue if the
 214  * population count is non-zero.
 215  *
 216  * The free page queue lock must be held.
 217  */
 218 static void
 219 vm_reserv_depopulate(vm_reserv_t rv)
 220 {
 221
 222         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 223         KASSERT(rv->object != NULL,
 224             ("vm_reserv_depopulate: reserv %p is free", rv));
 225         KASSERT(rv->popcnt > 0,
 226             ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 227         if (rv->inpartpopq) {
 228                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 229                 rv->inpartpopq = FALSE;
 230         }
 231         rv->popcnt--;
 232         if (rv->popcnt == 0) {
 233                 LIST_REMOVE(rv, objq);
 234                 rv->object = NULL;
 235                 vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER);
 236                 vm_reserv_freed++;
 237         } else {
 238                 rv->inpartpopq = TRUE;
 239                 TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 240         }
 241 }
 242
 243 /*
 244  * Returns the reservation to which the given page might belong.
 245  */
 246 static __inline vm_reserv_t
 247 vm_reserv_from_page(vm_page_t m)
 248 {
 249
 250         return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]);
 251 }
 252
 253 /*
 254  * Returns TRUE if the given reservation contains the given page index and
 255  * FALSE otherwise.
 256  */
 257 static __inline boolean_t
 258 vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex)
 259 {
 260
 261         return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0);
 262 }
 263
 264 /*
 265  * Increases the given reservation's population count.  Moves the reservation
 266  * to the tail of the partially-populated reservation queue.
 267  *
 268  * The free page queue must be locked.
 269  */
 270 static void
 271 vm_reserv_populate(vm_reserv_t rv)
 272 {
 273
 274         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 275         KASSERT(rv->object != NULL,
 276             ("vm_reserv_populate: reserv %p is free", rv));
 277         KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 278             ("vm_reserv_populate: reserv %p is already full", rv));
 279         if (rv->inpartpopq) {
 280                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 281                 rv->inpartpopq = FALSE;
 282         }
 283         rv->popcnt++;
 284         if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 285                 rv->inpartpopq = TRUE;
 286                 TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
 287         }
 288 }
 289
 290 /*
 291  * Allocates a contiguous set of physical pages of the given size "npages"
 292  * from an existing or newly-created reservation.  All of the physical pages
 293  * must be at or above the given physical address "low" and below the given
 294  * physical address "high".  The given value "alignment" determines the
 295  * alignment of the first physical page in the set.  If the given value
 296  * "boundary" is non-zero, then the set of physical pages cannot cross any
 297  * physical address boundary that is a multiple of that value.  Both
 298  * "alignment" and "boundary" must be a power of two.
 299  *
 300  * The object and free page queue must be locked.
 301  */
 302 vm_page_t
 303 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
 304     vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 305 {
 306         vm_paddr_t pa, size;
 307         vm_page_t m, m_ret, mpred, msucc;
 308         vm_pindex_t first, leftcap, rightcap;
 309         vm_reserv_t rv;
 310         u_long allocpages, maxpages, minpages;
 311         int i, index, n;
 312
 313         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 314         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 315         KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 316
 317         /*
 318          * Is a reservation fundamentally impossible?
 319          */
 320         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 321             pindex + npages > object->size)
 322                 return (NULL);
 323
 324         /*
 325          * All reservations of a particular size have the same alignment.
 326          * Assuming that the first page is allocated from a reservation, the
 327          * least significant bits of its physical address can be determined
 328          * from its offset from the beginning of the reservation and the size
 329          * of the reservation.
 330          *
 331          * Could the specified index within a reservation of the smallest
 332          * possible size satisfy the alignment and boundary requirements?
 333          */
 334         pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
 335         if ((pa & (alignment - 1)) != 0)
 336                 return (NULL);
 337         size = npages << PAGE_SHIFT;
 338         if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 339                 return (NULL);
 340
 341         /*
 342          * Look for an existing reservation.
 343          */
 344         msucc = NULL;
 345         mpred = object->root;
 346         while (mpred != NULL) {
 347                 KASSERT(mpred->pindex != pindex,
 348                     ("vm_reserv_alloc_contig: pindex already allocated"));
 349                 rv = vm_reserv_from_page(mpred);
 350                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 351                         goto found;
 352                 else if (mpred->pindex < pindex) {
 353                         if (msucc != NULL ||
 354                             (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
 355                                 break;
 356                         KASSERT(msucc->pindex != pindex,
 357                     ("vm_reserv_alloc_contig: pindex already allocated"));
 358                         rv = vm_reserv_from_page(msucc);
 359                         if (rv->object == object &&
 360                             vm_reserv_has_pindex(rv, pindex))
 361                                 goto found;
 362                         else if (pindex < msucc->pindex)
 363                                 break;
 364                 } else if (msucc == NULL) {
 365                         msucc = mpred;
 366                         mpred = TAILQ_PREV(msucc, pglist, listq);
 367                         continue;
 368                 }
 369                 msucc = NULL;
 370                 mpred = object->root = vm_page_splay(pindex, object->root);
 371         }
 372
 373         /*
 374          * Could at least one reservation fit between the first index to the
 375          * left that can be used and the first index to the right that cannot
 376          * be used?
 377          */
 378         first = pindex - VM_RESERV_INDEX(object, pindex);
 379         if (mpred != NULL) {
 380                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 381                         leftcap = mpred->pindex + 1;
 382                 else
 383                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 384                 if (leftcap > first)
 385                         return (NULL);
 386         }
 387         minpages = VM_RESERV_INDEX(object, pindex) + npages;
 388         maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
 389         allocpages = maxpages;
 390         if (msucc != NULL) {
 391                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 392                         rightcap = msucc->pindex;
 393                 else
 394                         rightcap = rv->pindex;
 395                 if (first + maxpages > rightcap) {
 396                         if (maxpages == VM_LEVEL_0_NPAGES)
 397                                 return (NULL);
 398                         allocpages = minpages;
 399                 }
 400         }
 401
 402         /*
 403          * Would the last new reservation extend past the end of the object?
 404          */
 405         if (first + maxpages > object->size) {
 406                 /*
 407                  * Don't allocate the last new reservation if the object is a
 408                  * vnode or backed by another object that is a vnode.
 409                  */
 410                 if (object->type == OBJT_VNODE ||
 411                     (object->backing_object != NULL &&
 412                     object->backing_object->type == OBJT_VNODE)) {
 413                         if (maxpages == VM_LEVEL_0_NPAGES)
 414                                 return (NULL);
 415                         allocpages = minpages;
 416                 }
 417                 /* Speculate that the object may grow. */
 418         }
 419
 420         /*
 421          * Allocate and populate the new reservations.  The alignment and
 422          * boundary specified for this allocation may be different from the
 423          * alignment and boundary specified for the requested pages.  For
 424          * instance, the specified index may not be the first page within the
 425          * first new reservation.
 426          */
 427         m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
 428             VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
 429         if (m == NULL)
 430                 return (NULL);
 431         m_ret = NULL;
 432         index = VM_RESERV_INDEX(object, pindex);
 433         do {
 434                 rv = vm_reserv_from_page(m);
 435                 KASSERT(rv->pages == m,
 436                     ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
 437                     rv));
 438                 KASSERT(rv->object == NULL,
 439                     ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
 440                 LIST_INSERT_HEAD(&object->rvq, rv, objq);
 441                 rv->object = object;
 442                 rv->pindex = first;
 443                 KASSERT(rv->popcnt == 0,
 444                     ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
 445                     rv));
 446                 KASSERT(!rv->inpartpopq,
 447                     ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
 448                     rv));
 449                 n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 450                 for (i = 0; i < n; i++)
 451                         vm_reserv_populate(rv);
 452                 npages -= n;
 453                 if (m_ret == NULL) {
 454                         m_ret = &rv->pages[index];
 455                         index = 0;
 456                 }
 457                 m += VM_LEVEL_0_NPAGES;
 458                 first += VM_LEVEL_0_NPAGES;
 459                 allocpages -= VM_LEVEL_0_NPAGES;
 460         } while (allocpages > 0);
 461         return (m_ret);
 462
 463         /*
 464          * Found a matching reservation.
 465          */
 466 found:
 467         index = VM_RESERV_INDEX(object, pindex);
 468         /* Does the allocation fit within the reservation? */
 469         if (index + npages > VM_LEVEL_0_NPAGES)
 470                 return (NULL);
 471         m = &rv->pages[index];
 472         pa = VM_PAGE_TO_PHYS(m);
 473         if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
 474             ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
 475                 return (NULL);
 476         /* Handle vm_page_rename(m, new_object, ...). */
 477         for (i = 0; i < npages; i++)
 478                 if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
 479                         return (NULL);
 480         for (i = 0; i < npages; i++)
 481                 vm_reserv_populate(rv);
 482         return (m);
 483 }
 484
 485 /*
 486  * Allocates a page from an existing or newly-created reservation.
 487  *
 488  * The object and free page queue must be locked.
 489  */
 490 vm_page_t
 491 vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
 492 {
 493         vm_page_t m, mpred, msucc;
 494         vm_pindex_t first, leftcap, rightcap;
 495         vm_reserv_t rv;
 496
 497         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 498         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 499
 500         /*
 501          * Is a reservation fundamentally impossible?
 502          */
 503         if (pindex < VM_RESERV_INDEX(object, pindex) ||
 504             pindex >= object->size)
 505                 return (NULL);
 506
 507         /*
 508          * Look for an existing reservation.
 509          */
 510         msucc = NULL;
 511         mpred = object->root;
 512         while (mpred != NULL) {
 513                 KASSERT(mpred->pindex != pindex,
 514                     ("vm_reserv_alloc_page: pindex already allocated"));
 515                 rv = vm_reserv_from_page(mpred);
 516                 if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 517                         goto found;
 518                 else if (mpred->pindex < pindex) {
 519                         if (msucc != NULL ||
 520                             (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
 521                                 break;
 522                         KASSERT(msucc->pindex != pindex,
 523                             ("vm_reserv_alloc_page: pindex already allocated"));
 524                         rv = vm_reserv_from_page(msucc);
 525                         if (rv->object == object &&
 526                             vm_reserv_has_pindex(rv, pindex))
 527                                 goto found;
 528                         else if (pindex < msucc->pindex)
 529                                 break;
 530                 } else if (msucc == NULL) {
 531                         msucc = mpred;
 532                         mpred = TAILQ_PREV(msucc, pglist, listq);
 533                         continue;
 534                 }
 535                 msucc = NULL;
 536                 mpred = object->root = vm_page_splay(pindex, object->root);
 537         }
 538
 539         /*
 540          * Could a reservation fit between the first index to the left that
 541          * can be used and the first index to the right that cannot be used?
 542          */
 543         first = pindex - VM_RESERV_INDEX(object, pindex);
 544         if (mpred != NULL) {
 545                 if ((rv = vm_reserv_from_page(mpred))->object != object)
 546                         leftcap = mpred->pindex + 1;
 547                 else
 548                         leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
 549                 if (leftcap > first)
 550                         return (NULL);
 551         }
 552         if (msucc != NULL) {
 553                 if ((rv = vm_reserv_from_page(msucc))->object != object)
 554                         rightcap = msucc->pindex;
 555                 else
 556                         rightcap = rv->pindex;
 557                 if (first + VM_LEVEL_0_NPAGES > rightcap)
 558                         return (NULL);
 559         }
 560
 561         /*
 562          * Would a new reservation extend past the end of the object?
 563          */
 564         if (first + VM_LEVEL_0_NPAGES > object->size) {
 565                 /*
 566                  * Don't allocate a new reservation if the object is a vnode or
 567                  * backed by another object that is a vnode.
 568                  */
 569                 if (object->type == OBJT_VNODE ||
 570                     (object->backing_object != NULL &&
 571                     object->backing_object->type == OBJT_VNODE))
 572                         return (NULL);
 573                 /* Speculate that the object may grow. */
 574         }
 575
 576         /*
 577          * Allocate and populate the new reservation.
 578          */
 579         m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
 580         if (m == NULL)
 581                 return (NULL);
 582         rv = vm_reserv_from_page(m);
 583         KASSERT(rv->pages == m,
 584             ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
 585         KASSERT(rv->object == NULL,
 586             ("vm_reserv_alloc_page: reserv %p isn't free", rv));
 587         LIST_INSERT_HEAD(&object->rvq, rv, objq);
 588         rv->object = object;
 589         rv->pindex = first;
 590         KASSERT(rv->popcnt == 0,
 591             ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
 592         KASSERT(!rv->inpartpopq,
 593             ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
 594         vm_reserv_populate(rv);
 595         return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
 596
 597         /*
 598          * Found a matching reservation.
 599          */
 600 found:
 601         m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
 602         /* Handle vm_page_rename(m, new_object, ...). */
 603         if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
 604                 return (NULL);
 605         vm_reserv_populate(rv);
 606         return (m);
 607 }
 608
 609 /*
 610  * Breaks all reservations belonging to the given object.
 611  */
 612 void
 613 vm_reserv_break_all(vm_object_t object)
 614 {
 615         vm_reserv_t rv;
 616         int i;
 617
 618         mtx_lock(&vm_page_queue_free_mtx);
 619         while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
 620                 KASSERT(rv->object == object,
 621                     ("vm_reserv_break_all: reserv %p is corrupted", rv));
 622                 if (rv->inpartpopq) {
 623                         TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 624                         rv->inpartpopq = FALSE;
 625                 }
 626                 LIST_REMOVE(rv, objq);
 627                 rv->object = NULL;
 628                 for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 629                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 630                                 vm_phys_free_pages(&rv->pages[i], 0);
 631                         else
 632                                 rv->popcnt--;
 633                 }
 634                 KASSERT(rv->popcnt == 0,
 635                     ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
 636                     rv));
 637                 vm_reserv_broken++;
 638         }
 639         mtx_unlock(&vm_page_queue_free_mtx);
 640 }
 641
 642 /*
 643  * Frees the given page if it belongs to a reservation.  Returns TRUE if the
 644  * page is freed and FALSE otherwise.
 645  *
 646  * The free page queue lock must be held.
 647  */
 648 boolean_t
 649 vm_reserv_free_page(vm_page_t m)
 650 {
 651         vm_reserv_t rv;
 652
 653         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 654         rv = vm_reserv_from_page(m);
 655         if (rv->object == NULL)
 656                 return (FALSE);
 657         if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE)
 658                 vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages,
 659                     VM_LEVEL_0_ORDER);
 660         vm_reserv_depopulate(rv);
 661         return (TRUE);
 662 }
 663
 664 /*
 665  * Initializes the reservation management system.  Specifically, initializes
 666  * the reservation array.
 667  *
 668  * Requires that vm_page_array and first_page are initialized!
 669  */
 670 void
 671 vm_reserv_init(void)
 672 {
 673         vm_paddr_t paddr;
 674         int i;
 675
 676         /*
 677          * Initialize the reservation array.  Specifically, initialize the
 678          * "pages" field for every element that has an underlying superpage.
 679          */
 680         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 681                 paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
 682                 while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
 683                         vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
 684                             PHYS_TO_VM_PAGE(paddr);
 685                         paddr += VM_LEVEL_0_SIZE;
 686                 }
 687         }
 688 }
 689
 690 /*
 691  * Returns a reservation level if the given page belongs to a fully-populated
 692  * reservation and -1 otherwise.
 693  */
 694 int
 695 vm_reserv_level_iffullpop(vm_page_t m)
 696 {
 697         vm_reserv_t rv;
 698
 699         rv = vm_reserv_from_page(m);
 700         return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 701 }
 702
 703 /*
 704  * Prepare for the reactivation of a cached page.
 705  *
 706  * First, suppose that the given page "m" was allocated individually, i.e., not
 707  * as part of a reservation, and cached.  Then, suppose a reservation
 708  * containing "m" is allocated by the same object.  Although "m" and the
 709  * reservation belong to the same object, "m"'s pindex may not match the
 710  * reservation's.
 711  *
 712  * The free page queue must be locked.
 713  */
 714 boolean_t
 715 vm_reserv_reactivate_page(vm_page_t m)
 716 {
 717         vm_reserv_t rv;
 718         int i, m_index;
 719
 720         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 721         rv = vm_reserv_from_page(m);
 722         if (rv->object == NULL)
 723                 return (FALSE);
 724         KASSERT((m->flags & PG_CACHED) != 0,
 725             ("vm_reserv_uncache_page: page %p is not cached", m));
 726         if (m->object == rv->object &&
 727             m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
 728                 vm_reserv_populate(rv);
 729         else {
 730                 KASSERT(rv->inpartpopq,
 731                     ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE",
 732                     rv));
 733                 TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 734                 rv->inpartpopq = FALSE;
 735                 LIST_REMOVE(rv, objq);
 736                 rv->object = NULL;
 737                 /* Don't vm_phys_free_pages(m, 0). */
 738                 m_index = m - rv->pages;
 739                 for (i = 0; i < m_index; i++) {
 740                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 741                                 vm_phys_free_pages(&rv->pages[i], 0);
 742                         else
 743                                 rv->popcnt--;
 744                 }
 745                 for (i++; i < VM_LEVEL_0_NPAGES; i++) {
 746                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 747                                 vm_phys_free_pages(&rv->pages[i], 0);
 748                         else
 749                                 rv->popcnt--;
 750                 }
 751                 KASSERT(rv->popcnt == 0,
 752                     ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted",
 753                     rv));
 754                 vm_reserv_broken++;
 755         }
 756         return (TRUE);
 757 }
 758
 759 /*
 760  * Breaks the given partially-populated reservation, releasing its cached and
 761  * free pages to the physical memory allocator.
 762  *
 763  * The free page queue lock must be held.
 764  */
 765 static void
 766 vm_reserv_reclaim(vm_reserv_t rv)
 767 {
 768         int i;
 769
 770         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 771         KASSERT(rv->inpartpopq,
 772             ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
 773         TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 774         rv->inpartpopq = FALSE;
 775         KASSERT(rv->object != NULL,
 776             ("vm_reserv_reclaim: reserv %p is free", rv));
 777         LIST_REMOVE(rv, objq);
 778         rv->object = NULL;
 779         for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
 780                 if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
 781                         vm_phys_free_pages(&rv->pages[i], 0);
 782                 else
 783                         rv->popcnt--;
 784         }
 785         KASSERT(rv->popcnt == 0,
 786             ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
 787         vm_reserv_reclaimed++;
 788 }
 789
 790 /*
 791  * Breaks the reservation at the head of the partially-populated reservation
 792  * queue, releasing its cached and free pages to the physical memory
 793  * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
 794  *
 795  * The free page queue lock must be held.
 796  */
 797 boolean_t
 798 vm_reserv_reclaim_inactive(void)
 799 {
 800         vm_reserv_t rv;
 801
 802         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 803         if ((rv = TAILQ_FIRST(&vm_rvq_partpop)) != NULL) {
 804                 vm_reserv_reclaim(rv);
 805                 return (TRUE);
 806         }
 807         return (FALSE);
 808 }
 809
 810 /*
 811  * Searches the partially-populated reservation queue for the least recently
 812  * active reservation with unused pages, i.e., cached or free, that satisfy the
 813  * given request for contiguous physical memory.  If a satisfactory reservation
 814  * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
 815  * otherwise.
 816  *
 817  * The free page queue lock must be held.
 818  */
 819 boolean_t
 820 vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
 821     u_long alignment, vm_paddr_t boundary)
 822 {
 823         vm_paddr_t pa, pa_length, size;
 824         vm_reserv_t rv;
 825         int i;
 826
 827         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 828         if (npages > VM_LEVEL_0_NPAGES - 1)
 829                 return (FALSE);
 830         size = npages << PAGE_SHIFT;
 831         TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 832                 pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 833                 if (pa + PAGE_SIZE - size < low) {
 834                         /* this entire reservation is too low; go to next */
 835                         continue;
 836                 }
 837                 pa_length = 0;
 838                 for (i = 0; i < VM_LEVEL_0_NPAGES; i++)
 839                         if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
 840                                 pa_length += PAGE_SIZE;
 841                                 if (pa_length == PAGE_SIZE) {
 842                                         pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
 843                                         if (pa + size > high) {
 844                                                 /* skip to next reservation */
 845                                                 break;
 846                                         } else if (pa < low ||
 847                                             (pa & (alignment - 1)) != 0 ||
 848                                             ((pa ^ (pa + size - 1)) &
 849                                             ~(boundary - 1)) != 0)
 850                                                 pa_length = 0;
 851                                 }
 852                                 if (pa_length >= size) {
 853                                         vm_reserv_reclaim(rv);
 854                                         return (TRUE);
 855                                 }
 856                         } else
 857                                 pa_length = 0;
 858         }
 859         return (FALSE);
 860 }
 861
 862 /*
 863  * Transfers the reservation underlying the given page to a new object.
 864  *
 865  * The object must be locked.
 866  */
 867 void
 868 vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object,
 869     vm_pindex_t old_object_offset)
 870 {
 871         vm_reserv_t rv;
 872
 873         VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
 874         rv = vm_reserv_from_page(m);
 875         if (rv->object == old_object) {
 876                 mtx_lock(&vm_page_queue_free_mtx);
 877                 if (rv->object == old_object) {
 878                         LIST_REMOVE(rv, objq);
 879                         LIST_INSERT_HEAD(&new_object->rvq, rv, objq);
 880                         rv->object = new_object;
 881                         rv->pindex -= old_object_offset;
 882                 }
 883                 mtx_unlock(&vm_page_queue_free_mtx);
 884         }
 885 }
 886
 887 /*
 888  * Allocates the virtual and physical memory required by the reservation
 889  * management system's data structures, in particular, the reservation array.
 890  */
 891 vm_paddr_t
 892 vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water)
 893 {
 894         vm_paddr_t new_end;
 895         size_t size;
 896
 897         /*
 898          * Calculate the size (in bytes) of the reservation array.  Round up
 899          * from "high_water" because every small page is mapped to an element
 900          * in the reservation array based on its physical address.  Thus, the
 901          * number of elements in the reservation array can be greater than the
 902          * number of superpages.
 903          */
 904         size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv);
 905
 906         /*
 907          * Allocate and map the physical memory for the reservation array.  The
 908          * next available virtual address is returned by reference.
 909          */
 910         new_end = end - round_page(size);
 911         vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end,
 912             VM_PROT_READ | VM_PROT_WRITE);
 913         bzero(vm_reserv_array, size);
 914
 915         /*
 916          * Return the next available physical address.
 917          */
 918         return (new_end);
 919 }
 920
 921 #endif  /* VM_NRESERVLEVEL > 0 */