sys/vm/vm_page.c

   1 /*-
   2  * Copyright (c) 1991 Regents of the University of California.
   3  * All rights reserved.
   4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * The Mach Operating System project at Carnegie-Mellon University.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
  34  */
  35
  36 /*-
  37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  38  * All rights reserved.
  39  *
  40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  41  *
  42  * Permission to use, copy, modify and distribute this software and
  43  * its documentation is hereby granted, provided that both the copyright
  44  * notice and this permission notice appear in all copies of the
  45  * software, derivative works or modified versions, and any portions
  46  * thereof, and that both notices appear in supporting documentation.
  47  *
  48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  51  *
  52  * Carnegie Mellon requests users of this software to return to
  53  *
  54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  55  *  School of Computer Science
  56  *  Carnegie Mellon University
  57  *  Pittsburgh PA 15213-3890
  58  *
  59  * any improvements or extensions that they make and grant Carnegie the
  60  * rights to redistribute these changes.
  61  */
  62
  63 /*
  64  *                      GENERAL RULES ON VM_PAGE MANIPULATION
  65  *
  66  *      - A page queue lock is required when adding or removing a page from a
  67  *        page queue (vm_pagequeues[]), regardless of other locks or the
  68  *        busy state of a page.
  69  *
  70  *              * In general, no thread besides the page daemon can acquire or
  71  *                hold more than one page queue lock at a time.
  72  *
  73  *              * The page daemon can acquire and hold any pair of page queue
  74  *                locks in any order.
  75  *
  76  *      - The object lock is required when inserting or removing
  77  *        pages from an object (vm_page_insert() or vm_page_remove()).
  78  *
  79  */
  80
  81 /*
  82  *      Resident memory management module.
  83  */
  84
  85 #include <sys/cdefs.h>
  86 __FBSDID("$FreeBSD$");
  87
  88 #include "opt_vm.h"
  89
  90 #include <sys/param.h>
  91 #include <sys/systm.h>
  92 #include <sys/lock.h>
  93 #include <sys/kernel.h>
  94 #include <sys/limits.h>
  95 #include <sys/malloc.h>
  96 #include <sys/mman.h>
  97 #include <sys/msgbuf.h>
  98 #include <sys/mutex.h>
  99 #include <sys/proc.h>
 100 #include <sys/rwlock.h>
 101 #include <sys/sysctl.h>
 102 #include <sys/vmmeter.h>
 103 #include <sys/vnode.h>
 104
 105 #include <vm/vm.h>
 106 #include <vm/pmap.h>
 107 #include <vm/vm_param.h>
 108 #include <vm/vm_kern.h>
 109 #include <vm/vm_object.h>
 110 #include <vm/vm_page.h>
 111 #include <vm/vm_pageout.h>
 112 #include <vm/vm_pager.h>
 113 #include <vm/vm_phys.h>
 114 #include <vm/vm_radix.h>
 115 #include <vm/vm_reserv.h>
 116 #include <vm/vm_extern.h>
 117 #include <vm/uma.h>
 118 #include <vm/uma_int.h>
 119
 120 #include <machine/md_var.h>
 121
 122 /*
 123  *      Associated with page of user-allocatable memory is a
 124  *      page structure.
 125  */
 126
 127 struct vm_pagequeue vm_pagequeues[PQ_COUNT] = {
 128         [PQ_INACTIVE] = {
 129                 .pq_pl = TAILQ_HEAD_INITIALIZER(
 130                     vm_pagequeues[PQ_INACTIVE].pq_pl),
 131                 .pq_cnt = &cnt.v_inactive_count,
 132                 .pq_name = "vm inactive pagequeue"
 133         },
 134         [PQ_ACTIVE] = {
 135                 .pq_pl = TAILQ_HEAD_INITIALIZER(
 136                     vm_pagequeues[PQ_ACTIVE].pq_pl),
 137                 .pq_cnt = &cnt.v_active_count,
 138                 .pq_name = "vm active pagequeue"
 139         }
 140 };
 141 struct mtx_padalign vm_page_queue_free_mtx;
 142
 143 struct mtx_padalign pa_lock[PA_LOCK_COUNT];
 144
 145 vm_page_t vm_page_array;
 146 long vm_page_array_size;
 147 long first_page;
 148 int vm_page_zero_count;
 149
 150 static int boot_pages = UMA_BOOT_PAGES;
 151 TUNABLE_INT("vm.boot_pages", &boot_pages);
 152 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
 153         "number of pages allocated for bootstrapping the VM system");
 154
 155 static int pa_tryrelock_restart;
 156 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
 157     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 158
 159 static uma_zone_t fakepg_zone;
 160
 161 static struct vnode *vm_page_alloc_init(vm_page_t m);
 162 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 163 static void vm_page_enqueue(int queue, vm_page_t m);
 164 static void vm_page_init_fakepg(void *dummy);
 165 static void vm_page_insert_after(vm_page_t m, vm_object_t object,
 166     vm_pindex_t pindex, vm_page_t mpred);
 167
 168 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 169
 170 static void
 171 vm_page_init_fakepg(void *dummy)
 172 {
 173
 174         fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
 175             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 176 }
 177
 178 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
 179 #if PAGE_SIZE == 32768
 180 #ifdef CTASSERT
 181 CTASSERT(sizeof(u_long) >= 8);
 182 #endif
 183 #endif
 184
 185 /*
 186  * Try to acquire a physical address lock while a pmap is locked.  If we
 187  * fail to trylock we unlock and lock the pmap directly and cache the
 188  * locked pa in *locked.  The caller should then restart their loop in case
 189  * the virtual to physical mapping has changed.
 190  */
 191 int
 192 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
 193 {
 194         vm_paddr_t lockpa;
 195
 196         lockpa = *locked;
 197         *locked = pa;
 198         if (lockpa) {
 199                 PA_LOCK_ASSERT(lockpa, MA_OWNED);
 200                 if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
 201                         return (0);
 202                 PA_UNLOCK(lockpa);
 203         }
 204         if (PA_TRYLOCK(pa))
 205                 return (0);
 206         PMAP_UNLOCK(pmap);
 207         atomic_add_int(&pa_tryrelock_restart, 1);
 208         PA_LOCK(pa);
 209         PMAP_LOCK(pmap);
 210         return (EAGAIN);
 211 }
 212
 213 /*
 214  *      vm_set_page_size:
 215  *
 216  *      Sets the page size, perhaps based upon the memory
 217  *      size.  Must be called before any use of page-size
 218  *      dependent functions.
 219  */
 220 void
 221 vm_set_page_size(void)
 222 {
 223         if (cnt.v_page_size == 0)
 224                 cnt.v_page_size = PAGE_SIZE;
 225         if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
 226                 panic("vm_set_page_size: page size not a power of two");
 227 }
 228
 229 /*
 230  *      vm_page_blacklist_lookup:
 231  *
 232  *      See if a physical address in this page has been listed
 233  *      in the blacklist tunable.  Entries in the tunable are
 234  *      separated by spaces or commas.  If an invalid integer is
 235  *      encountered then the rest of the string is skipped.
 236  */
 237 static int
 238 vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
 239 {
 240         vm_paddr_t bad;
 241         char *cp, *pos;
 242
 243         for (pos = list; *pos != '\0'; pos = cp) {
 244                 bad = strtoq(pos, &cp, 0);
 245                 if (*cp != '\0') {
 246                         if (*cp == ' ' || *cp == ',') {
 247                                 cp++;
 248                                 if (cp == pos)
 249                                         continue;
 250                         } else
 251                                 break;
 252                 }
 253                 if (pa == trunc_page(bad))
 254                         return (1);
 255         }
 256         return (0);
 257 }
 258
 259 /*
 260  *      vm_page_startup:
 261  *
 262  *      Initializes the resident memory module.
 263  *
 264  *      Allocates memory for the page cells, and
 265  *      for the object/offset-to-page hash table headers.
 266  *      Each page cell is initialized and placed on the free list.
 267  */
 268 vm_offset_t
 269 vm_page_startup(vm_offset_t vaddr)
 270 {
 271         vm_offset_t mapped;
 272         vm_paddr_t page_range;
 273         vm_paddr_t new_end;
 274         int i;
 275         vm_paddr_t pa;
 276         vm_paddr_t last_pa;
 277         char *list;
 278
 279         /* the biggest memory array is the second group of pages */
 280         vm_paddr_t end;
 281         vm_paddr_t biggestsize;
 282         vm_paddr_t low_water, high_water;
 283         int biggestone;
 284
 285         biggestsize = 0;
 286         biggestone = 0;
 287         vaddr = round_page(vaddr);
 288
 289         for (i = 0; phys_avail[i + 1]; i += 2) {
 290                 phys_avail[i] = round_page(phys_avail[i]);
 291                 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 292         }
 293
 294         low_water = phys_avail[0];
 295         high_water = phys_avail[1];
 296
 297         for (i = 0; phys_avail[i + 1]; i += 2) {
 298                 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
 299
 300                 if (size > biggestsize) {
 301                         biggestone = i;
 302                         biggestsize = size;
 303                 }
 304                 if (phys_avail[i] < low_water)
 305                         low_water = phys_avail[i];
 306                 if (phys_avail[i + 1] > high_water)
 307                         high_water = phys_avail[i + 1];
 308         }
 309
 310 #ifdef XEN
 311         low_water = 0;
 312 #endif
 313
 314         end = phys_avail[biggestone+1];
 315
 316         /*
 317          * Initialize the page and queue locks.
 318          */
 319         mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 320         for (i = 0; i < PA_LOCK_COUNT; i++)
 321                 mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
 322         for (i = 0; i < PQ_COUNT; i++)
 323                 vm_pagequeue_init_lock(&vm_pagequeues[i]);
 324
 325         /*
 326          * Allocate memory for use when boot strapping the kernel memory
 327          * allocator.
 328          */
 329         new_end = end - (boot_pages * UMA_SLAB_SIZE);
 330         new_end = trunc_page(new_end);
 331         mapped = pmap_map(&vaddr, new_end, end,
 332             VM_PROT_READ | VM_PROT_WRITE);
 333         bzero((void *)mapped, end - new_end);
 334         uma_startup((void *)mapped, boot_pages);
 335
 336 #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
 337     defined(__mips__)
 338         /*
 339          * Allocate a bitmap to indicate that a random physical page
 340          * needs to be included in a minidump.
 341          *
 342          * The amd64 port needs this to indicate which direct map pages
 343          * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 344          *
 345          * However, i386 still needs this workspace internally within the
 346          * minidump code.  In theory, they are not needed on i386, but are
 347          * included should the sf_buf code decide to use them.
 348          */
 349         last_pa = 0;
 350         for (i = 0; dump_avail[i + 1] != 0; i += 2)
 351                 if (dump_avail[i + 1] > last_pa)
 352                         last_pa = dump_avail[i + 1];
 353         page_range = last_pa / PAGE_SIZE;
 354         vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 355         new_end -= vm_page_dump_size;
 356         vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 357             new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 358         bzero((void *)vm_page_dump, vm_page_dump_size);
 359 #endif
 360 #ifdef __amd64__
 361         /*
 362          * Request that the physical pages underlying the message buffer be
 363          * included in a crash dump.  Since the message buffer is accessed
 364          * through the direct map, they are not automatically included.
 365          */
 366         pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
 367         last_pa = pa + round_page(msgbufsize);
 368         while (pa < last_pa) {
 369                 dump_add_page(pa);
 370                 pa += PAGE_SIZE;
 371         }
 372 #endif
 373         /*
 374          * Compute the number of pages of memory that will be available for
 375          * use (taking into account the overhead of a page structure per
 376          * page).
 377          */
 378         first_page = low_water / PAGE_SIZE;
 379 #ifdef VM_PHYSSEG_SPARSE
 380         page_range = 0;
 381         for (i = 0; phys_avail[i + 1] != 0; i += 2)
 382                 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
 383 #elif defined(VM_PHYSSEG_DENSE)
 384         page_range = high_water / PAGE_SIZE - first_page;
 385 #else
 386 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 387 #endif
 388         end = new_end;
 389
 390         /*
 391          * Reserve an unmapped guard page to trap access to vm_page_array[-1].
 392          */
 393         vaddr += PAGE_SIZE;
 394
 395         /*
 396          * Initialize the mem entry structures now, and put them in the free
 397          * queue.
 398          */
 399         new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 400         mapped = pmap_map(&vaddr, new_end, end,
 401             VM_PROT_READ | VM_PROT_WRITE);
 402         vm_page_array = (vm_page_t) mapped;
 403 #if VM_NRESERVLEVEL > 0
 404         /*
 405          * Allocate memory for the reservation management system's data
 406          * structures.
 407          */
 408         new_end = vm_reserv_startup(&vaddr, new_end, high_water);
 409 #endif
 410 #if defined(__amd64__) || defined(__mips__)
 411         /*
 412          * pmap_map on amd64 and mips can come out of the direct-map, not kvm
 413          * like i386, so the pages must be tracked for a crashdump to include
 414          * this data.  This includes the vm_page_array and the early UMA
 415          * bootstrap pages.
 416          */
 417         for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
 418                 dump_add_page(pa);
 419 #endif
 420         phys_avail[biggestone + 1] = new_end;
 421
 422         /*
 423          * Clear all of the page structures
 424          */
 425         bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 426         for (i = 0; i < page_range; i++)
 427                 vm_page_array[i].order = VM_NFREEORDER;
 428         vm_page_array_size = page_range;
 429
 430         /*
 431          * Initialize the physical memory allocator.
 432          */
 433         vm_phys_init();
 434
 435         /*
 436          * Add every available physical page that is not blacklisted to
 437          * the free lists.
 438          */
 439         cnt.v_page_count = 0;
 440         cnt.v_free_count = 0;
 441         list = getenv("vm.blacklist");
 442         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 443                 pa = phys_avail[i];
 444                 last_pa = phys_avail[i + 1];
 445                 while (pa < last_pa) {
 446                         if (list != NULL &&
 447                             vm_page_blacklist_lookup(list, pa))
 448                                 printf("Skipping page with pa 0x%jx\n",
 449                                     (uintmax_t)pa);
 450                         else
 451                                 vm_phys_add_page(pa);
 452                         pa += PAGE_SIZE;
 453                 }
 454         }
 455         freeenv(list);
 456 #if VM_NRESERVLEVEL > 0
 457         /*
 458          * Initialize the reservation management system.
 459          */
 460         vm_reserv_init();
 461 #endif
 462         return (vaddr);
 463 }
 464
 465 void
 466 vm_page_reference(vm_page_t m)
 467 {
 468
 469         vm_page_aflag_set(m, PGA_REFERENCED);
 470 }
 471
 472 void
 473 vm_page_busy(vm_page_t m)
 474 {
 475
 476         VM_OBJECT_ASSERT_WLOCKED(m->object);
 477         KASSERT((m->oflags & VPO_BUSY) == 0,
 478             ("vm_page_busy: page already busy!!!"));
 479         m->oflags |= VPO_BUSY;
 480 }
 481
 482 /*
 483  *      vm_page_flash:
 484  *
 485  *      wakeup anyone waiting for the page.
 486  */
 487 void
 488 vm_page_flash(vm_page_t m)
 489 {
 490
 491         VM_OBJECT_ASSERT_WLOCKED(m->object);
 492         if (m->oflags & VPO_WANTED) {
 493                 m->oflags &= ~VPO_WANTED;
 494                 wakeup(m);
 495         }
 496 }
 497
 498 /*
 499  *      vm_page_wakeup:
 500  *
 501  *      clear the VPO_BUSY flag and wakeup anyone waiting for the
 502  *      page.
 503  *
 504  */
 505 void
 506 vm_page_wakeup(vm_page_t m)
 507 {
 508
 509         VM_OBJECT_ASSERT_WLOCKED(m->object);
 510         KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
 511         m->oflags &= ~VPO_BUSY;
 512         vm_page_flash(m);
 513 }
 514
 515 void
 516 vm_page_io_start(vm_page_t m)
 517 {
 518
 519         VM_OBJECT_ASSERT_WLOCKED(m->object);
 520         m->busy++;
 521 }
 522
 523 void
 524 vm_page_io_finish(vm_page_t m)
 525 {
 526
 527         VM_OBJECT_ASSERT_WLOCKED(m->object);
 528         KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
 529         m->busy--;
 530         if (m->busy == 0)
 531                 vm_page_flash(m);
 532 }
 533
 534 /*
 535  * Keep page from being freed by the page daemon
 536  * much of the same effect as wiring, except much lower
 537  * overhead and should be used only for *very* temporary
 538  * holding ("wiring").
 539  */
 540 void
 541 vm_page_hold(vm_page_t mem)
 542 {
 543
 544         vm_page_lock_assert(mem, MA_OWNED);
 545         mem->hold_count++;
 546 }
 547
 548 void
 549 vm_page_unhold(vm_page_t mem)
 550 {
 551
 552         vm_page_lock_assert(mem, MA_OWNED);
 553         --mem->hold_count;
 554         KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
 555         if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
 556                 vm_page_free_toq(mem);
 557 }
 558
 559 /*
 560  *      vm_page_unhold_pages:
 561  *
 562  *      Unhold each of the pages that is referenced by the given array.
 563  */
 564 void
 565 vm_page_unhold_pages(vm_page_t *ma, int count)
 566 {
 567         struct mtx *mtx, *new_mtx;
 568
 569         mtx = NULL;
 570         for (; count != 0; count--) {
 571                 /*
 572                  * Avoid releasing and reacquiring the same page lock.
 573                  */
 574                 new_mtx = vm_page_lockptr(*ma);
 575                 if (mtx != new_mtx) {
 576                         if (mtx != NULL)
 577                                 mtx_unlock(mtx);
 578                         mtx = new_mtx;
 579                         mtx_lock(mtx);
 580                 }
 581                 vm_page_unhold(*ma);
 582                 ma++;
 583         }
 584         if (mtx != NULL)
 585                 mtx_unlock(mtx);
 586 }
 587
 588 vm_page_t
 589 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 590 {
 591         vm_page_t m;
 592
 593 #ifdef VM_PHYSSEG_SPARSE
 594         m = vm_phys_paddr_to_vm_page(pa);
 595         if (m == NULL)
 596                 m = vm_phys_fictitious_to_vm_page(pa);
 597         return (m);
 598 #elif defined(VM_PHYSSEG_DENSE)
 599         long pi;
 600
 601         pi = atop(pa);
 602         if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 603                 m = &vm_page_array[pi - first_page];
 604                 return (m);
 605         }
 606         return (vm_phys_fictitious_to_vm_page(pa));
 607 #else
 608 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 609 #endif
 610 }
 611
 612 /*
 613  *      vm_page_getfake:
 614  *
 615  *      Create a fictitious page with the specified physical address and
 616  *      memory attribute.  The memory attribute is the only the machine-
 617  *      dependent aspect of a fictitious page that must be initialized.
 618  */
 619 vm_page_t
 620 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 621 {
 622         vm_page_t m;
 623
 624         m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 625         vm_page_initfake(m, paddr, memattr);
 626         return (m);
 627 }
 628
 629 void
 630 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 631 {
 632
 633         if ((m->flags & PG_FICTITIOUS) != 0) {
 634                 /*
 635                  * The page's memattr might have changed since the
 636                  * previous initialization.  Update the pmap to the
 637                  * new memattr.
 638                  */
 639                 goto memattr;
 640         }
 641         m->phys_addr = paddr;
 642         m->queue = PQ_NONE;
 643         /* Fictitious pages don't use "segind". */
 644         m->flags = PG_FICTITIOUS;
 645         /* Fictitious pages don't use "order" or "pool". */
 646         m->oflags = VPO_BUSY | VPO_UNMANAGED;
 647         m->wire_count = 1;
 648 memattr:
 649         pmap_page_set_memattr(m, memattr);
 650 }
 651
 652 /*
 653  *      vm_page_putfake:
 654  *
 655  *      Release a fictitious page.
 656  */
 657 void
 658 vm_page_putfake(vm_page_t m)
 659 {
 660
 661         KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 662         KASSERT((m->flags & PG_FICTITIOUS) != 0,
 663             ("vm_page_putfake: bad page %p", m));
 664         uma_zfree(fakepg_zone, m);
 665 }
 666
 667 /*
 668  *      vm_page_updatefake:
 669  *
 670  *      Update the given fictitious page to the specified physical address and
 671  *      memory attribute.
 672  */
 673 void
 674 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 675 {
 676
 677         KASSERT((m->flags & PG_FICTITIOUS) != 0,
 678             ("vm_page_updatefake: bad page %p", m));
 679         m->phys_addr = paddr;
 680         pmap_page_set_memattr(m, memattr);
 681 }
 682
 683 /*
 684  *      vm_page_free:
 685  *
 686  *      Free a page.
 687  */
 688 void
 689 vm_page_free(vm_page_t m)
 690 {
 691
 692         m->flags &= ~PG_ZERO;
 693         vm_page_free_toq(m);
 694 }
 695
 696 /*
 697  *      vm_page_free_zero:
 698  *
 699  *      Free a page to the zerod-pages queue
 700  */
 701 void
 702 vm_page_free_zero(vm_page_t m)
 703 {
 704
 705         m->flags |= PG_ZERO;
 706         vm_page_free_toq(m);
 707 }
 708
 709 /*
 710  * Unbusy and handle the page queueing for a page from the VOP_GETPAGES()
 711  * array which is not the request page.
 712  */
 713 void
 714 vm_page_readahead_finish(vm_page_t m)
 715 {
 716
 717         if (m->valid != 0) {
 718                 /*
 719                  * Since the page is not the requested page, whether
 720                  * it should be activated or deactivated is not
 721                  * obvious.  Empirical results have shown that
 722                  * deactivating the page is usually the best choice,
 723                  * unless the page is wanted by another thread.
 724                  */
 725                 if (m->oflags & VPO_WANTED) {
 726                         vm_page_lock(m);
 727                         vm_page_activate(m);
 728                         vm_page_unlock(m);
 729                 } else {
 730                         vm_page_lock(m);
 731                         vm_page_deactivate(m);
 732                         vm_page_unlock(m);
 733                 }
 734                 vm_page_wakeup(m);
 735         } else {
 736                 /*
 737                  * Free the completely invalid page.  Such page state
 738                  * occurs due to the short read operation which did
 739                  * not covered our page at all, or in case when a read
 740                  * error happens.
 741                  */
 742                 vm_page_lock(m);
 743                 vm_page_free(m);
 744                 vm_page_unlock(m);
 745         }
 746 }
 747
 748 /*
 749  *      vm_page_sleep:
 750  *
 751  *      Sleep and release the page lock.
 752  *
 753  *      The object containing the given page must be locked.
 754  */
 755 void
 756 vm_page_sleep(vm_page_t m, const char *msg)
 757 {
 758
 759         VM_OBJECT_ASSERT_WLOCKED(m->object);
 760         if (mtx_owned(vm_page_lockptr(m)))
 761                 vm_page_unlock(m);
 762
 763         /*
 764          * It's possible that while we sleep, the page will get
 765          * unbusied and freed.  If we are holding the object
 766          * lock, we will assume we hold a reference to the object
 767          * such that even if m->object changes, we can re-lock
 768          * it.
 769          */
 770         m->oflags |= VPO_WANTED;
 771         VM_OBJECT_SLEEP(m->object, m, PVM, msg, 0);
 772 }
 773
 774 /*
 775  *      vm_page_dirty_KBI:              [ internal use only ]
 776  *
 777  *      Set all bits in the page's dirty field.
 778  *
 779  *      The object containing the specified page must be locked if the
 780  *      call is made from the machine-independent layer.
 781  *
 782  *      See vm_page_clear_dirty_mask().
 783  *
 784  *      This function should only be called by vm_page_dirty().
 785  */
 786 void
 787 vm_page_dirty_KBI(vm_page_t m)
 788 {
 789
 790         /* These assertions refer to this operation by its public name. */
 791         KASSERT((m->flags & PG_CACHED) == 0,
 792             ("vm_page_dirty: page in cache!"));
 793         KASSERT(!VM_PAGE_IS_FREE(m),
 794             ("vm_page_dirty: page is free!"));
 795         KASSERT(m->valid == VM_PAGE_BITS_ALL,
 796             ("vm_page_dirty: page is invalid!"));
 797         m->dirty = VM_PAGE_BITS_ALL;
 798 }
 799
 800 /*
 801  *      vm_page_insert:         [ internal use only ]
 802  *
 803  *      Inserts the given mem entry into the object and object list.
 804  *
 805  *      The object must be locked.
 806  */
 807 void
 808 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 809 {
 810         vm_page_t mpred;
 811
 812         VM_OBJECT_ASSERT_WLOCKED(object);
 813         mpred = vm_radix_lookup_le(&object->rtree, pindex);
 814         vm_page_insert_after(m, object, pindex, mpred);
 815 }
 816
 817 /*
 818  *      vm_page_insert_after:
 819  *
 820  *      Inserts the page "m" into the specified object at offset "pindex".
 821  *
 822  *      The page "mpred" must immediately precede the offset "pindex" within
 823  *      the specified object.
 824  *
 825  *      The object must be locked.
 826  */
 827 static void
 828 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
 829     vm_page_t mpred)
 830 {
 831         vm_page_t msucc;
 832
 833         VM_OBJECT_ASSERT_WLOCKED(object);
 834         KASSERT(m->object == NULL,
 835             ("vm_page_insert_after: page already inserted"));
 836         if (mpred != NULL) {
 837                 KASSERT(mpred->object == object ||
 838                     (mpred->flags & PG_SLAB) != 0,
 839                     ("vm_page_insert_after: object doesn't contain mpred"));
 840                 KASSERT(mpred->pindex < pindex,
 841                     ("vm_page_insert_after: mpred doesn't precede pindex"));
 842                 msucc = TAILQ_NEXT(mpred, listq);
 843         } else
 844                 msucc = TAILQ_FIRST(&object->memq);
 845         if (msucc != NULL)
 846                 KASSERT(msucc->pindex > pindex,
 847                     ("vm_page_insert_after: msucc doesn't succeed pindex"));
 848
 849         /*
 850          * Record the object/offset pair in this page
 851          */
 852         m->object = object;
 853         m->pindex = pindex;
 854
 855         /*
 856          * Now link into the object's ordered list of backed pages.
 857          */
 858         if (mpred != NULL)
 859                 TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 860         else
 861                 TAILQ_INSERT_HEAD(&object->memq, m, listq);
 862         vm_radix_insert(&object->rtree, m);
 863
 864         /*
 865          * Show that the object has one more resident page.
 866          */
 867         object->resident_page_count++;
 868
 869         /*
 870          * Hold the vnode until the last page is released.
 871          */
 872         if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 873                 vhold(object->handle);
 874
 875         /*
 876          * Since we are inserting a new and possibly dirty page,
 877          * update the object's OBJ_MIGHTBEDIRTY flag.
 878          */
 879         if (pmap_page_is_write_mapped(m))
 880                 vm_object_set_writeable_dirty(object);
 881 }
 882
 883 /*
 884  *      vm_page_remove:
 885  *
 886  *      Removes the given mem entry from the object/offset-page
 887  *      table and the object page list, but do not invalidate/terminate
 888  *      the backing store.
 889  *
 890  *      The object must be locked.  The page must be locked if it is managed.
 891  */
 892 void
 893 vm_page_remove(vm_page_t m)
 894 {
 895         vm_object_t object;
 896
 897         if ((m->oflags & VPO_UNMANAGED) == 0)
 898                 vm_page_lock_assert(m, MA_OWNED);
 899         if ((object = m->object) == NULL)
 900                 return;
 901         VM_OBJECT_ASSERT_WLOCKED(object);
 902         if (m->oflags & VPO_BUSY) {
 903                 m->oflags &= ~VPO_BUSY;
 904                 vm_page_flash(m);
 905         }
 906
 907         /*
 908          * Now remove from the object's list of backed pages.
 909          */
 910         vm_radix_remove(&object->rtree, m->pindex);
 911         TAILQ_REMOVE(&object->memq, m, listq);
 912
 913         /*
 914          * And show that the object has one fewer resident page.
 915          */
 916         object->resident_page_count--;
 917
 918         /*
 919          * The vnode may now be recycled.
 920          */
 921         if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
 922                 vdrop(object->handle);
 923
 924         m->object = NULL;
 925 }
 926
 927 /*
 928  *      vm_page_lookup:
 929  *
 930  *      Returns the page associated with the object/offset
 931  *      pair specified; if none is found, NULL is returned.
 932  *
 933  *      The object must be locked.
 934  */
 935 vm_page_t
 936 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 937 {
 938
 939         VM_OBJECT_ASSERT_LOCKED(object);
 940         return (vm_radix_lookup(&object->rtree, pindex));
 941 }
 942
 943 /*
 944  *      vm_page_find_least:
 945  *
 946  *      Returns the page associated with the object with least pindex
 947  *      greater than or equal to the parameter pindex, or NULL.
 948  *
 949  *      The object must be locked.
 950  */
 951 vm_page_t
 952 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 953 {
 954         vm_page_t m;
 955
 956         VM_OBJECT_ASSERT_LOCKED(object);
 957         if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
 958                 m = vm_radix_lookup_ge(&object->rtree, pindex);
 959         return (m);
 960 }
 961
 962 /*
 963  * Returns the given page's successor (by pindex) within the object if it is
 964  * resident; if none is found, NULL is returned.
 965  *
 966  * The object must be locked.
 967  */
 968 vm_page_t
 969 vm_page_next(vm_page_t m)
 970 {
 971         vm_page_t next;
 972
 973         VM_OBJECT_ASSERT_WLOCKED(m->object);
 974         if ((next = TAILQ_NEXT(m, listq)) != NULL &&
 975             next->pindex != m->pindex + 1)
 976                 next = NULL;
 977         return (next);
 978 }
 979
 980 /*
 981  * Returns the given page's predecessor (by pindex) within the object if it is
 982  * resident; if none is found, NULL is returned.
 983  *
 984  * The object must be locked.
 985  */
 986 vm_page_t
 987 vm_page_prev(vm_page_t m)
 988 {
 989         vm_page_t prev;
 990
 991         VM_OBJECT_ASSERT_WLOCKED(m->object);
 992         if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
 993             prev->pindex != m->pindex - 1)
 994                 prev = NULL;
 995         return (prev);
 996 }
 997
 998 /*
 999  *      vm_page_rename:
1000  *
1001  *      Move the given memory entry from its
1002  *      current object to the specified target object/offset.
1003  *
1004  *      Note: swap associated with the page must be invalidated by the move.  We
1005  *            have to do this for several reasons:  (1) we aren't freeing the
1006  *            page, (2) we are dirtying the page, (3) the VM system is probably
1007  *            moving the page from object A to B, and will then later move
1008  *            the backing store from A to B and we can't have a conflict.
1009  *
1010  *      Note: we *always* dirty the page.  It is necessary both for the
1011  *            fact that we moved it, and because we may be invalidating
1012  *            swap.  If the page is on the cache, we have to deactivate it
1013  *            or vm_page_dirty() will panic.  Dirty pages are not allowed
1014  *            on the cache.
1015  *
1016  *      The objects must be locked.  The page must be locked if it is managed.
1017  */
1018 void
1019 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1020 {
1021
1022         vm_page_remove(m);
1023         vm_page_insert(m, new_object, new_pindex);
1024         vm_page_dirty(m);
1025 }
1026
1027 /*
1028  *      Convert all of the given object's cached pages that have a
1029  *      pindex within the given range into free pages.  If the value
1030  *      zero is given for "end", then the range's upper bound is
1031  *      infinity.  If the given object is backed by a vnode and it
1032  *      transitions from having one or more cached pages to none, the
1033  *      vnode's hold count is reduced.
1034  */
1035 void
1036 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1037 {
1038         vm_page_t m;
1039         boolean_t empty;
1040
1041         mtx_lock(&vm_page_queue_free_mtx);
1042         if (__predict_false(vm_radix_is_empty(&object->cache))) {
1043                 mtx_unlock(&vm_page_queue_free_mtx);
1044                 return;
1045         }
1046         while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
1047                 if (end != 0 && m->pindex >= end)
1048                         break;
1049                 vm_radix_remove(&object->cache, m->pindex);
1050                 m->object = NULL;
1051                 m->valid = 0;
1052                 /* Clear PG_CACHED and set PG_FREE. */
1053                 m->flags ^= PG_CACHED | PG_FREE;
1054                 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
1055                     ("vm_page_cache_free: page %p has inconsistent flags", m));
1056                 cnt.v_cache_count--;
1057                 cnt.v_free_count++;
1058         }
1059         empty = vm_radix_is_empty(&object->cache);
1060         mtx_unlock(&vm_page_queue_free_mtx);
1061         if (object->type == OBJT_VNODE && empty)
1062                 vdrop(object->handle);
1063 }
1064
1065 /*
1066  *      Returns the cached page that is associated with the given
1067  *      object and offset.  If, however, none exists, returns NULL.
1068  *
1069  *      The free page queue must be locked.
1070  */
1071 static inline vm_page_t
1072 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
1073 {
1074
1075         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1076         return (vm_radix_lookup(&object->cache, pindex));
1077 }
1078
1079 /*
1080  *      Remove the given cached page from its containing object's
1081  *      collection of cached pages.
1082  *
1083  *      The free page queue must be locked.
1084  */
1085 static void
1086 vm_page_cache_remove(vm_page_t m)
1087 {
1088
1089         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1090         KASSERT((m->flags & PG_CACHED) != 0,
1091             ("vm_page_cache_remove: page %p is not cached", m));
1092         vm_radix_remove(&m->object->cache, m->pindex);
1093         m->object = NULL;
1094         cnt.v_cache_count--;
1095 }
1096
1097 /*
1098  *      Transfer all of the cached pages with offset greater than or
1099  *      equal to 'offidxstart' from the original object's cache to the
1100  *      new object's cache.  However, any cached pages with offset
1101  *      greater than or equal to the new object's size are kept in the
1102  *      original object.  Initially, the new object's cache must be
1103  *      empty.  Offset 'offidxstart' in the original object must
1104  *      correspond to offset zero in the new object.
1105  *
1106  *      The new object must be locked.
1107  */
1108 void
1109 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
1110     vm_object_t new_object)
1111 {
1112         vm_page_t m;
1113
1114         /*
1115          * Insertion into an object's collection of cached pages
1116          * requires the object to be locked.  In contrast, removal does
1117          * not.
1118          */
1119         VM_OBJECT_ASSERT_WLOCKED(new_object);
1120         KASSERT(vm_radix_is_empty(&new_object->cache),
1121             ("vm_page_cache_transfer: object %p has cached pages",
1122             new_object));
1123         mtx_lock(&vm_page_queue_free_mtx);
1124         while ((m = vm_radix_lookup_ge(&orig_object->cache,
1125             offidxstart)) != NULL) {
1126                 /*
1127                  * Transfer all of the pages with offset greater than or
1128                  * equal to 'offidxstart' from the original object's
1129                  * cache to the new object's cache.
1130                  */
1131                 if ((m->pindex - offidxstart) >= new_object->size)
1132                         break;
1133                 vm_radix_remove(&orig_object->cache, m->pindex);
1134                 /* Update the page's object and offset. */
1135                 m->object = new_object;
1136                 m->pindex -= offidxstart;
1137                 vm_radix_insert(&new_object->cache, m);
1138         }
1139         mtx_unlock(&vm_page_queue_free_mtx);
1140 }
1141
1142 /*
1143  *      Returns TRUE if a cached page is associated with the given object and
1144  *      offset, and FALSE otherwise.
1145  *
1146  *      The object must be locked.
1147  */
1148 boolean_t
1149 vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
1150 {
1151         vm_page_t m;
1152
1153         /*
1154          * Insertion into an object's collection of cached pages requires the
1155          * object to be locked.  Therefore, if the object is locked and the
1156          * object's collection is empty, there is no need to acquire the free
1157          * page queues lock in order to prove that the specified page doesn't
1158          * exist.
1159          */
1160         VM_OBJECT_ASSERT_WLOCKED(object);
1161         if (__predict_true(vm_object_cache_is_empty(object)))
1162                 return (FALSE);
1163         mtx_lock(&vm_page_queue_free_mtx);
1164         m = vm_page_cache_lookup(object, pindex);
1165         mtx_unlock(&vm_page_queue_free_mtx);
1166         return (m != NULL);
1167 }
1168
1169 /*
1170  *      vm_page_alloc:
1171  *
1172  *      Allocate and return a page that is associated with the specified
1173  *      object and offset pair.  By default, this page has the flag VPO_BUSY
1174  *      set.
1175  *
1176  *      The caller must always specify an allocation class.
1177  *
1178  *      allocation classes:
1179  *      VM_ALLOC_NORMAL         normal process request
1180  *      VM_ALLOC_SYSTEM         system *really* needs a page
1181  *      VM_ALLOC_INTERRUPT      interrupt time request
1182  *
1183  *      optional allocation flags:
1184  *      VM_ALLOC_COUNT(number)  the number of additional pages that the caller
1185  *                              intends to allocate
1186  *      VM_ALLOC_IFCACHED       return page only if it is cached
1187  *      VM_ALLOC_IFNOTCACHED    return NULL, do not reactivate if the page
1188  *                              is cached
1189  *      VM_ALLOC_NOBUSY         do not set the flag VPO_BUSY on the page
1190  *      VM_ALLOC_NODUMP         do not include the page in a kernel core dump
1191  *      VM_ALLOC_NOOBJ          page is not associated with an object and
1192  *                              should not have the flag VPO_BUSY set
1193  *      VM_ALLOC_WIRED          wire the allocated page
1194  *      VM_ALLOC_ZERO           prefer a zeroed page
1195  *
1196  *      This routine may not sleep.
1197  */
1198 vm_page_t
1199 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
1200 {
1201         struct vnode *vp = NULL;
1202         vm_object_t m_object;
1203         vm_page_t m, mpred;
1204         int flags, req_class;
1205
1206         mpred = 0;      /* XXX: pacify gcc */
1207         KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1208             ("vm_page_alloc: inconsistent object/req"));
1209         if (object != NULL)
1210                 VM_OBJECT_ASSERT_WLOCKED(object);
1211
1212         req_class = req & VM_ALLOC_CLASS_MASK;
1213
1214         /*
1215          * The page daemon is allowed to dig deeper into the free page list.
1216          */
1217         if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1218                 req_class = VM_ALLOC_SYSTEM;
1219
1220         if (object != NULL) {
1221                 mpred = vm_radix_lookup_le(&object->rtree, pindex);
1222                 KASSERT(mpred == NULL || mpred->pindex != pindex,
1223                    ("vm_page_alloc: pindex already allocated"));
1224         }
1225         mtx_lock(&vm_page_queue_free_mtx);
1226         if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1227             (req_class == VM_ALLOC_SYSTEM &&
1228             cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1229             (req_class == VM_ALLOC_INTERRUPT &&
1230             cnt.v_free_count + cnt.v_cache_count > 0)) {
1231                 /*
1232                  * Allocate from the free queue if the number of free pages
1233                  * exceeds the minimum for the request class.
1234                  */
1235                 if (object != NULL &&
1236                     (m = vm_page_cache_lookup(object, pindex)) != NULL) {
1237                         if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
1238                                 mtx_unlock(&vm_page_queue_free_mtx);
1239                                 return (NULL);
1240                         }
1241                         if (vm_phys_unfree_page(m))
1242                                 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
1243 #if VM_NRESERVLEVEL > 0
1244                         else if (!vm_reserv_reactivate_page(m))
1245 #else
1246                         else
1247 #endif
1248                                 panic("vm_page_alloc: cache page %p is missing"
1249                                     " from the free queue", m);
1250                 } else if ((req & VM_ALLOC_IFCACHED) != 0) {
1251                         mtx_unlock(&vm_page_queue_free_mtx);
1252                         return (NULL);
1253 #if VM_NRESERVLEVEL > 0
1254                 } else if (object == NULL || (object->flags & (OBJ_COLORED |
1255                     OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
1256                     vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
1257 #else
1258                 } else {
1259 #endif
1260                         m = vm_phys_alloc_pages(object != NULL ?
1261                             VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
1262 #if VM_NRESERVLEVEL > 0
1263                         if (m == NULL && vm_reserv_reclaim_inactive()) {
1264                                 m = vm_phys_alloc_pages(object != NULL ?
1265                                     VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
1266                                     0);
1267                         }
1268 #endif
1269                 }
1270         } else {
1271                 /*
1272                  * Not allocatable, give up.
1273                  */
1274                 mtx_unlock(&vm_page_queue_free_mtx);
1275                 atomic_add_int(&vm_pageout_deficit,
1276                     max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1277                 pagedaemon_wakeup();
1278                 return (NULL);
1279         }
1280
1281         /*
1282          *  At this point we had better have found a good page.
1283          */
1284         KASSERT(m != NULL, ("vm_page_alloc: missing page"));
1285         KASSERT(m->queue == PQ_NONE,
1286             ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
1287         KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
1288         KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
1289         KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
1290         KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
1291         KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1292             ("vm_page_alloc: page %p has unexpected memattr %d", m,
1293             pmap_page_get_memattr(m)));
1294         if ((m->flags & PG_CACHED) != 0) {
1295                 KASSERT((m->flags & PG_ZERO) == 0,
1296                     ("vm_page_alloc: cached page %p is PG_ZERO", m));
1297                 KASSERT(m->valid != 0,
1298                     ("vm_page_alloc: cached page %p is invalid", m));
1299                 if (m->object == object && m->pindex == pindex)
1300                         cnt.v_reactivated++;
1301                 else
1302                         m->valid = 0;
1303                 m_object = m->object;
1304                 vm_page_cache_remove(m);
1305                 if (m_object->type == OBJT_VNODE &&
1306                     vm_object_cache_is_empty(m_object))
1307                         vp = m_object->handle;
1308         } else {
1309                 KASSERT(VM_PAGE_IS_FREE(m),
1310                     ("vm_page_alloc: page %p is not free", m));
1311                 KASSERT(m->valid == 0,
1312                     ("vm_page_alloc: free page %p is valid", m));
1313                 cnt.v_free_count--;
1314         }
1315
1316         /*
1317          * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
1318          * must be cleared before the free page queues lock is released.
1319          */
1320         flags = 0;
1321         if (m->flags & PG_ZERO) {
1322                 vm_page_zero_count--;
1323                 if (req & VM_ALLOC_ZERO)
1324                         flags = PG_ZERO;
1325         }
1326         if (req & VM_ALLOC_NODUMP)
1327                 flags |= PG_NODUMP;
1328         m->flags = flags;
1329         mtx_unlock(&vm_page_queue_free_mtx);
1330         m->aflags = 0;
1331         m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
1332             VPO_UNMANAGED : 0;
1333         if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
1334                 m->oflags |= VPO_BUSY;
1335         if (req & VM_ALLOC_WIRED) {
1336                 /*
1337                  * The page lock is not required for wiring a page until that
1338                  * page is inserted into the object.
1339                  */
1340                 atomic_add_int(&cnt.v_wire_count, 1);
1341                 m->wire_count = 1;
1342         }
1343         m->act_count = 0;
1344
1345         if (object != NULL) {
1346                 /* Ignore device objects; the pager sets "memattr" for them. */
1347                 if (object->memattr != VM_MEMATTR_DEFAULT &&
1348                     (object->flags & OBJ_FICTITIOUS) == 0)
1349                         pmap_page_set_memattr(m, object->memattr);
1350                 vm_page_insert_after(m, object, pindex, mpred);
1351         } else
1352                 m->pindex = pindex;
1353
1354         /*
1355          * The following call to vdrop() must come after the above call
1356          * to vm_page_insert() in case both affect the same object and
1357          * vnode.  Otherwise, the affected vnode's hold count could
1358          * temporarily become zero.
1359          */
1360         if (vp != NULL)
1361                 vdrop(vp);
1362
1363         /*
1364          * Don't wakeup too often - wakeup the pageout daemon when
1365          * we would be nearly out of memory.
1366          */
1367         if (vm_paging_needed())
1368                 pagedaemon_wakeup();
1369
1370         return (m);
1371 }
1372
1373 /*
1374  *      vm_page_alloc_contig:
1375  *
1376  *      Allocate a contiguous set of physical pages of the given size "npages"
1377  *      from the free lists.  All of the physical pages must be at or above
1378  *      the given physical address "low" and below the given physical address
1379  *      "high".  The given value "alignment" determines the alignment of the
1380  *      first physical page in the set.  If the given value "boundary" is
1381  *      non-zero, then the set of physical pages cannot cross any physical
1382  *      address boundary that is a multiple of that value.  Both "alignment"
1383  *      and "boundary" must be a power of two.
1384  *
1385  *      If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
1386  *      then the memory attribute setting for the physical pages is configured
1387  *      to the object's memory attribute setting.  Otherwise, the memory
1388  *      attribute setting for the physical pages is configured to "memattr",
1389  *      overriding the object's memory attribute setting.  However, if the
1390  *      object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
1391  *      memory attribute setting for the physical pages cannot be configured
1392  *      to VM_MEMATTR_DEFAULT.
1393  *
1394  *      The caller must always specify an allocation class.
1395  *
1396  *      allocation classes:
1397  *      VM_ALLOC_NORMAL         normal process request
1398  *      VM_ALLOC_SYSTEM         system *really* needs a page
1399  *      VM_ALLOC_INTERRUPT      interrupt time request
1400  *
1401  *      optional allocation flags:
1402  *      VM_ALLOC_NOBUSY         do not set the flag VPO_BUSY on the page
1403  *      VM_ALLOC_NOOBJ          page is not associated with an object and
1404  *                              should not have the flag VPO_BUSY set
1405  *      VM_ALLOC_WIRED          wire the allocated page
1406  *      VM_ALLOC_ZERO           prefer a zeroed page
1407  *
1408  *      This routine may not sleep.
1409  */
1410 vm_page_t
1411 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
1412     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
1413     vm_paddr_t boundary, vm_memattr_t memattr)
1414 {
1415         struct vnode *drop;
1416         vm_page_t deferred_vdrop_list, m, m_ret;
1417         u_int flags, oflags;
1418         int req_class;
1419
1420         KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0),
1421             ("vm_page_alloc_contig: inconsistent object/req"));
1422         if (object != NULL) {
1423                 VM_OBJECT_ASSERT_WLOCKED(object);
1424                 KASSERT(object->type == OBJT_PHYS,
1425                     ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
1426                     object));
1427         }
1428         KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
1429         req_class = req & VM_ALLOC_CLASS_MASK;
1430
1431         /*
1432          * The page daemon is allowed to dig deeper into the free page list.
1433          */
1434         if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1435                 req_class = VM_ALLOC_SYSTEM;
1436
1437         deferred_vdrop_list = NULL;
1438         mtx_lock(&vm_page_queue_free_mtx);
1439         if (cnt.v_free_count + cnt.v_cache_count >= npages +
1440             cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
1441             cnt.v_free_count + cnt.v_cache_count >= npages +
1442             cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
1443             cnt.v_free_count + cnt.v_cache_count >= npages)) {
1444 #if VM_NRESERVLEVEL > 0
1445 retry:
1446                 if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
1447                     (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
1448                     low, high, alignment, boundary)) == NULL)
1449 #endif
1450                         m_ret = vm_phys_alloc_contig(npages, low, high,
1451                             alignment, boundary);
1452         } else {
1453                 mtx_unlock(&vm_page_queue_free_mtx);
1454                 atomic_add_int(&vm_pageout_deficit, npages);
1455                 pagedaemon_wakeup();
1456                 return (NULL);
1457         }
1458         if (m_ret != NULL)
1459                 for (m = m_ret; m < &m_ret[npages]; m++) {
1460                         drop = vm_page_alloc_init(m);
1461                         if (drop != NULL) {
1462                                 /*
1463                                  * Enqueue the vnode for deferred vdrop().
1464                                  *
1465                                  * Once the pages are removed from the free
1466                                  * page list, "pageq" can be safely abused to
1467                                  * construct a short-lived list of vnodes.
1468                                  */
1469                                 m->pageq.tqe_prev = (void *)drop;
1470                                 m->pageq.tqe_next = deferred_vdrop_list;
1471                                 deferred_vdrop_list = m;
1472                         }
1473                 }
1474         else {
1475 #if VM_NRESERVLEVEL > 0
1476                 if (vm_reserv_reclaim_contig(npages, low, high, alignment,
1477                     boundary))
1478                         goto retry;
1479 #endif
1480         }
1481         mtx_unlock(&vm_page_queue_free_mtx);
1482         if (m_ret == NULL)
1483                 return (NULL);
1484
1485         /*
1486          * Initialize the pages.  Only the PG_ZERO flag is inherited.
1487          */
1488         flags = 0;
1489         if ((req & VM_ALLOC_ZERO) != 0)
1490                 flags = PG_ZERO;
1491         if ((req & VM_ALLOC_NODUMP) != 0)
1492                 flags |= PG_NODUMP;
1493         if ((req & VM_ALLOC_WIRED) != 0)
1494                 atomic_add_int(&cnt.v_wire_count, npages);
1495         oflags = VPO_UNMANAGED;
1496         if (object != NULL) {
1497                 if ((req & VM_ALLOC_NOBUSY) == 0)
1498                         oflags |= VPO_BUSY;
1499                 if (object->memattr != VM_MEMATTR_DEFAULT &&
1500                     memattr == VM_MEMATTR_DEFAULT)
1501                         memattr = object->memattr;
1502         }
1503         for (m = m_ret; m < &m_ret[npages]; m++) {
1504                 m->aflags = 0;
1505                 m->flags = (m->flags | PG_NODUMP) & flags;
1506                 if ((req & VM_ALLOC_WIRED) != 0)
1507                         m->wire_count = 1;
1508                 /* Unmanaged pages don't use "act_count". */
1509                 m->oflags = oflags;
1510                 if (memattr != VM_MEMATTR_DEFAULT)
1511                         pmap_page_set_memattr(m, memattr);
1512                 if (object != NULL)
1513                         vm_page_insert(m, object, pindex);
1514                 else
1515                         m->pindex = pindex;
1516                 pindex++;
1517         }
1518         while (deferred_vdrop_list != NULL) {
1519                 vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
1520                 deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
1521         }
1522         if (vm_paging_needed())
1523                 pagedaemon_wakeup();
1524         return (m_ret);
1525 }
1526
1527 /*
1528  * Initialize a page that has been freshly dequeued from a freelist.
1529  * The caller has to drop the vnode returned, if it is not NULL.
1530  *
1531  * This function may only be used to initialize unmanaged pages.
1532  *
1533  * To be called with vm_page_queue_free_mtx held.
1534  */
1535 static struct vnode *
1536 vm_page_alloc_init(vm_page_t m)
1537 {
1538         struct vnode *drop;
1539         vm_object_t m_object;
1540
1541         KASSERT(m->queue == PQ_NONE,
1542             ("vm_page_alloc_init: page %p has unexpected queue %d",
1543             m, m->queue));
1544         KASSERT(m->wire_count == 0,
1545             ("vm_page_alloc_init: page %p is wired", m));
1546         KASSERT(m->hold_count == 0,
1547             ("vm_page_alloc_init: page %p is held", m));
1548         KASSERT(m->busy == 0,
1549             ("vm_page_alloc_init: page %p is busy", m));
1550         KASSERT(m->dirty == 0,
1551             ("vm_page_alloc_init: page %p is dirty", m));
1552         KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
1553             ("vm_page_alloc_init: page %p has unexpected memattr %d",
1554             m, pmap_page_get_memattr(m)));
1555         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1556         drop = NULL;
1557         if ((m->flags & PG_CACHED) != 0) {
1558                 KASSERT((m->flags & PG_ZERO) == 0,
1559                     ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
1560                 m->valid = 0;
1561                 m_object = m->object;
1562                 vm_page_cache_remove(m);
1563                 if (m_object->type == OBJT_VNODE &&
1564                     vm_object_cache_is_empty(m_object))
1565                         drop = m_object->handle;
1566         } else {
1567                 KASSERT(VM_PAGE_IS_FREE(m),
1568                     ("vm_page_alloc_init: page %p is not free", m));
1569                 KASSERT(m->valid == 0,
1570                     ("vm_page_alloc_init: free page %p is valid", m));
1571                 cnt.v_free_count--;
1572                 if ((m->flags & PG_ZERO) != 0)
1573                         vm_page_zero_count--;
1574         }
1575         /* Don't clear the PG_ZERO flag; we'll need it later. */
1576         m->flags &= PG_ZERO;
1577         return (drop);
1578 }
1579
1580 /*
1581  *      vm_page_alloc_freelist:
1582  *
1583  *      Allocate a physical page from the specified free page list.
1584  *
1585  *      The caller must always specify an allocation class.
1586  *
1587  *      allocation classes:
1588  *      VM_ALLOC_NORMAL         normal process request
1589  *      VM_ALLOC_SYSTEM         system *really* needs a page
1590  *      VM_ALLOC_INTERRUPT      interrupt time request
1591  *
1592  *      optional allocation flags:
1593  *      VM_ALLOC_COUNT(number)  the number of additional pages that the caller
1594  *                              intends to allocate
1595  *      VM_ALLOC_WIRED          wire the allocated page
1596  *      VM_ALLOC_ZERO           prefer a zeroed page
1597  *
1598  *      This routine may not sleep.
1599  */
1600 vm_page_t
1601 vm_page_alloc_freelist(int flind, int req)
1602 {
1603         struct vnode *drop;
1604         vm_page_t m;
1605         u_int flags;
1606         int req_class;
1607
1608         req_class = req & VM_ALLOC_CLASS_MASK;
1609
1610         /*
1611          * The page daemon is allowed to dig deeper into the free page list.
1612          */
1613         if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
1614                 req_class = VM_ALLOC_SYSTEM;
1615
1616         /*
1617          * Do not allocate reserved pages unless the req has asked for it.
1618          */
1619         mtx_lock(&vm_page_queue_free_mtx);
1620         if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
1621             (req_class == VM_ALLOC_SYSTEM &&
1622             cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
1623             (req_class == VM_ALLOC_INTERRUPT &&
1624             cnt.v_free_count + cnt.v_cache_count > 0))
1625                 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
1626         else {
1627                 mtx_unlock(&vm_page_queue_free_mtx);
1628                 atomic_add_int(&vm_pageout_deficit,
1629                     max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
1630                 pagedaemon_wakeup();
1631                 return (NULL);
1632         }
1633         if (m == NULL) {
1634                 mtx_unlock(&vm_page_queue_free_mtx);
1635                 return (NULL);
1636         }
1637         drop = vm_page_alloc_init(m);
1638         mtx_unlock(&vm_page_queue_free_mtx);
1639
1640         /*
1641          * Initialize the page.  Only the PG_ZERO flag is inherited.
1642          */
1643         m->aflags = 0;
1644         flags = 0;
1645         if ((req & VM_ALLOC_ZERO) != 0)
1646                 flags = PG_ZERO;
1647         m->flags &= flags;
1648         if ((req & VM_ALLOC_WIRED) != 0) {
1649                 /*
1650                  * The page lock is not required for wiring a page that does
1651                  * not belong to an object.
1652                  */
1653                 atomic_add_int(&cnt.v_wire_count, 1);
1654                 m->wire_count = 1;
1655         }
1656         /* Unmanaged pages don't use "act_count". */
1657         m->oflags = VPO_UNMANAGED;
1658         if (drop != NULL)
1659                 vdrop(drop);
1660         if (vm_paging_needed())
1661                 pagedaemon_wakeup();
1662         return (m);
1663 }
1664
1665 /*
1666  *      vm_wait:        (also see VM_WAIT macro)
1667  *
1668  *      Sleep until free pages are available for allocation.
1669  *      - Called in various places before memory allocations.
1670  */
1671 void
1672 vm_wait(void)
1673 {
1674
1675         mtx_lock(&vm_page_queue_free_mtx);
1676         if (curproc == pageproc) {
1677                 vm_pageout_pages_needed = 1;
1678                 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
1679                     PDROP | PSWP, "VMWait", 0);
1680         } else {
1681                 if (!vm_pages_needed) {
1682                         vm_pages_needed = 1;
1683                         wakeup(&vm_pages_needed);
1684                 }
1685                 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
1686                     "vmwait", 0);
1687         }
1688 }
1689
1690 /*
1691  *      vm_waitpfault:  (also see VM_WAITPFAULT macro)
1692  *
1693  *      Sleep until free pages are available for allocation.
1694  *      - Called only in vm_fault so that processes page faulting
1695  *        can be easily tracked.
1696  *      - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
1697  *        processes will be able to grab memory first.  Do not change
1698  *        this balance without careful testing first.
1699  */
1700 void
1701 vm_waitpfault(void)
1702 {
1703
1704         mtx_lock(&vm_page_queue_free_mtx);
1705         if (!vm_pages_needed) {
1706                 vm_pages_needed = 1;
1707                 wakeup(&vm_pages_needed);
1708         }
1709         msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
1710             "pfault", 0);
1711 }
1712
1713 /*
1714  *      vm_page_dequeue:
1715  *
1716  *      Remove the given page from its current page queue.
1717  *
1718  *      The page must be locked.
1719  */
1720 void
1721 vm_page_dequeue(vm_page_t m)
1722 {
1723         struct vm_pagequeue *pq;
1724
1725         vm_page_lock_assert(m, MA_OWNED);
1726         KASSERT(m->queue != PQ_NONE,
1727             ("vm_page_dequeue: page %p is not queued", m));
1728         pq = &vm_pagequeues[m->queue];
1729         vm_pagequeue_lock(pq);
1730         m->queue = PQ_NONE;
1731         TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1732         (*pq->pq_cnt)--;
1733         vm_pagequeue_unlock(pq);
1734 }
1735
1736 /*
1737  *      vm_page_dequeue_locked:
1738  *
1739  *      Remove the given page from its current page queue.
1740  *
1741  *      The page and page queue must be locked.
1742  */
1743 void
1744 vm_page_dequeue_locked(vm_page_t m)
1745 {
1746         struct vm_pagequeue *pq;
1747
1748         vm_page_lock_assert(m, MA_OWNED);
1749         pq = &vm_pagequeues[m->queue];
1750         vm_pagequeue_assert_locked(pq);
1751         m->queue = PQ_NONE;
1752         TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1753         (*pq->pq_cnt)--;
1754 }
1755
1756 /*
1757  *      vm_page_enqueue:
1758  *
1759  *      Add the given page to the specified page queue.
1760  *
1761  *      The page must be locked.
1762  */
1763 static void
1764 vm_page_enqueue(int queue, vm_page_t m)
1765 {
1766         struct vm_pagequeue *pq;
1767
1768         vm_page_lock_assert(m, MA_OWNED);
1769         pq = &vm_pagequeues[queue];
1770         vm_pagequeue_lock(pq);
1771         m->queue = queue;
1772         TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1773         ++*pq->pq_cnt;
1774         vm_pagequeue_unlock(pq);
1775 }
1776
1777 /*
1778  *      vm_page_requeue:
1779  *
1780  *      Move the given page to the tail of its current page queue.
1781  *
1782  *      The page must be locked.
1783  */
1784 void
1785 vm_page_requeue(vm_page_t m)
1786 {
1787         struct vm_pagequeue *pq;
1788
1789         vm_page_lock_assert(m, MA_OWNED);
1790         KASSERT(m->queue != PQ_NONE,
1791             ("vm_page_requeue: page %p is not queued", m));
1792         pq = &vm_pagequeues[m->queue];
1793         vm_pagequeue_lock(pq);
1794         TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1795         TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1796         vm_pagequeue_unlock(pq);
1797 }
1798
1799 /*
1800  *      vm_page_requeue_locked:
1801  *
1802  *      Move the given page to the tail of its current page queue.
1803  *
1804  *      The page queue must be locked.
1805  */
1806 void
1807 vm_page_requeue_locked(vm_page_t m)
1808 {
1809         struct vm_pagequeue *pq;
1810
1811         KASSERT(m->queue != PQ_NONE,
1812             ("vm_page_requeue_locked: page %p is not queued", m));
1813         pq = &vm_pagequeues[m->queue];
1814         vm_pagequeue_assert_locked(pq);
1815         TAILQ_REMOVE(&pq->pq_pl, m, pageq);
1816         TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
1817 }
1818
1819 /*
1820  *      vm_page_activate:
1821  *
1822  *      Put the specified page on the active list (if appropriate).
1823  *      Ensure that act_count is at least ACT_INIT but do not otherwise
1824  *      mess with it.
1825  *
1826  *      The page must be locked.
1827  */
1828 void
1829 vm_page_activate(vm_page_t m)
1830 {
1831         int queue;
1832
1833         vm_page_lock_assert(m, MA_OWNED);
1834         if ((queue = m->queue) != PQ_ACTIVE) {
1835                 if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
1836                         if (m->act_count < ACT_INIT)
1837                                 m->act_count = ACT_INIT;
1838                         if (queue != PQ_NONE)
1839                                 vm_page_dequeue(m);
1840                         vm_page_enqueue(PQ_ACTIVE, m);
1841                 } else
1842                         KASSERT(queue == PQ_NONE,
1843                             ("vm_page_activate: wired page %p is queued", m));
1844         } else {
1845                 if (m->act_count < ACT_INIT)
1846                         m->act_count = ACT_INIT;
1847         }
1848 }
1849
1850 /*
1851  *      vm_page_free_wakeup:
1852  *
1853  *      Helper routine for vm_page_free_toq() and vm_page_cache().  This
1854  *      routine is called when a page has been added to the cache or free
1855  *      queues.
1856  *
1857  *      The page queues must be locked.
1858  */
1859 static inline void
1860 vm_page_free_wakeup(void)
1861 {
1862
1863         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1864         /*
1865          * if pageout daemon needs pages, then tell it that there are
1866          * some free.
1867          */
1868         if (vm_pageout_pages_needed &&
1869             cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1870                 wakeup(&vm_pageout_pages_needed);
1871                 vm_pageout_pages_needed = 0;
1872         }
1873         /*
1874          * wakeup processes that are waiting on memory if we hit a
1875          * high water mark. And wakeup scheduler process if we have
1876          * lots of memory. this process will swapin processes.
1877          */
1878         if (vm_pages_needed && !vm_page_count_min()) {
1879                 vm_pages_needed = 0;
1880                 wakeup(&cnt.v_free_count);
1881         }
1882 }
1883
1884 /*
1885  *      vm_page_free_toq:
1886  *
1887  *      Returns the given page to the free list,
1888  *      disassociating it with any VM object.
1889  *
1890  *      The object must be locked.  The page must be locked if it is managed.
1891  */
1892 void
1893 vm_page_free_toq(vm_page_t m)
1894 {
1895
1896         if ((m->oflags & VPO_UNMANAGED) == 0) {
1897                 vm_page_lock_assert(m, MA_OWNED);
1898                 KASSERT(!pmap_page_is_mapped(m),
1899                     ("vm_page_free_toq: freeing mapped page %p", m));
1900         } else
1901                 KASSERT(m->queue == PQ_NONE,
1902                     ("vm_page_free_toq: unmanaged page %p is queued", m));
1903         PCPU_INC(cnt.v_tfree);
1904
1905         if (VM_PAGE_IS_FREE(m))
1906                 panic("vm_page_free: freeing free page %p", m);
1907         else if (m->busy != 0)
1908                 panic("vm_page_free: freeing busy page %p", m);
1909
1910         /*
1911          * Unqueue, then remove page.  Note that we cannot destroy
1912          * the page here because we do not want to call the pager's
1913          * callback routine until after we've put the page on the
1914          * appropriate free queue.
1915          */
1916         vm_page_remque(m);
1917         vm_page_remove(m);
1918
1919         /*
1920          * If fictitious remove object association and
1921          * return, otherwise delay object association removal.
1922          */
1923         if ((m->flags & PG_FICTITIOUS) != 0) {
1924                 return;
1925         }
1926
1927         m->valid = 0;
1928         vm_page_undirty(m);
1929
1930         if (m->wire_count != 0)
1931                 panic("vm_page_free: freeing wired page %p", m);
1932         if (m->hold_count != 0) {
1933                 m->flags &= ~PG_ZERO;
1934                 KASSERT((m->flags & PG_UNHOLDFREE) == 0,
1935                     ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
1936                 m->flags |= PG_UNHOLDFREE;
1937         } else {
1938                 /*
1939                  * Restore the default memory attribute to the page.
1940                  */
1941                 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
1942                         pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
1943
1944                 /*
1945                  * Insert the page into the physical memory allocator's
1946                  * cache/free page queues.
1947                  */
1948                 mtx_lock(&vm_page_queue_free_mtx);
1949                 m->flags |= PG_FREE;
1950                 cnt.v_free_count++;
1951 #if VM_NRESERVLEVEL > 0
1952                 if (!vm_reserv_free_page(m))
1953 #else
1954                 if (TRUE)
1955 #endif
1956                         vm_phys_free_pages(m, 0);
1957                 if ((m->flags & PG_ZERO) != 0)
1958                         ++vm_page_zero_count;
1959                 else
1960                         vm_page_zero_idle_wakeup();
1961                 vm_page_free_wakeup();
1962                 mtx_unlock(&vm_page_queue_free_mtx);
1963         }
1964 }
1965
1966 /*
1967  *      vm_page_wire:
1968  *
1969  *      Mark this page as wired down by yet
1970  *      another map, removing it from paging queues
1971  *      as necessary.
1972  *
1973  *      If the page is fictitious, then its wire count must remain one.
1974  *
1975  *      The page must be locked.
1976  */
1977 void
1978 vm_page_wire(vm_page_t m)
1979 {
1980
1981         /*
1982          * Only bump the wire statistics if the page is not already wired,
1983          * and only unqueue the page if it is on some queue (if it is unmanaged
1984          * it is already off the queues).
1985          */
1986         vm_page_lock_assert(m, MA_OWNED);
1987         if ((m->flags & PG_FICTITIOUS) != 0) {
1988                 KASSERT(m->wire_count == 1,
1989                     ("vm_page_wire: fictitious page %p's wire count isn't one",
1990                     m));
1991                 return;
1992         }
1993         if (m->wire_count == 0) {
1994                 KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
1995                     m->queue == PQ_NONE,
1996                     ("vm_page_wire: unmanaged page %p is queued", m));
1997                 vm_page_remque(m);
1998                 atomic_add_int(&cnt.v_wire_count, 1);
1999         }
2000         m->wire_count++;
2001         KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
2002 }
2003
2004 /*
2005  * vm_page_unwire:
2006  *
2007  * Release one wiring of the specified page, potentially enabling it to be
2008  * paged again.  If paging is enabled, then the value of the parameter
2009  * "activate" determines to which queue the page is added.  If "activate" is
2010  * non-zero, then the page is added to the active queue.  Otherwise, it is
2011  * added to the inactive queue.
2012  *
2013  * However, unless the page belongs to an object, it is not enqueued because
2014  * it cannot be paged out.
2015  *
2016  * If a page is fictitious, then its wire count must alway be one.
2017  *
2018  * A managed page must be locked.
2019  */
2020 void
2021 vm_page_unwire(vm_page_t m, int activate)
2022 {
2023
2024         if ((m->oflags & VPO_UNMANAGED) == 0)
2025                 vm_page_lock_assert(m, MA_OWNED);
2026         if ((m->flags & PG_FICTITIOUS) != 0) {
2027                 KASSERT(m->wire_count == 1,
2028             ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
2029                 return;
2030         }
2031         if (m->wire_count > 0) {
2032                 m->wire_count--;
2033                 if (m->wire_count == 0) {
2034                         atomic_subtract_int(&cnt.v_wire_count, 1);
2035                         if ((m->oflags & VPO_UNMANAGED) != 0 ||
2036                             m->object == NULL)
2037                                 return;
2038                         if (!activate)
2039                                 m->flags &= ~PG_WINATCFLS;
2040                         vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
2041                 }
2042         } else
2043                 panic("vm_page_unwire: page %p's wire count is zero", m);
2044 }
2045
2046 /*
2047  * Move the specified page to the inactive queue.
2048  *
2049  * Many pages placed on the inactive queue should actually go
2050  * into the cache, but it is difficult to figure out which.  What
2051  * we do instead, if the inactive target is well met, is to put
2052  * clean pages at the head of the inactive queue instead of the tail.
2053  * This will cause them to be moved to the cache more quickly and
2054  * if not actively re-referenced, reclaimed more quickly.  If we just
2055  * stick these pages at the end of the inactive queue, heavy filesystem
2056  * meta-data accesses can cause an unnecessary paging load on memory bound
2057  * processes.  This optimization causes one-time-use metadata to be
2058  * reused more quickly.
2059  *
2060  * Normally athead is 0 resulting in LRU operation.  athead is set
2061  * to 1 if we want this page to be 'as if it were placed in the cache',
2062  * except without unmapping it from the process address space.
2063  *
2064  * The page must be locked.
2065  */
2066 static inline void
2067 _vm_page_deactivate(vm_page_t m, int athead)
2068 {
2069         struct vm_pagequeue *pq;
2070         int queue;
2071
2072         vm_page_lock_assert(m, MA_OWNED);
2073
2074         /*
2075          * Ignore if already inactive.
2076          */
2077         if ((queue = m->queue) == PQ_INACTIVE)
2078                 return;
2079         if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
2080                 if (queue != PQ_NONE)
2081                         vm_page_dequeue(m);
2082                 m->flags &= ~PG_WINATCFLS;
2083                 pq = &vm_pagequeues[PQ_INACTIVE];
2084                 vm_pagequeue_lock(pq);
2085                 m->queue = PQ_INACTIVE;
2086                 if (athead)
2087                         TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq);
2088                 else
2089                         TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq);
2090                 cnt.v_inactive_count++;
2091                 vm_pagequeue_unlock(pq);
2092         }
2093 }
2094
2095 /*
2096  * Move the specified page to the inactive queue.
2097  *
2098  * The page must be locked.
2099  */
2100 void
2101 vm_page_deactivate(vm_page_t m)
2102 {
2103
2104         _vm_page_deactivate(m, 0);
2105 }
2106
2107 /*
2108  * vm_page_try_to_cache:
2109  *
2110  * Returns 0 on failure, 1 on success
2111  */
2112 int
2113 vm_page_try_to_cache(vm_page_t m)
2114 {
2115
2116         vm_page_lock_assert(m, MA_OWNED);
2117         VM_OBJECT_ASSERT_WLOCKED(m->object);
2118         if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2119             (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2120                 return (0);
2121         pmap_remove_all(m);
2122         if (m->dirty)
2123                 return (0);
2124         vm_page_cache(m);
2125         return (1);
2126 }
2127
2128 /*
2129  * vm_page_try_to_free()
2130  *
2131  *      Attempt to free the page.  If we cannot free it, we do nothing.
2132  *      1 is returned on success, 0 on failure.
2133  */
2134 int
2135 vm_page_try_to_free(vm_page_t m)
2136 {
2137
2138         vm_page_lock_assert(m, MA_OWNED);
2139         if (m->object != NULL)
2140                 VM_OBJECT_ASSERT_WLOCKED(m->object);
2141         if (m->dirty || m->hold_count || m->busy || m->wire_count ||
2142             (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
2143                 return (0);
2144         pmap_remove_all(m);
2145         if (m->dirty)
2146                 return (0);
2147         vm_page_free(m);
2148         return (1);
2149 }
2150
2151 /*
2152  * vm_page_cache
2153  *
2154  * Put the specified page onto the page cache queue (if appropriate).
2155  *
2156  * The object and page must be locked.
2157  */
2158 void
2159 vm_page_cache(vm_page_t m)
2160 {
2161         vm_object_t object;
2162         boolean_t cache_was_empty;
2163
2164         vm_page_lock_assert(m, MA_OWNED);
2165         object = m->object;
2166         VM_OBJECT_ASSERT_WLOCKED(object);
2167         if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
2168             m->hold_count || m->wire_count)
2169                 panic("vm_page_cache: attempting to cache busy page");
2170         KASSERT(!pmap_page_is_mapped(m),
2171             ("vm_page_cache: page %p is mapped", m));
2172         KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
2173         if (m->valid == 0 || object->type == OBJT_DEFAULT ||
2174             (object->type == OBJT_SWAP &&
2175             !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
2176                 /*
2177                  * Hypothesis: A cache-elgible page belonging to a
2178                  * default object or swap object but without a backing
2179                  * store must be zero filled.
2180                  */
2181                 vm_page_free(m);
2182                 return;
2183         }
2184         KASSERT((m->flags & PG_CACHED) == 0,
2185             ("vm_page_cache: page %p is already cached", m));
2186         PCPU_INC(cnt.v_tcached);
2187
2188         /*
2189          * Remove the page from the paging queues.
2190          */
2191         vm_page_remque(m);
2192
2193         /*
2194          * Remove the page from the object's collection of resident
2195          * pages.
2196          */
2197         vm_radix_remove(&object->rtree, m->pindex);
2198         TAILQ_REMOVE(&object->memq, m, listq);
2199         object->resident_page_count--;
2200
2201         /*
2202          * Restore the default memory attribute to the page.
2203          */
2204         if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
2205                 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
2206
2207         /*
2208          * Insert the page into the object's collection of cached pages
2209          * and the physical memory allocator's cache/free page queues.
2210          */
2211         m->flags &= ~PG_ZERO;
2212         mtx_lock(&vm_page_queue_free_mtx);
2213         m->flags |= PG_CACHED;
2214         cnt.v_cache_count++;
2215         cache_was_empty = vm_radix_is_empty(&object->cache);
2216         vm_radix_insert(&object->cache, m);
2217 #if VM_NRESERVLEVEL > 0
2218         if (!vm_reserv_free_page(m)) {
2219 #else
2220         if (TRUE) {
2221 #endif
2222                 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
2223                 vm_phys_free_pages(m, 0);
2224         }
2225         vm_page_free_wakeup();
2226         mtx_unlock(&vm_page_queue_free_mtx);
2227
2228         /*
2229          * Increment the vnode's hold count if this is the object's only
2230          * cached page.  Decrement the vnode's hold count if this was
2231          * the object's only resident page.
2232          */
2233         if (object->type == OBJT_VNODE) {
2234                 if (cache_was_empty && object->resident_page_count != 0)
2235                         vhold(object->handle);
2236                 else if (!cache_was_empty && object->resident_page_count == 0)
2237                         vdrop(object->handle);
2238         }
2239 }
2240
2241 /*
2242  * vm_page_advise
2243  *
2244  *      Cache, deactivate, or do nothing as appropriate.  This routine
2245  *      is used by madvise().
2246  *
2247  *      Generally speaking we want to move the page into the cache so
2248  *      it gets reused quickly.  However, this can result in a silly syndrome
2249  *      due to the page recycling too quickly.  Small objects will not be
2250  *      fully cached.  On the other hand, if we move the page to the inactive
2251  *      queue we wind up with a problem whereby very large objects
2252  *      unnecessarily blow away our inactive and cache queues.
2253  *
2254  *      The solution is to move the pages based on a fixed weighting.  We
2255  *      either leave them alone, deactivate them, or move them to the cache,
2256  *      where moving them to the cache has the highest weighting.
2257  *      By forcing some pages into other queues we eventually force the
2258  *      system to balance the queues, potentially recovering other unrelated
2259  *      space from active.  The idea is to not force this to happen too
2260  *      often.
2261  *
2262  *      The object and page must be locked.
2263  */
2264 void
2265 vm_page_advise(vm_page_t m, int advice)
2266 {
2267         int dnw, head;
2268
2269         vm_page_assert_locked(m);
2270         VM_OBJECT_ASSERT_WLOCKED(m->object);
2271         if (advice == MADV_FREE) {
2272                 /*
2273                  * Mark the page clean.  This will allow the page to be freed
2274                  * up by the system.  However, such pages are often reused
2275                  * quickly by malloc() so we do not do anything that would
2276                  * cause a page fault if we can help it.
2277                  *
2278                  * Specifically, we do not try to actually free the page now
2279                  * nor do we try to put it in the cache (which would cause a
2280                  * page fault on reuse).
2281                  *
2282                  * But we do make the page is freeable as we can without
2283                  * actually taking the step of unmapping it.
2284                  */
2285                 pmap_clear_modify(m);
2286                 m->dirty = 0;
2287                 m->act_count = 0;
2288         } else if (advice != MADV_DONTNEED)
2289                 return;
2290         dnw = PCPU_GET(dnweight);
2291         PCPU_INC(dnweight);
2292
2293         /*
2294          * Occasionally leave the page alone.
2295          */
2296         if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
2297                 if (m->act_count >= ACT_INIT)
2298                         --m->act_count;
2299                 return;
2300         }
2301
2302         /*
2303          * Clear any references to the page.  Otherwise, the page daemon will
2304          * immediately reactivate the page.
2305          *
2306          * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
2307          * pmap operation, such as pmap_remove(), could clear a reference in
2308          * the pmap and set PGA_REFERENCED on the page before the
2309          * pmap_clear_reference() had completed.  Consequently, the page would
2310          * appear referenced based upon an old reference that occurred before
2311          * this function ran.
2312          */
2313         pmap_clear_reference(m);
2314         vm_page_aflag_clear(m, PGA_REFERENCED);
2315
2316         if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
2317                 vm_page_dirty(m);
2318
2319         if (m->dirty || (dnw & 0x0070) == 0) {
2320                 /*
2321                  * Deactivate the page 3 times out of 32.
2322                  */
2323                 head = 0;
2324         } else {
2325                 /*
2326                  * Cache the page 28 times out of every 32.  Note that
2327                  * the page is deactivated instead of cached, but placed
2328                  * at the head of the queue instead of the tail.
2329                  */
2330                 head = 1;
2331         }
2332         _vm_page_deactivate(m, head);
2333 }
2334
2335 /*
2336  * Grab a page, waiting until we are waken up due to the page
2337  * changing state.  We keep on waiting, if the page continues
2338  * to be in the object.  If the page doesn't exist, first allocate it
2339  * and then conditionally zero it.
2340  *
2341  * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
2342  * to facilitate its eventual removal.
2343  *
2344  * This routine may sleep.
2345  *
2346  * The object must be locked on entry.  The lock will, however, be released
2347  * and reacquired if the routine sleeps.
2348  */
2349 vm_page_t
2350 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2351 {
2352         vm_page_t m;
2353
2354         VM_OBJECT_ASSERT_WLOCKED(object);
2355         KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
2356             ("vm_page_grab: VM_ALLOC_RETRY is required"));
2357 retrylookup:
2358         if ((m = vm_page_lookup(object, pindex)) != NULL) {
2359                 if ((m->oflags & VPO_BUSY) != 0 ||
2360                     ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
2361                         /*
2362                          * Reference the page before unlocking and
2363                          * sleeping so that the page daemon is less
2364                          * likely to reclaim it.
2365                          */
2366                         vm_page_aflag_set(m, PGA_REFERENCED);
2367                         vm_page_sleep(m, "pgrbwt");
2368                         goto retrylookup;
2369                 } else {
2370                         if ((allocflags & VM_ALLOC_WIRED) != 0) {
2371                                 vm_page_lock(m);
2372                                 vm_page_wire(m);
2373                                 vm_page_unlock(m);
2374                         }
2375                         if ((allocflags & VM_ALLOC_NOBUSY) == 0)
2376                                 vm_page_busy(m);
2377                         return (m);
2378                 }
2379         }
2380         m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
2381             VM_ALLOC_IGN_SBUSY));
2382         if (m == NULL) {
2383                 VM_OBJECT_WUNLOCK(object);
2384                 VM_WAIT;
2385                 VM_OBJECT_WLOCK(object);
2386                 goto retrylookup;
2387         } else if (m->valid != 0)
2388                 return (m);
2389         if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
2390                 pmap_zero_page(m);
2391         return (m);
2392 }
2393
2394 /*
2395  * Mapping function for valid or dirty bits in a page.
2396  *
2397  * Inputs are required to range within a page.
2398  */
2399 vm_page_bits_t
2400 vm_page_bits(int base, int size)
2401 {
2402         int first_bit;
2403         int last_bit;
2404
2405         KASSERT(
2406             base + size <= PAGE_SIZE,
2407             ("vm_page_bits: illegal base/size %d/%d", base, size)
2408         );
2409
2410         if (size == 0)          /* handle degenerate case */
2411                 return (0);
2412
2413         first_bit = base >> DEV_BSHIFT;
2414         last_bit = (base + size - 1) >> DEV_BSHIFT;
2415
2416         return (((vm_page_bits_t)2 << last_bit) -
2417             ((vm_page_bits_t)1 << first_bit));
2418 }
2419
2420 /*
2421  *      vm_page_set_valid_range:
2422  *
2423  *      Sets portions of a page valid.  The arguments are expected
2424  *      to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2425  *      of any partial chunks touched by the range.  The invalid portion of
2426  *      such chunks will be zeroed.
2427  *
2428  *      (base + size) must be less then or equal to PAGE_SIZE.
2429  */
2430 void
2431 vm_page_set_valid_range(vm_page_t m, int base, int size)
2432 {
2433         int endoff, frag;
2434
2435         VM_OBJECT_ASSERT_WLOCKED(m->object);
2436         if (size == 0)  /* handle degenerate case */
2437                 return;
2438
2439         /*
2440          * If the base is not DEV_BSIZE aligned and the valid
2441          * bit is clear, we have to zero out a portion of the
2442          * first block.
2443          */
2444         if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2445             (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
2446                 pmap_zero_page_area(m, frag, base - frag);
2447
2448         /*
2449          * If the ending offset is not DEV_BSIZE aligned and the
2450          * valid bit is clear, we have to zero out a portion of
2451          * the last block.
2452          */
2453         endoff = base + size;
2454         if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2455             (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
2456                 pmap_zero_page_area(m, endoff,
2457                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2458
2459         /*
2460          * Assert that no previously invalid block that is now being validated
2461          * is already dirty.
2462          */
2463         KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
2464             ("vm_page_set_valid_range: page %p is dirty", m));
2465
2466         /*
2467          * Set valid bits inclusive of any overlap.
2468          */
2469         m->valid |= vm_page_bits(base, size);
2470 }
2471
2472 /*
2473  * Clear the given bits from the specified page's dirty field.
2474  */
2475 static __inline void
2476 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
2477 {
2478         uintptr_t addr;
2479 #if PAGE_SIZE < 16384
2480         int shift;
2481 #endif
2482
2483         /*
2484          * If the object is locked and the page is neither VPO_BUSY nor
2485          * write mapped, then the page's dirty field cannot possibly be
2486          * set by a concurrent pmap operation.
2487          */
2488         VM_OBJECT_ASSERT_WLOCKED(m->object);
2489         if ((m->oflags & VPO_BUSY) == 0 && !pmap_page_is_write_mapped(m))
2490                 m->dirty &= ~pagebits;
2491         else {
2492                 /*
2493                  * The pmap layer can call vm_page_dirty() without
2494                  * holding a distinguished lock.  The combination of
2495                  * the object's lock and an atomic operation suffice
2496                  * to guarantee consistency of the page dirty field.
2497                  *
2498                  * For PAGE_SIZE == 32768 case, compiler already
2499                  * properly aligns the dirty field, so no forcible
2500                  * alignment is needed. Only require existence of
2501                  * atomic_clear_64 when page size is 32768.
2502                  */
2503                 addr = (uintptr_t)&m->dirty;
2504 #if PAGE_SIZE == 32768
2505                 atomic_clear_64((uint64_t *)addr, pagebits);
2506 #elif PAGE_SIZE == 16384
2507                 atomic_clear_32((uint32_t *)addr, pagebits);
2508 #else           /* PAGE_SIZE <= 8192 */
2509                 /*
2510                  * Use a trick to perform a 32-bit atomic on the
2511                  * containing aligned word, to not depend on the existence
2512                  * of atomic_clear_{8, 16}.
2513                  */
2514                 shift = addr & (sizeof(uint32_t) - 1);
2515 #if BYTE_ORDER == BIG_ENDIAN
2516                 shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
2517 #else
2518                 shift *= NBBY;
2519 #endif
2520                 addr &= ~(sizeof(uint32_t) - 1);
2521                 atomic_clear_32((uint32_t *)addr, pagebits << shift);
2522 #endif          /* PAGE_SIZE */
2523         }
2524 }
2525
2526 /*
2527  *      vm_page_set_validclean:
2528  *
2529  *      Sets portions of a page valid and clean.  The arguments are expected
2530  *      to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2531  *      of any partial chunks touched by the range.  The invalid portion of
2532  *      such chunks will be zero'd.
2533  *
2534  *      (base + size) must be less then or equal to PAGE_SIZE.
2535  */
2536 void
2537 vm_page_set_validclean(vm_page_t m, int base, int size)
2538 {
2539         vm_page_bits_t oldvalid, pagebits;
2540         int endoff, frag;
2541
2542         VM_OBJECT_ASSERT_WLOCKED(m->object);
2543         if (size == 0)  /* handle degenerate case */
2544                 return;
2545
2546         /*
2547          * If the base is not DEV_BSIZE aligned and the valid
2548          * bit is clear, we have to zero out a portion of the
2549          * first block.
2550          */
2551         if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
2552             (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
2553                 pmap_zero_page_area(m, frag, base - frag);
2554
2555         /*
2556          * If the ending offset is not DEV_BSIZE aligned and the
2557          * valid bit is clear, we have to zero out a portion of
2558          * the last block.
2559          */
2560         endoff = base + size;
2561         if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
2562             (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
2563                 pmap_zero_page_area(m, endoff,
2564                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
2565
2566         /*
2567          * Set valid, clear dirty bits.  If validating the entire
2568          * page we can safely clear the pmap modify bit.  We also
2569          * use this opportunity to clear the VPO_NOSYNC flag.  If a process
2570          * takes a write fault on a MAP_NOSYNC memory area the flag will
2571          * be set again.
2572          *
2573          * We set valid bits inclusive of any overlap, but we can only
2574          * clear dirty bits for DEV_BSIZE chunks that are fully within
2575          * the range.
2576          */
2577         oldvalid = m->valid;
2578         pagebits = vm_page_bits(base, size);
2579         m->valid |= pagebits;
2580 #if 0   /* NOT YET */
2581         if ((frag = base & (DEV_BSIZE - 1)) != 0) {
2582                 frag = DEV_BSIZE - frag;
2583                 base += frag;
2584                 size -= frag;
2585                 if (size < 0)
2586                         size = 0;
2587         }
2588         pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
2589 #endif
2590         if (base == 0 && size == PAGE_SIZE) {
2591                 /*
2592                  * The page can only be modified within the pmap if it is
2593                  * mapped, and it can only be mapped if it was previously
2594                  * fully valid.
2595                  */
2596                 if (oldvalid == VM_PAGE_BITS_ALL)
2597                         /*
2598                          * Perform the pmap_clear_modify() first.  Otherwise,
2599                          * a concurrent pmap operation, such as
2600                          * pmap_protect(), could clear a modification in the
2601                          * pmap and set the dirty field on the page before
2602                          * pmap_clear_modify() had begun and after the dirty
2603                          * field was cleared here.
2604                          */
2605                         pmap_clear_modify(m);
2606                 m->dirty = 0;
2607                 m->oflags &= ~VPO_NOSYNC;
2608         } else if (oldvalid != VM_PAGE_BITS_ALL)
2609                 m->dirty &= ~pagebits;
2610         else
2611                 vm_page_clear_dirty_mask(m, pagebits);
2612 }
2613
2614 void
2615 vm_page_clear_dirty(vm_page_t m, int base, int size)
2616 {
2617
2618         vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
2619 }
2620
2621 /*
2622  *      vm_page_set_invalid:
2623  *
2624  *      Invalidates DEV_BSIZE'd chunks within a page.  Both the
2625  *      valid and dirty bits for the effected areas are cleared.
2626  */
2627 void
2628 vm_page_set_invalid(vm_page_t m, int base, int size)
2629 {
2630         vm_page_bits_t bits;
2631
2632         VM_OBJECT_ASSERT_WLOCKED(m->object);
2633         KASSERT((m->oflags & VPO_BUSY) == 0,
2634             ("vm_page_set_invalid: page %p is busy", m));
2635         bits = vm_page_bits(base, size);
2636         if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
2637                 pmap_remove_all(m);
2638         KASSERT(!pmap_page_is_mapped(m),
2639             ("vm_page_set_invalid: page %p is mapped", m));
2640         m->valid &= ~bits;
2641         m->dirty &= ~bits;
2642 }
2643
2644 /*
2645  * vm_page_zero_invalid()
2646  *
2647  *      The kernel assumes that the invalid portions of a page contain
2648  *      garbage, but such pages can be mapped into memory by user code.
2649  *      When this occurs, we must zero out the non-valid portions of the
2650  *      page so user code sees what it expects.
2651  *
2652  *      Pages are most often semi-valid when the end of a file is mapped
2653  *      into memory and the file's size is not page aligned.
2654  */
2655 void
2656 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
2657 {
2658         int b;
2659         int i;
2660
2661         VM_OBJECT_ASSERT_WLOCKED(m->object);
2662         /*
2663          * Scan the valid bits looking for invalid sections that
2664          * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
2665          * valid bit may be set ) have already been zerod by
2666          * vm_page_set_validclean().
2667          */
2668         for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
2669                 if (i == (PAGE_SIZE / DEV_BSIZE) ||
2670                     (m->valid & ((vm_page_bits_t)1 << i))) {
2671                         if (i > b) {
2672                                 pmap_zero_page_area(m,
2673                                     b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
2674                         }
2675                         b = i + 1;
2676                 }
2677         }
2678
2679         /*
2680          * setvalid is TRUE when we can safely set the zero'd areas
2681          * as being valid.  We can do this if there are no cache consistancy
2682          * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
2683          */
2684         if (setvalid)
2685                 m->valid = VM_PAGE_BITS_ALL;
2686 }
2687
2688 /*
2689  *      vm_page_is_valid:
2690  *
2691  *      Is (partial) page valid?  Note that the case where size == 0
2692  *      will return FALSE in the degenerate case where the page is
2693  *      entirely invalid, and TRUE otherwise.
2694  */
2695 int
2696 vm_page_is_valid(vm_page_t m, int base, int size)
2697 {
2698         vm_page_bits_t bits;
2699
2700         VM_OBJECT_ASSERT_WLOCKED(m->object);
2701         bits = vm_page_bits(base, size);
2702         return (m->valid != 0 && (m->valid & bits) == bits);
2703 }
2704
2705 /*
2706  * Set the page's dirty bits if the page is modified.
2707  */
2708 void
2709 vm_page_test_dirty(vm_page_t m)
2710 {
2711
2712         VM_OBJECT_ASSERT_WLOCKED(m->object);
2713         if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
2714                 vm_page_dirty(m);
2715 }
2716
2717 void
2718 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
2719 {
2720
2721         mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
2722 }
2723
2724 void
2725 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
2726 {
2727
2728         mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
2729 }
2730
2731 int
2732 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
2733 {
2734
2735         return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
2736 }
2737
2738 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
2739 void
2740 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
2741 {
2742
2743         vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
2744 }
2745
2746 void
2747 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
2748 {
2749
2750         mtx_assert_(vm_page_lockptr(m), a, file, line);
2751 }
2752 #endif
2753
/*
 * When non-zero, vm_page_cowfault() does not copy the contents of the
 * original page into its replacement (the pmap_copy_page() call is
 * skipped).  Defaults to 0, i.e. the contents are copied.
 */
int so_zerocp_fullpage = 0;
2755
2756 /*
2757  *      Replace the given page with a copy.  The copied page assumes
2758  *      the portion of the given page's "wire_count" that is not the
2759  *      responsibility of this copy-on-write mechanism.
2760  *
2761  *      The object containing the given page must have a non-zero
2762  *      paging-in-progress count and be locked.
2763  */
2764 void
2765 vm_page_cowfault(vm_page_t m)
2766 {
2767         vm_page_t mnew;
2768         vm_object_t object;
2769         vm_pindex_t pindex;
2770
2771         vm_page_lock_assert(m, MA_OWNED);
2772         object = m->object;
2773         VM_OBJECT_ASSERT_WLOCKED(object);
2774         KASSERT(object->paging_in_progress != 0,
2775             ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
2776             object));
2777         pindex = m->pindex;
2778
2779  retry_alloc:
2780         pmap_remove_all(m);
2781         vm_page_remove(m);
2782         mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
2783         if (mnew == NULL) {
2784                 vm_page_insert(m, object, pindex);
2785                 vm_page_unlock(m);
2786                 VM_OBJECT_WUNLOCK(object);
2787                 VM_WAIT;
2788                 VM_OBJECT_WLOCK(object);
2789                 if (m == vm_page_lookup(object, pindex)) {
2790                         vm_page_lock(m);
2791                         goto retry_alloc;
2792                 } else {
2793                         /*
2794                          * Page disappeared during the wait.
2795                          */
2796                         return;
2797                 }
2798         }
2799
2800         if (m->cow == 0) {
2801                 /*
2802                  * check to see if we raced with an xmit complete when
2803                  * waiting to allocate a page.  If so, put things back
2804                  * the way they were
2805                  */
2806                 vm_page_unlock(m);
2807                 vm_page_lock(mnew);
2808                 vm_page_free(mnew);
2809                 vm_page_unlock(mnew);
2810                 vm_page_insert(m, object, pindex);
2811         } else { /* clear COW & copy page */
2812                 if (!so_zerocp_fullpage)
2813                         pmap_copy_page(m, mnew);
2814                 mnew->valid = VM_PAGE_BITS_ALL;
2815                 vm_page_dirty(mnew);
2816                 mnew->wire_count = m->wire_count - m->cow;
2817                 m->wire_count = m->cow;
2818                 vm_page_unlock(m);
2819         }
2820 }
2821
2822 void
2823 vm_page_cowclear(vm_page_t m)
2824 {
2825
2826         vm_page_lock_assert(m, MA_OWNED);
2827         if (m->cow) {
2828                 m->cow--;
2829                 /*
2830                  * let vm_fault add back write permission  lazily
2831                  */
2832         }
2833         /*
2834          *  sf_buf_free() will free the page, so we needn't do it here
2835          */
2836 }
2837
2838 int
2839 vm_page_cowsetup(vm_page_t m)
2840 {
2841
2842         vm_page_lock_assert(m, MA_OWNED);
2843         if ((m->flags & PG_FICTITIOUS) != 0 ||
2844             (m->oflags & VPO_UNMANAGED) != 0 ||
2845             m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYWLOCK(m->object))
2846                 return (EBUSY);
2847         m->cow++;
2848         pmap_remove_write(m);
2849         VM_OBJECT_WUNLOCK(m->object);
2850         return (0);
2851 }
2852
2853 #ifdef INVARIANTS
2854 void
2855 vm_page_object_lock_assert(vm_page_t m)
2856 {
2857
2858         /*
2859          * Certain of the page's fields may only be modified by the
2860          * holder of the containing object's lock or the setter of the
2861          * page's VPO_BUSY flag.  Unfortunately, the setter of the
2862          * VPO_BUSY flag is not recorded, and thus cannot be checked
2863          * here.
2864          */
2865         if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
2866                 VM_OBJECT_ASSERT_WLOCKED(m->object);
2867 }
2868 #endif
2869
2870 #include "opt_ddb.h"
2871 #ifdef DDB
2872 #include <sys/kernel.h>
2873
2874 #include <ddb/ddb.h>
2875
2876 DB_SHOW_COMMAND(page, vm_page_print_page_info)
2877 {
2878         db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2879         db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2880         db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2881         db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2882         db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2883         db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2884         db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2885         db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2886         db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2887         db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2888 }
2889
/*
 * DDB "show pageq": print the number of pages accounted to each page
 * queue, one line per queue for PQ_FREE and PQ_CACHE and a combined
 * line for PQ_ACTIVE and PQ_INACTIVE.
 */
DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
{

        /* The free and cache counts come straight from "cnt". */
        db_printf("PQ_FREE:");
        db_printf(" %d", cnt.v_free_count);
        db_printf("\n");

        db_printf("PQ_CACHE:");
        db_printf(" %d", cnt.v_cache_count);
        db_printf("\n");

        /*
         * The active and inactive counts are read through the counter
         * pointer stored in each vm_pagequeues[] entry.
         */
        db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
                *vm_pagequeues[PQ_ACTIVE].pq_cnt,
                *vm_pagequeues[PQ_INACTIVE].pq_cnt);
}
2905
2906 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
2907 {
2908         vm_page_t m;
2909         boolean_t phys;
2910
2911         if (!have_addr) {
2912                 db_printf("show pginfo addr\n");
2913                 return;
2914         }
2915
2916         phys = strchr(modif, 'p') != NULL;
2917         if (phys)
2918                 m = PHYS_TO_VM_PAGE(addr);
2919         else
2920                 m = (vm_page_t)addr;
2921         db_printf(
2922     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
2923     "  af 0x%x of 0x%x f 0x%x act %d busy %d valid 0x%x dirty 0x%x\n",
2924             m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
2925             m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
2926             m->flags, m->act_count, m->busy, m->valid, m->dirty);
2927 }
2928 #endif /* DDB */