[FreeBSD/stable/8.git] / sys / vm / vm_object.c
1 /*-
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_object.c   8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  */
60
61 /*
62  *      Virtual memory object module.
63  */
64
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67
68 #include "opt_vm.h"
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/lock.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/mutex.h>
78 #include <sys/proc.h>           /* for curproc, pageproc */
79 #include <sys/socket.h>
80 #include <sys/resourcevar.h>
81 #include <sys/vnode.h>
82 #include <sys/vmmeter.h>
83 #include <sys/sx.h>
84
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/pmap.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_pager.h>
93 #include <vm/swap_pager.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_extern.h>
96 #include <vm/vm_reserv.h>
97 #include <vm/uma.h>
98
99 #define EASY_SCAN_FACTOR       8
100
101 #define MSYNC_FLUSH_HARDSEQ     0x01
102 #define MSYNC_FLUSH_SOFTSEQ     0x02
103
104 /*
105  * msync / VM object flushing optimizations
106  */
107 static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
108 SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0,
109     "Enable sequential iteration optimization");
110
111 static int old_msync;
112 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
113     "Use old (insecure) msync behavior");
114
115 static void     vm_object_qcollapse(vm_object_t object);
116 static int      vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
117 static void     vm_object_vndeallocate(vm_object_t object);
118
119 /*
120  *      Virtual memory objects maintain the actual data
121  *      associated with allocated virtual memory.  A given
122  *      page of memory exists within exactly one object.
123  *
124  *      An object is only deallocated when all "references"
125  *      are given up.  Only one "reference" to a given
126  *      region of an object should be writeable.
127  *
128  *      Associated with each object is a list of all resident
129  *      memory pages belonging to that object; this list is
130  *      maintained by the "vm_page" module, and locked by the object's
131  *      lock.
132  *
133  *      Each object also records a "pager" routine which is
134  *      used to retrieve (and store) pages to the proper backing
135  *      storage.  In addition, objects may be backed by other
136  *      objects from which they were virtual-copied.
137  *
138  *      The only items within the object structure which are
139  *      modified after time of creation are:
140  *              reference count         locked by object's lock
141  *              pager routine           locked by object's lock
142  *
143  */
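
/*
 * For orientation, a minimal sketch of the object life cycle implemented by
 * the routines below; "len" is a placeholder byte length and error handling
 * is omitted (illustrative only, not part of the interface contract):
 *
 *      vm_object_t obj;
 *
 *      obj = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(len));
 *      vm_object_reference(obj);       (e.g. a second mapping takes a ref)
 *      ...
 *      vm_object_deallocate(obj);      (drop the extra reference)
 *      vm_object_deallocate(obj);      (last ref; the object is terminated)
 */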
144
145 struct object_q vm_object_list;
146 struct mtx vm_object_list_mtx;  /* lock for object list and count */
147
148 struct vm_object kernel_object_store;
149 struct vm_object kmem_object_store;
150
151 SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
152
153 static long object_collapses;
154 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
155     &object_collapses, 0, "VM object collapses");
156
157 static long object_bypasses;
158 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
159     &object_bypasses, 0, "VM object bypasses");
160
161 static uma_zone_t obj_zone;
162
163 static int vm_object_zinit(void *mem, int size, int flags);
164
165 #ifdef INVARIANTS
166 static void vm_object_zdtor(void *mem, int size, void *arg);
167
168 static void
169 vm_object_zdtor(void *mem, int size, void *arg)
170 {
171         vm_object_t object;
172
173         object = (vm_object_t)mem;
174         KASSERT(TAILQ_EMPTY(&object->memq),
175             ("object %p has resident pages",
176             object));
177 #if VM_NRESERVLEVEL > 0
178         KASSERT(LIST_EMPTY(&object->rvq),
179             ("object %p has reservations",
180             object));
181 #endif
182         KASSERT(object->cache == NULL,
183             ("object %p has cached pages",
184             object));
185         KASSERT(object->paging_in_progress == 0,
186             ("object %p paging_in_progress = %d",
187             object, object->paging_in_progress));
188         KASSERT(object->resident_page_count == 0,
189             ("object %p resident_page_count = %d",
190             object, object->resident_page_count));
191         KASSERT(object->shadow_count == 0,
192             ("object %p shadow_count = %d",
193             object, object->shadow_count));
194 }
195 #endif
196
197 static int
198 vm_object_zinit(void *mem, int size, int flags)
199 {
200         vm_object_t object;
201
202         object = (vm_object_t)mem;
203         bzero(&object->mtx, sizeof(object->mtx));
204         VM_OBJECT_LOCK_INIT(object, "standard object");
205
206         /* These are true for any object that has been freed */
207         object->paging_in_progress = 0;
208         object->resident_page_count = 0;
209         object->shadow_count = 0;
210         return (0);
211 }
212
213 void
214 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
215 {
216
217         TAILQ_INIT(&object->memq);
218         LIST_INIT(&object->shadow_head);
219
220         object->root = NULL;
221         object->type = type;
222         object->size = size;
223         object->generation = 1;
224         object->ref_count = 1;
225         object->memattr = VM_MEMATTR_DEFAULT;
226         object->flags = 0;
227         object->uip = NULL;
228         object->charge = 0;
229         if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
230                 object->flags = OBJ_ONEMAPPING;
231         object->pg_color = 0;
232         object->handle = NULL;
233         object->backing_object = NULL;
234         object->backing_object_offset = (vm_ooffset_t) 0;
235 #if VM_NRESERVLEVEL > 0
236         LIST_INIT(&object->rvq);
237 #endif
238         object->cache = NULL;
239
240         mtx_lock(&vm_object_list_mtx);
241         TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
242         mtx_unlock(&vm_object_list_mtx);
243 }
244
245 /*
246  *      vm_object_init:
247  *
248  *      Initialize the VM objects module.
249  */
250 void
251 vm_object_init(void)
252 {
253         TAILQ_INIT(&vm_object_list);
254         mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
255         
256         VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
257         _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
258             kernel_object);
259 #if VM_NRESERVLEVEL > 0
260         kernel_object->flags |= OBJ_COLORED;
261         kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
262 #endif
263
264         VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
265         _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
266             kmem_object);
267 #if VM_NRESERVLEVEL > 0
268         kmem_object->flags |= OBJ_COLORED;
269         kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
270 #endif
271
272         /*
273          * The lock portion of struct vm_object must be type stable due
274          * to vm_pageout_fallback_object_lock locking a vm object
275          * without holding any references to it.
276          */
277         obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
278 #ifdef INVARIANTS
279             vm_object_zdtor,
280 #else
281             NULL,
282 #endif
283             vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
284 }
285
286 void
287 vm_object_clear_flag(vm_object_t object, u_short bits)
288 {
289
290         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
291         object->flags &= ~bits;
292 }
293
294 /*
295  *      Sets the default memory attribute for the specified object.  Pages
296  *      that are allocated to this object are by default assigned this memory
297  *      attribute.
298  *
299  *      Presently, this function must be called before any pages are allocated
300  *      to the object.  In the future, this requirement may be relaxed for
301  *      "default" and "swap" objects.
302  */
303 int
304 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
305 {
306
307         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
308         switch (object->type) {
309         case OBJT_DEFAULT:
310         case OBJT_DEVICE:
311         case OBJT_PHYS:
312         case OBJT_SG:
313         case OBJT_SWAP:
314         case OBJT_VNODE:
315                 if (!TAILQ_EMPTY(&object->memq))
316                         return (KERN_FAILURE);
317                 break;
318         case OBJT_DEAD:
319                 return (KERN_INVALID_ARGUMENT);
320         }
321         object->memattr = memattr;
322         return (KERN_SUCCESS);
323 }
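
/*
 * A minimal usage sketch, assuming a freshly allocated object that has no
 * resident pages yet; VM_MEMATTR_UNCACHEABLE is an assumed machine-dependent
 * example value, not a requirement of this interface:
 *
 *      VM_OBJECT_LOCK(object);
 *      if (vm_object_set_memattr(object, VM_MEMATTR_UNCACHEABLE) !=
 *          KERN_SUCCESS)
 *              ... the object already has pages or is dead ...
 *      VM_OBJECT_UNLOCK(object);
 */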
324
325 void
326 vm_object_pip_add(vm_object_t object, short i)
327 {
328
329         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
330         object->paging_in_progress += i;
331 }
332
333 void
334 vm_object_pip_subtract(vm_object_t object, short i)
335 {
336
337         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
338         object->paging_in_progress -= i;
339 }
340
341 void
342 vm_object_pip_wakeup(vm_object_t object)
343 {
344
345         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
346         object->paging_in_progress--;
347         if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
348                 vm_object_clear_flag(object, OBJ_PIPWNT);
349                 wakeup(object);
350         }
351 }
352
353 void
354 vm_object_pip_wakeupn(vm_object_t object, short i)
355 {
356
357         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
358         if (i)
359                 object->paging_in_progress -= i;
360         if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
361                 vm_object_clear_flag(object, OBJ_PIPWNT);
362                 wakeup(object);
363         }
364 }
365
366 void
367 vm_object_pip_wait(vm_object_t object, char *waitid)
368 {
369
370         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
371         while (object->paging_in_progress) {
372                 object->flags |= OBJ_PIPWNT;
373                 msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
374         }
375 }
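
/*
 * A minimal sketch of the paging-in-progress protocol, assuming some pager
 * operation on "object" (the I/O itself is elided).  The pip count taken by
 * vm_object_pip_add() keeps vm_object_pip_wait() callers, such as
 * vm_object_terminate(), blocked until the operation completes:
 *
 *      VM_OBJECT_LOCK(object);
 *      vm_object_pip_add(object, 1);
 *      VM_OBJECT_UNLOCK(object);
 *      ... perform the paging operation ...
 *      VM_OBJECT_LOCK(object);
 *      vm_object_pip_wakeup(object);
 *      VM_OBJECT_UNLOCK(object);
 */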
376
377 /*
378  *      vm_object_allocate:
379  *
380  *      Returns a new object with the given size.
381  */
382 vm_object_t
383 vm_object_allocate(objtype_t type, vm_pindex_t size)
384 {
385         vm_object_t object;
386
387         object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
388         _vm_object_allocate(type, size, object);
389         return (object);
390 }
391
392
393 /*
394  *      vm_object_reference:
395  *
396  *      Gets another reference to the given object.  Note: OBJ_DEAD
397  *      objects can be referenced during final cleaning.
398  */
399 void
400 vm_object_reference(vm_object_t object)
401 {
402         if (object == NULL)
403                 return;
404         VM_OBJECT_LOCK(object);
405         vm_object_reference_locked(object);
406         VM_OBJECT_UNLOCK(object);
407 }
408
409 /*
410  *      vm_object_reference_locked:
411  *
412  *      Gets another reference to the given object.
413  *
414  *      The object must be locked.
415  */
416 void
417 vm_object_reference_locked(vm_object_t object)
418 {
419         struct vnode *vp;
420
421         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
422         object->ref_count++;
423         if (object->type == OBJT_VNODE) {
424                 vp = object->handle;
425                 vref(vp);
426         }
427 }
428
429 /*
430  * Handle deallocating an object of type OBJT_VNODE.
431  */
432 static void
433 vm_object_vndeallocate(vm_object_t object)
434 {
435         struct vnode *vp = (struct vnode *) object->handle;
436
437         VFS_ASSERT_GIANT(vp->v_mount);
438         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
439         KASSERT(object->type == OBJT_VNODE,
440             ("vm_object_vndeallocate: not a vnode object"));
441         KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
442 #ifdef INVARIANTS
443         if (object->ref_count == 0) {
444                 vprint("vm_object_vndeallocate", vp);
445                 panic("vm_object_vndeallocate: bad object reference count");
446         }
447 #endif
448
449         object->ref_count--;
450         if (object->ref_count == 0) {
451                 mp_fixme("Unlocked vflag access.");
452                 vp->v_vflag &= ~VV_TEXT;
453         }
454         VM_OBJECT_UNLOCK(object);
455         /*
456          * vrele may need a vop lock
457          */
458         vrele(vp);
459 }
460
461 /*
462  *      vm_object_deallocate:
463  *
464  *      Release a reference to the specified object,
465  *      gained either through a vm_object_allocate
466  *      or a vm_object_reference call.  When all references
467  *      are gone, storage associated with this object
468  *      may be relinquished.
469  *
470  *      No object may be locked.
471  */
472 void
473 vm_object_deallocate(vm_object_t object)
474 {
475         vm_object_t temp;
476
477         while (object != NULL) {
478                 int vfslocked;
479
480                 vfslocked = 0;
481         restart:
482                 VM_OBJECT_LOCK(object);
483                 if (object->type == OBJT_VNODE) {
484                         struct vnode *vp = (struct vnode *) object->handle;
485
486                         /*
487                          * Conditionally acquire Giant for a vnode-backed
488                          * object.  We have to be careful since the type of
489                          * a vnode object can change while the object is
490                          * unlocked.
491                          */
492                         if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
493                                 vfslocked = 1;
494                                 if (!mtx_trylock(&Giant)) {
495                                         VM_OBJECT_UNLOCK(object);
496                                         mtx_lock(&Giant);
497                                         goto restart;
498                                 }
499                         }
500                         vm_object_vndeallocate(object);
501                         VFS_UNLOCK_GIANT(vfslocked);
502                         return;
503                 } else
504                         /*
505                          * This is to handle the case that the object
506                          * changed type while we dropped its lock to
507                          * obtain Giant.
508                          */
509                         VFS_UNLOCK_GIANT(vfslocked);
510
511                 KASSERT(object->ref_count != 0,
512                         ("vm_object_deallocate: object deallocated too many times: %d", object->type));
513
514                 /*
515                  * If the reference count goes to 0 we start calling
516                  * vm_object_terminate() on the object chain.
517                  * A ref count of 1 may be a special case depending on the
518                  * shadow count being 0 or 1.
519                  */
520                 object->ref_count--;
521                 if (object->ref_count > 1) {
522                         VM_OBJECT_UNLOCK(object);
523                         return;
524                 } else if (object->ref_count == 1) {
525                         if (object->shadow_count == 0 &&
526                             object->handle == NULL &&
527                             (object->type == OBJT_DEFAULT ||
528                              object->type == OBJT_SWAP)) {
529                                 vm_object_set_flag(object, OBJ_ONEMAPPING);
530                         } else if ((object->shadow_count == 1) &&
531                             (object->handle == NULL) &&
532                             (object->type == OBJT_DEFAULT ||
533                              object->type == OBJT_SWAP)) {
534                                 vm_object_t robject;
535
536                                 robject = LIST_FIRST(&object->shadow_head);
537                                 KASSERT(robject != NULL,
538                                     ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
539                                          object->ref_count,
540                                          object->shadow_count));
541                                 if (!VM_OBJECT_TRYLOCK(robject)) {
542                                         /*
543                                          * Avoid a potential deadlock.
544                                          */
545                                         object->ref_count++;
546                                         VM_OBJECT_UNLOCK(object);
547                                         /*
548                                          * More likely than not the thread
549                                          * holding robject's lock has lower
550                                          * priority than the current thread.
551                                          * Let the lower priority thread run.
552                                          */
553                                         pause("vmo_de", 1);
554                                         continue;
555                                 }
556                                 /*
557                                  * Collapse object into its shadow unless its
558                                  * shadow is dead.  In that case, object will
559                                  * be deallocated by the thread that is
560                                  * deallocating its shadow.
561                                  */
562                                 if ((robject->flags & OBJ_DEAD) == 0 &&
563                                     (robject->handle == NULL) &&
564                                     (robject->type == OBJT_DEFAULT ||
565                                      robject->type == OBJT_SWAP)) {
566
567                                         robject->ref_count++;
568 retry:
569                                         if (robject->paging_in_progress) {
570                                                 VM_OBJECT_UNLOCK(object);
571                                                 vm_object_pip_wait(robject,
572                                                     "objde1");
573                                                 temp = robject->backing_object;
574                                                 if (object == temp) {
575                                                         VM_OBJECT_LOCK(object);
576                                                         goto retry;
577                                                 }
578                                         } else if (object->paging_in_progress) {
579                                                 VM_OBJECT_UNLOCK(robject);
580                                                 object->flags |= OBJ_PIPWNT;
581                                                 msleep(object,
582                                                     VM_OBJECT_MTX(object),
583                                                     PDROP | PVM, "objde2", 0);
584                                                 VM_OBJECT_LOCK(robject);
585                                                 temp = robject->backing_object;
586                                                 if (object == temp) {
587                                                         VM_OBJECT_LOCK(object);
588                                                         goto retry;
589                                                 }
590                                         } else
591                                                 VM_OBJECT_UNLOCK(object);
592
593                                         if (robject->ref_count == 1) {
594                                                 robject->ref_count--;
595                                                 object = robject;
596                                                 goto doterm;
597                                         }
598                                         object = robject;
599                                         vm_object_collapse(object);
600                                         VM_OBJECT_UNLOCK(object);
601                                         continue;
602                                 }
603                                 VM_OBJECT_UNLOCK(robject);
604                         }
605                         VM_OBJECT_UNLOCK(object);
606                         return;
607                 }
608 doterm:
609                 temp = object->backing_object;
610                 if (temp != NULL) {
611                         VM_OBJECT_LOCK(temp);
612                         LIST_REMOVE(object, shadow_list);
613                         temp->shadow_count--;
614                         temp->generation++;
615                         VM_OBJECT_UNLOCK(temp);
616                         object->backing_object = NULL;
617                 }
618                 /*
619                  * Don't double-terminate, we could be in a termination
620                  * recursion due to the terminate having to sync data
621                  * to disk.
622                  */
623                 if ((object->flags & OBJ_DEAD) == 0)
624                         vm_object_terminate(object);
625                 else
626                         VM_OBJECT_UNLOCK(object);
627                 object = temp;
628         }
629 }
630
631 /*
632  *      vm_object_destroy removes the object from the global object list
633  *      and frees the space for the object.
634  */
635 void
636 vm_object_destroy(vm_object_t object)
637 {
638
639         /*
640          * Remove the object from the global object list.
641          */
642         mtx_lock(&vm_object_list_mtx);
643         TAILQ_REMOVE(&vm_object_list, object, object_list);
644         mtx_unlock(&vm_object_list_mtx);
645
646         /*
647          * Release the allocation charge.
648          */
649         if (object->uip != NULL) {
650                 KASSERT(object->type == OBJT_DEFAULT ||
651                     object->type == OBJT_SWAP,
652                     ("vm_object_terminate: non-swap obj %p has uip",
653                      object));
654                 swap_release_by_uid(object->charge, object->uip);
655                 object->charge = 0;
656                 uifree(object->uip);
657                 object->uip = NULL;
658         }
659
660         /*
661          * Free the space for the object.
662          */
663         uma_zfree(obj_zone, object);
664 }
665
666 /*
667  *      vm_object_terminate actually destroys the specified object, freeing
668  *      up all previously used resources.
669  *
670  *      The object must be locked.
671  *      This routine may block.
672  */
673 void
674 vm_object_terminate(vm_object_t object)
675 {
676         vm_page_t p;
677
678         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
679
680         /*
681          * Make sure no one uses us.
682          */
683         vm_object_set_flag(object, OBJ_DEAD);
684
685         /*
686          * wait for the pageout daemon to be done with the object
687          */
688         vm_object_pip_wait(object, "objtrm");
689
690         KASSERT(!object->paging_in_progress,
691                 ("vm_object_terminate: pageout in progress"));
692
693         /*
694          * Clean and free the pages, as appropriate. All references to the
695          * object are gone, so we don't need to lock it.
696          */
697         if (object->type == OBJT_VNODE) {
698                 struct vnode *vp = (struct vnode *)object->handle;
699
700                 /*
701                  * Clean pages and flush buffers.
702                  */
703                 vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
704                 VM_OBJECT_UNLOCK(object);
705
706                 vinvalbuf(vp, V_SAVE, 0, 0);
707
708                 VM_OBJECT_LOCK(object);
709         }
710
711         KASSERT(object->ref_count == 0, 
712                 ("vm_object_terminate: object with references, ref_count=%d",
713                 object->ref_count));
714
715         /*
716          * Now free any remaining pages. For internal objects, this also
717          * removes them from paging queues. Don't free wired pages, just
718          * remove them from the object. 
719          */
720         vm_page_lock_queues();
721         while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
722                 KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
723                         ("vm_object_terminate: freeing busy page %p "
724                         "p->busy = %d, p->oflags %x\n", p, p->busy, p->oflags));
725                 if (p->wire_count == 0) {
726                         vm_page_free(p);
727                         cnt.v_pfree++;
728                 } else {
729                         vm_page_remove(p);
730                 }
731         }
732         vm_page_unlock_queues();
733
734 #if VM_NRESERVLEVEL > 0
735         if (__predict_false(!LIST_EMPTY(&object->rvq)))
736                 vm_reserv_break_all(object);
737 #endif
738         if (__predict_false(object->cache != NULL))
739                 vm_page_cache_free(object, 0, 0);
740
741         /*
742          * Let the pager know object is dead.
743          */
744         vm_pager_deallocate(object);
745         VM_OBJECT_UNLOCK(object);
746
747         vm_object_destroy(object);
748 }
749
750 /*
751  *      vm_object_page_clean
752  *
753  *      Clean all dirty pages in the specified range of object.  Leaves page 
754  *      on whatever queue it is currently on.   If NOSYNC is set then do not
755  *      write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
756  *      leaving the object dirty.
757  *
758  *      When stuffing pages asynchronously, allow clustering.  XXX we need a
759  *      synchronous clustering mode implementation.
760  *
761  *      Odd semantics: if end == 0, we clean from start to the end of the object.
762  *
763  *      The object must be locked.
764  */
765 void
766 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags)
767 {
768         vm_page_t p, np;
769         vm_pindex_t tstart, tend;
770         vm_pindex_t pi;
771         int clearobjflags;
772         int pagerflags;
773         int curgeneration;
774
775         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
776         if ((object->flags & OBJ_MIGHTBEDIRTY) == 0)
777                 return;
778         KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
779
780         pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
781         pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
782
783         vm_object_set_flag(object, OBJ_CLEANING);
784
785         tstart = start;
786         if (end == 0) {
787                 tend = object->size;
788         } else {
789                 tend = end;
790         }
791
792         vm_page_lock_queues();
793         /*
794          * If the caller is smart and only msync()s a range he knows is
795          * dirty, we may be able to avoid an object scan.  This results in
796  * a phenomenal improvement in performance.  We cannot do this
797  * as a matter of course because the object may be huge - e.g.
798  * the size might be in the gigabytes or terabytes.
799          */
800         if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
801                 vm_pindex_t tscan;
802                 int scanlimit;
803                 int scanreset;
804
805                 scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
806                 if (scanreset < 16)
807                         scanreset = 16;
808                 pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
809
810                 scanlimit = scanreset;
811                 tscan = tstart;
812                 while (tscan < tend) {
813                         curgeneration = object->generation;
814                         p = vm_page_lookup(object, tscan);
815                         if (p == NULL || p->valid == 0) {
816                                 if (--scanlimit == 0)
817                                         break;
818                                 ++tscan;
819                                 continue;
820                         }
821                         vm_page_test_dirty(p);
822                         if (p->dirty == 0) {
823                                 if (--scanlimit == 0)
824                                         break;
825                                 ++tscan;
826                                 continue;
827                         }
828                         /*
829                          * If we have been asked to skip nosync pages and 
830                          * this is a nosync page, we can't continue.
831                          */
832                         if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
833                                 if (--scanlimit == 0)
834                                         break;
835                                 ++tscan;
836                                 continue;
837                         }
838                         scanlimit = scanreset;
839
840                         /*
841                          * This returns 0 if it was unable to busy the first
842                          * page (i.e. had to sleep).
843                          */
844                         tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
845                 }
846
847                 /*
848                  * If everything was dirty and we flushed it successfully,
849                  * and the requested range is not the entire object, we
850                  * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
851                  * return immediately.
852                  */
853                 if (tscan >= tend && (tstart || tend < object->size)) {
854                         vm_page_unlock_queues();
855                         vm_object_clear_flag(object, OBJ_CLEANING);
856                         return;
857                 }
858                 pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
859         }
860
861         /*
862          * Generally set CLEANCHK interlock and make the page read-only so
863          * we can then clear the object flags.
864          *
865          * However, if this is a nosync mmap then the object is likely to 
866          * stay dirty so do not mess with the page and do not clear the
867          * object flags.
868          */
869         clearobjflags = 1;
870         TAILQ_FOREACH(p, &object->memq, listq) {
871                 p->oflags |= VPO_CLEANCHK;
872                 if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
873                         clearobjflags = 0;
874                 else
875                         pmap_remove_write(p);
876         }
877
878         if (clearobjflags && (tstart == 0) && (tend == object->size))
879                 vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
880
881 rescan:
882         curgeneration = object->generation;
883
884         for (p = TAILQ_FIRST(&object->memq); p; p = np) {
885                 int n;
886
887                 np = TAILQ_NEXT(p, listq);
888
889 again:
890                 pi = p->pindex;
891                 if ((p->oflags & VPO_CLEANCHK) == 0 ||
892                         (pi < tstart) || (pi >= tend) ||
893                     p->valid == 0) {
894                         p->oflags &= ~VPO_CLEANCHK;
895                         continue;
896                 }
897
898                 vm_page_test_dirty(p);
899                 if (p->dirty == 0) {
900                         p->oflags &= ~VPO_CLEANCHK;
901                         continue;
902                 }
903
904                 /*
905                  * If we have been asked to skip nosync pages and this is a
906                  * nosync page, skip it.  Note that the object flags were
907                  * not cleared in this case so we do not have to set them.
908                  */
909                 if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
910                         p->oflags &= ~VPO_CLEANCHK;
911                         continue;
912                 }
913
914                 n = vm_object_page_collect_flush(object, p,
915                         curgeneration, pagerflags);
916                 if (n == 0)
917                         goto rescan;
918
919                 if (object->generation != curgeneration)
920                         goto rescan;
921
922                 /*
923                  * Try to optimize the next page.  If we can't we pick up
924                  * our (random) scan where we left off.
925                  */
926                 if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
927                         if ((p = vm_page_lookup(object, pi + n)) != NULL)
928                                 goto again;
929                 }
930         }
931         vm_page_unlock_queues();
932 #if 0
933         VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
934 #endif
935
936         vm_object_clear_flag(object, OBJ_CLEANING);
937         return;
938 }
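
/*
 * A minimal calling sketch, assuming the object lock is held and the vnode
 * has been locked by the caller (as vm_object_sync() below arranges).  The
 * start/end arguments are page indices, so byte ranges are converted with
 * OFF_TO_IDX; passing start == end == 0 cleans the entire object:
 *
 *      VM_OBJECT_LOCK(object);
 *      vm_object_page_clean(object, OFF_TO_IDX(off),
 *          OFF_TO_IDX(off + len + PAGE_MASK), OBJPC_SYNC);
 *      VM_OBJECT_UNLOCK(object);
 */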
939
940 static int
941 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
942 {
943         int runlen;
944         int maxf;
945         int chkb;
946         int maxb;
947         int i;
948         vm_pindex_t pi;
949         vm_page_t maf[vm_pageout_page_count];
950         vm_page_t mab[vm_pageout_page_count];
951         vm_page_t ma[vm_pageout_page_count];
952
953         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
954         pi = p->pindex;
955         while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
956                 vm_page_lock_queues();
957                 if (object->generation != curgeneration) {
958                         return(0);
959                 }
960         }
961         maxf = 0;
962         for(i = 1; i < vm_pageout_page_count; i++) {
963                 vm_page_t tp;
964
965                 if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
966                         if ((tp->oflags & VPO_BUSY) ||
967                                 ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
968                                  (tp->oflags & VPO_CLEANCHK) == 0) ||
969                                 (tp->busy != 0))
970                                 break;
971                         vm_page_test_dirty(tp);
972                         if (tp->dirty == 0) {
973                                 tp->oflags &= ~VPO_CLEANCHK;
974                                 break;
975                         }
976                         maf[ i - 1 ] = tp;
977                         maxf++;
978                         continue;
979                 }
980                 break;
981         }
982
983         maxb = 0;
984         chkb = vm_pageout_page_count -  maxf;
985         if (chkb) {
986                 for(i = 1; i < chkb;i++) {
987                         vm_page_t tp;
988
989                         if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
990                                 if ((tp->oflags & VPO_BUSY) ||
991                                         ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
992                                          (tp->oflags & VPO_CLEANCHK) == 0) ||
993                                         (tp->busy != 0))
994                                         break;
995                                 vm_page_test_dirty(tp);
996                                 if (tp->dirty == 0) {
997                                         tp->oflags &= ~VPO_CLEANCHK;
998                                         break;
999                                 }
1000                                 mab[ i - 1 ] = tp;
1001                                 maxb++;
1002                                 continue;
1003                         }
1004                         break;
1005                 }
1006         }
1007
1008         for(i = 0; i < maxb; i++) {
1009                 int index = (maxb - i) - 1;
1010                 ma[index] = mab[i];
1011                 ma[index]->oflags &= ~VPO_CLEANCHK;
1012         }
1013         p->oflags &= ~VPO_CLEANCHK;
1014         ma[maxb] = p;
1015         for(i = 0; i < maxf; i++) {
1016                 int index = (maxb + i) + 1;
1017                 ma[index] = maf[i];
1018                 ma[index]->oflags &= ~VPO_CLEANCHK;
1019         }
1020         runlen = maxb + maxf + 1;
1021
1022         vm_pageout_flush(ma, runlen, pagerflags);
1023         for (i = 0; i < runlen; i++) {
1024                 if (ma[i]->dirty) {
1025                         pmap_remove_write(ma[i]);
1026                         ma[i]->oflags |= VPO_CLEANCHK;
1027
1028                         /*
1029                          * maxf will end up being the actual number of pages
1030                          * we wrote out contiguously, non-inclusive of the
1031                          * first page.  We do not count look-behind pages.
1032                          */
1033                         if (i >= maxb + 1 && (maxf > i - maxb - 1))
1034                                 maxf = i - maxb - 1;
1035                 }
1036         }
1037         return(maxf + 1);
1038 }
1039
1040 /*
1041  * Note that there is absolutely no sense in writing out
1042  * anonymous objects, so we track down the vnode object
1043  * to write out.
1044  * We invalidate (remove) all pages from the address space
1045  * for semantic correctness.
1046  *
1047  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
1048  * may start out with a NULL object.
1049  */
1050 void
1051 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
1052     boolean_t syncio, boolean_t invalidate)
1053 {
1054         vm_object_t backing_object;
1055         struct vnode *vp;
1056         struct mount *mp;
1057         int flags;
1058
1059         if (object == NULL)
1060                 return;
1061         VM_OBJECT_LOCK(object);
1062         while ((backing_object = object->backing_object) != NULL) {
1063                 VM_OBJECT_LOCK(backing_object);
1064                 offset += object->backing_object_offset;
1065                 VM_OBJECT_UNLOCK(object);
1066                 object = backing_object;
1067                 if (object->size < OFF_TO_IDX(offset + size))
1068                         size = IDX_TO_OFF(object->size) - offset;
1069         }
1070         /*
1071          * Flush pages if writing is allowed, invalidate them
1072          * if invalidation requested.  Pages undergoing I/O
1073          * will be ignored by vm_object_page_remove().
1074          *
1075          * We cannot lock the vnode and then wait for paging
1076          * to complete without deadlocking against vm_fault.
1077          * Instead we simply call vm_object_page_remove() and
1078          * allow it to block internally on a page-by-page
1079          * basis when it encounters pages undergoing async
1080          * I/O.
1081          */
1082         if (object->type == OBJT_VNODE &&
1083             (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
1084                 int vfslocked;
1085                 vp = object->handle;
1086                 VM_OBJECT_UNLOCK(object);
1087                 (void) vn_start_write(vp, &mp, V_WAIT);
1088                 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1089                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1090                 flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
1091                 flags |= invalidate ? OBJPC_INVAL : 0;
1092                 VM_OBJECT_LOCK(object);
1093                 vm_object_page_clean(object,
1094                     OFF_TO_IDX(offset),
1095                     OFF_TO_IDX(offset + size + PAGE_MASK),
1096                     flags);
1097                 VM_OBJECT_UNLOCK(object);
1098                 VOP_UNLOCK(vp, 0);
1099                 VFS_UNLOCK_GIANT(vfslocked);
1100                 vn_finished_write(mp);
1101                 VM_OBJECT_LOCK(object);
1102         }
1103         if ((object->type == OBJT_VNODE ||
1104              object->type == OBJT_DEVICE) && invalidate) {
1105                 boolean_t purge;
1106                 purge = old_msync || (object->type == OBJT_DEVICE);
1107                 vm_object_page_remove(object,
1108                     OFF_TO_IDX(offset),
1109                     OFF_TO_IDX(offset + size + PAGE_MASK),
1110                     purge ? FALSE : TRUE);
1111         }
1112         VM_OBJECT_UNLOCK(object);
1113 }
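
/*
 * A minimal sketch of how an msync(2)-style request might reach this
 * function (hypothetical, simplified caller; the real path runs through the
 * vm_map code): MS_SYNC maps to syncio and MS_INVALIDATE to invalidate.
 *
 *      vm_object_sync(entry->object.vm_object,
 *          entry->offset + (addr - entry->start), len,
 *          (flags & MS_SYNC) != 0, (flags & MS_INVALIDATE) != 0);
 */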
1114
1115 /*
1116  *      vm_object_madvise:
1117  *
1118  *      Implements the madvise function at the object/page level.
1119  *
1120  *      MADV_WILLNEED   (any object)
1121  *
1122  *          Activate the specified pages if they are resident.
1123  *
1124  *      MADV_DONTNEED   (any object)
1125  *
1126  *          Deactivate the specified pages if they are resident.
1127  *
1128  *      MADV_FREE       (OBJT_DEFAULT/OBJT_SWAP objects,
1129  *                       OBJ_ONEMAPPING only)
1130  *
1131  *          Deactivate and clean the specified pages if they are
1132  *          resident.  This permits the process to reuse the pages
1133  *          without faulting or the kernel to reclaim the pages
1134  *          without I/O.
1135  */
1136 void
1137 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
1138 {
1139         vm_pindex_t end, tpindex;
1140         vm_object_t backing_object, tobject;
1141         vm_page_t m;
1142
1143         if (object == NULL)
1144                 return;
1145         VM_OBJECT_LOCK(object);
1146         end = pindex + count;
1147         /*
1148          * Locate and adjust resident pages
1149          */
1150         for (; pindex < end; pindex += 1) {
1151 relookup:
1152                 tobject = object;
1153                 tpindex = pindex;
1154 shadowlookup:
1155                 /*
1156                  * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1157                  * and those pages must be OBJ_ONEMAPPING.
1158                  */
1159                 if (advise == MADV_FREE) {
1160                         if ((tobject->type != OBJT_DEFAULT &&
1161                              tobject->type != OBJT_SWAP) ||
1162                             (tobject->flags & OBJ_ONEMAPPING) == 0) {
1163                                 goto unlock_tobject;
1164                         }
1165                 }
1166                 m = vm_page_lookup(tobject, tpindex);
1167                 if (m == NULL && advise == MADV_WILLNEED) {
1168                         /*
1169                          * If the page is cached, reactivate it.
1170                          */
1171                         m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
1172                             VM_ALLOC_NOBUSY);
1173                 }
1174                 if (m == NULL) {
1175                         /*
1176                          * There may be swap even if there is no backing page
1177                          */
1178                         if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1179                                 swap_pager_freespace(tobject, tpindex, 1);
1180                         /*
1181                          * next object
1182                          */
1183                         backing_object = tobject->backing_object;
1184                         if (backing_object == NULL)
1185                                 goto unlock_tobject;
1186                         VM_OBJECT_LOCK(backing_object);
1187                         tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1188                         if (tobject != object)
1189                                 VM_OBJECT_UNLOCK(tobject);
1190                         tobject = backing_object;
1191                         goto shadowlookup;
1192                 }
1193                 /*
1194                  * If the page is busy or not in a normal active state,
1195                  * we skip it.  If the page is not managed there are no
1196                  * page queues to mess with.  Things can break if we mess
1197                  * with pages in any of the below states.
1198                  */
1199                 vm_page_lock_queues();
1200                 if (m->hold_count ||
1201                     m->wire_count ||
1202                     (m->flags & PG_UNMANAGED) ||
1203                     m->valid != VM_PAGE_BITS_ALL) {
1204                         vm_page_unlock_queues();
1205                         goto unlock_tobject;
1206                 }
1207                 if ((m->oflags & VPO_BUSY) || m->busy) {
1208                         if (advise == MADV_WILLNEED)
1209                                 /*
1210                                  * Reference the page before unlocking and
1211                                  * sleeping so that the page daemon is less
1212                                  * likely to reclaim it. 
1213                                  */
1214                                 vm_page_flag_set(m, PG_REFERENCED);
1215                         vm_page_unlock_queues();
1216                         if (object != tobject)
1217                                 VM_OBJECT_UNLOCK(object);
1218                         m->oflags |= VPO_WANTED;
1219                         msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
1220                             0);
1221                         VM_OBJECT_LOCK(object);
1222                         goto relookup;
1223                 }
1224                 if (advise == MADV_WILLNEED) {
1225                         vm_page_activate(m);
1226                 } else if (advise == MADV_DONTNEED) {
1227                         vm_page_dontneed(m);
1228                 } else if (advise == MADV_FREE) {
1229                         /*
1230                          * Mark the page clean.  This will allow the page
1231                          * to be freed up by the system.  However, such pages
1232                          * are often reused quickly by malloc()/free()
1233                          * so we do not do anything that would cause
1234                          * a page fault if we can help it.
1235                          *
1236                          * Specifically, we do not try to actually free
1237                          * the page now nor do we try to put it in the
1238                          * cache (which would cause a page fault on reuse).
1239                          *
1240                          * But we do make the page as freeable as we
1241                          * can without actually taking the step of unmapping
1242                          * it.
1243                          */
1244                         pmap_clear_modify(m);
1245                         m->dirty = 0;
1246                         m->act_count = 0;
1247                         vm_page_dontneed(m);
1248                 }
1249                 vm_page_unlock_queues();
1250                 if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1251                         swap_pager_freespace(tobject, tpindex, 1);
1252 unlock_tobject:
1253                 if (tobject != object)
1254                         VM_OBJECT_UNLOCK(tobject);
1255         }       
1256         VM_OBJECT_UNLOCK(object);
1257 }
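
/*
 * A minimal calling sketch, assuming the advice value has been validated and
 * that object/pindex/count describe the affected portion of a map entry
 * (hypothetical values; the real caller is the vm_map madvise code):
 *
 *      vm_object_madvise(object, pindex, count, MADV_FREE);
 *
 * MADV_WILLNEED activates resident pages, MADV_DONTNEED deactivates them,
 * and MADV_FREE additionally discards the pages' dirty data and any swap
 * backing, as described in the header comment above.
 */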
1258
1259 /*
1260  *      vm_object_shadow:
1261  *
1262  *      Create a new object which is backed by the
1263  *      specified existing object range.  The source
1264  *      object reference is deallocated.
1265  *
1266  *      The new object and offset into that object
1267  *      are returned in the source parameters.
1268  */
1269 void
1270 vm_object_shadow(
1271         vm_object_t *object,    /* IN/OUT */
1272         vm_ooffset_t *offset,   /* IN/OUT */
1273         vm_size_t length)
1274 {
1275         vm_object_t source;
1276         vm_object_t result;
1277
1278         source = *object;
1279
1280         /*
1281          * Don't create the new object if the old object isn't shared.
1282          */
1283         if (source != NULL) {
1284                 VM_OBJECT_LOCK(source);
1285                 if (source->ref_count == 1 &&
1286                     source->handle == NULL &&
1287                     (source->type == OBJT_DEFAULT ||
1288                      source->type == OBJT_SWAP)) {
1289                         VM_OBJECT_UNLOCK(source);
1290                         return;
1291                 }
1292                 VM_OBJECT_UNLOCK(source);
1293         }
1294
1295         /*
1296          * Allocate a new object with the given length.
1297          */
1298         result = vm_object_allocate(OBJT_DEFAULT, length);
1299
1300         /*
1301          * The new object shadows the source object, adding a reference to it.
1302          * Our caller changes his reference to point to the new object,
1303          * removing a reference to the source object.  Net result: no change
1304          * of reference count.
1305          *
1306          * Try to optimize the result object's page color when shadowing
1307          * in order to maintain page coloring consistency in the combined 
1308          * shadowed object.
1309          */
1310         result->backing_object = source;
1311         /*
1312          * Store the offset into the source object, and fix up the offset into
1313          * the new object.
1314          */
1315         result->backing_object_offset = *offset;
1316         if (source != NULL) {
1317                 VM_OBJECT_LOCK(source);
1318                 LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
1319                 source->shadow_count++;
1320                 source->generation++;
1321 #if VM_NRESERVLEVEL > 0
1322                 result->flags |= source->flags & OBJ_COLORED;
1323                 result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
1324                     ((1 << (VM_NFREEORDER - 1)) - 1);
1325 #endif
1326                 VM_OBJECT_UNLOCK(source);
1327         }
1328
1329
1330         /*
1331  *      Return the new object and the adjusted offset
1332          */
1333         *offset = 0;
1334         *object = result;
1335 }
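
/*
 * A minimal sketch of the copy-on-write idiom this routine serves, assuming
 * a map entry whose object/offset pair is being shadowed (names illustrative
 * of a typical caller, not taken from this file):
 *
 *      vm_object_shadow(&entry->object.vm_object, &entry->offset,
 *          atop(entry->end - entry->start));
 *
 * On return the entry references the new, initially empty shadow object and
 * its offset has been reset to 0; the original object retains the pages that
 * have already been faulted in.
 */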
1336
1337 /*
1338  *      vm_object_split:
1339  *
1340  * Split the pages in a map entry into a new object.  This affords
1341  * easier removal of unused pages, and keeps object inheritance from
1342  * being a negative impact on memory usage.
1343  */
1344 void
1345 vm_object_split(vm_map_entry_t entry)
1346 {
1347         vm_page_t m, m_next;
1348         vm_object_t orig_object, new_object, source;
1349         vm_pindex_t idx, offidxstart;
1350         vm_size_t size;
1351
1352         orig_object = entry->object.vm_object;
1353         if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
1354                 return;
1355         if (orig_object->ref_count <= 1)
1356                 return;
1357         VM_OBJECT_UNLOCK(orig_object);
1358
1359         offidxstart = OFF_TO_IDX(entry->offset);
1360         size = atop(entry->end - entry->start);
1361
1362         /*
1363          * If swap_pager_copy() is later called, it will convert new_object
1364          * into a swap object.
1365          */
1366         new_object = vm_object_allocate(OBJT_DEFAULT, size);
1367
1368         /*
1369          * At this point, the new object is still private, so the order in
1370          * which the original and new objects are locked does not matter.
1371          */
1372         VM_OBJECT_LOCK(new_object);
1373         VM_OBJECT_LOCK(orig_object);
1374         source = orig_object->backing_object;
1375         if (source != NULL) {
1376                 VM_OBJECT_LOCK(source);
1377                 if ((source->flags & OBJ_DEAD) != 0) {
1378                         VM_OBJECT_UNLOCK(source);
1379                         VM_OBJECT_UNLOCK(orig_object);
1380                         VM_OBJECT_UNLOCK(new_object);
1381                         vm_object_deallocate(new_object);
1382                         VM_OBJECT_LOCK(orig_object);
1383                         return;
1384                 }
1385                 LIST_INSERT_HEAD(&source->shadow_head,
1386                                   new_object, shadow_list);
1387                 source->shadow_count++;
1388                 source->generation++;
1389                 vm_object_reference_locked(source);     /* for new_object */
1390                 vm_object_clear_flag(source, OBJ_ONEMAPPING);
1391                 VM_OBJECT_UNLOCK(source);
1392                 new_object->backing_object_offset = 
1393                         orig_object->backing_object_offset + entry->offset;
1394                 new_object->backing_object = source;
1395         }
1396         if (orig_object->uip != NULL) {
1397                 new_object->uip = orig_object->uip;
1398                 uihold(orig_object->uip);
1399                 new_object->charge = ptoa(size);
1400                 KASSERT(orig_object->charge >= ptoa(size),
1401                     ("orig_object->charge < 0"));
1402                 orig_object->charge -= ptoa(size);
1403         }
1404 retry:
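        /*
         * If the first resident page lies below the split range, splay the
         * page tree about offidxstart so that the loop below starts with
         * the first page at or after that index.
         */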
1405         if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
1406                 if (m->pindex < offidxstart) {
1407                         m = vm_page_splay(offidxstart, orig_object->root);
1408                         if ((orig_object->root = m)->pindex < offidxstart)
1409                                 m = TAILQ_NEXT(m, listq);
1410                 }
1411         }
1412         vm_page_lock_queues();
1413         for (; m != NULL && (idx = m->pindex - offidxstart) < size;
1414             m = m_next) {
1415                 m_next = TAILQ_NEXT(m, listq);
1416
1417                 /*
1418                  * We must wait for pending I/O to complete before we can
1419                  * rename the page.
1420                  *
1421                  * We do not have to VM_PROT_NONE the page as mappings should
1422                  * not be changed by this operation.
1423                  */
1424                 if ((m->oflags & VPO_BUSY) || m->busy) {
1425                         vm_page_unlock_queues();
1426                         VM_OBJECT_UNLOCK(new_object);
1427                         m->oflags |= VPO_WANTED;
1428                         msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
1429                         VM_OBJECT_LOCK(new_object);
1430                         goto retry;
1431                 }
1432                 vm_page_rename(m, new_object, idx);
1433                 /* page automatically made dirty by rename and cache handled */
1434                 vm_page_busy(m);
1435         }
1436         vm_page_unlock_queues();
1437         if (orig_object->type == OBJT_SWAP) {
1438                 /*
1439                  * swap_pager_copy() can sleep, in which case the orig_object's
1440                  * and new_object's locks are released and reacquired. 
1441                  */
1442                 swap_pager_copy(orig_object, new_object, offidxstart, 0);
1443
1444                 /*
1445                  * Transfer any cached pages from orig_object to new_object.
1446                  */
1447                 if (__predict_false(orig_object->cache != NULL))
1448                         vm_page_cache_transfer(orig_object, offidxstart,
1449                             new_object);
1450         }
1451         VM_OBJECT_UNLOCK(orig_object);
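        /*
         * The pages moved into new_object above were left busy; wake them
         * up now that the transfer, including any swap metadata copy, is
         * complete.
         */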
1452         TAILQ_FOREACH(m, &new_object->memq, listq)
1453                 vm_page_wakeup(m);
1454         VM_OBJECT_UNLOCK(new_object);
1455         entry->object.vm_object = new_object;
1456         entry->offset = 0LL;
1457         vm_object_deallocate(orig_object);
1458         VM_OBJECT_LOCK(new_object);
1459 }
1460
1461 #define OBSC_TEST_ALL_SHADOWED  0x0001
1462 #define OBSC_COLLAPSE_NOWAIT    0x0002
1463 #define OBSC_COLLAPSE_WAIT      0x0004
1464
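/*
 *      vm_object_backing_scan:
 *
 *      Scan the resident pages of the given object's backing object.
 *      With OBSC_TEST_ALL_SHADOWED, return 0 if the parent does not
 *      completely shadow the backing object's resident pages, otherwise 1.
 *      With OBSC_COLLAPSE_WAIT, mark the backing object OBJ_DEAD and move
 *      every backing page into the parent or free it, sleeping on busy
 *      pages as needed.  OBSC_COLLAPSE_NOWAIT does the same but skips busy
 *      or invalid pages instead of sleeping.
 *
 *      Both the object and its backing object must be locked.
 */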
1465 static int
1466 vm_object_backing_scan(vm_object_t object, int op)
1467 {
1468         int r = 1;
1469         vm_page_t p;
1470         vm_object_t backing_object;
1471         vm_pindex_t backing_offset_index;
1472
1473         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1474         VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
1475
1476         backing_object = object->backing_object;
1477         backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1478
1479         /*
1480          * Initial conditions
1481          */
1482         if (op & OBSC_TEST_ALL_SHADOWED) {
1483                 /*
1484                  * We do not want to have to test for the existence of cache
1485                  * or swap pages in the backing object.  XXX but with the
1486                  * new swapper this would be pretty easy to do.
1487                  *
1488                  * XXX what about anonymous MAP_SHARED memory that hasn't
1489                  * been ZFOD faulted yet?  If we do not test for this, the
1490                  * shadow test may succeed! XXX
1491                  */
1492                 if (backing_object->type != OBJT_DEFAULT) {
1493                         return (0);
1494                 }
1495         }
1496         if (op & OBSC_COLLAPSE_WAIT) {
1497                 vm_object_set_flag(backing_object, OBJ_DEAD);
1498         }
1499
1500         /*
1501          * Our scan
1502          */
1503         p = TAILQ_FIRST(&backing_object->memq);
1504         while (p) {
1505                 vm_page_t next = TAILQ_NEXT(p, listq);
1506                 vm_pindex_t new_pindex = p->pindex - backing_offset_index;
1507
1508                 if (op & OBSC_TEST_ALL_SHADOWED) {
1509                         vm_page_t pp;
1510
1511                         /*
1512                          * Ignore pages outside the parent object's range
1513                          * and outside the parent object's mapping of the 
1514                          * backing object.
1515                          *
1516                          * note that we do not busy the backing object's
1517                          * page.
1518                          */
1519                         if (
1520                             p->pindex < backing_offset_index ||
1521                             new_pindex >= object->size
1522                         ) {
1523                                 p = next;
1524                                 continue;
1525                         }
1526
1527                         /*
1528                          * See if the parent has the page or if the parent's
1529                          * object pager has the page.  If the parent has the
1530                          * page but the page is not valid, the parent's
1531                          * object pager must have the page.
1532                          *
1533                          * If this fails, the parent does not completely shadow
1534                          * the object and we might as well give up now.
1535                          */
1536
1537                         pp = vm_page_lookup(object, new_pindex);
1538                         if (
1539                             (pp == NULL || pp->valid == 0) &&
1540                             !vm_pager_has_page(object, new_pindex, NULL, NULL)
1541                         ) {
1542                                 r = 0;
1543                                 break;
1544                         }
1545                 }
1546
1547                 /*
1548                  * Check for busy page
1549                  */
1550                 if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1551                         vm_page_t pp;
1552
1553                         if (op & OBSC_COLLAPSE_NOWAIT) {
1554                                 if ((p->oflags & VPO_BUSY) ||
1555                                     !p->valid || 
1556                                     p->busy) {
1557                                         p = next;
1558                                         continue;
1559                                 }
1560                         } else if (op & OBSC_COLLAPSE_WAIT) {
1561                                 if ((p->oflags & VPO_BUSY) || p->busy) {
1562                                         VM_OBJECT_UNLOCK(object);
1563                                         p->oflags |= VPO_WANTED;
1564                                         msleep(p, VM_OBJECT_MTX(backing_object),
1565                                             PDROP | PVM, "vmocol", 0);
1566                                         VM_OBJECT_LOCK(object);
1567                                         VM_OBJECT_LOCK(backing_object);
1568                                         /*
1569                                          * If we slept, anything could have
1570                                          * happened.  Since the object is
1571                                          * marked dead, the backing offset
1572                                          * should not have changed so we
1573                                          * just restart our scan.
1574                                          */
1575                                         p = TAILQ_FIRST(&backing_object->memq);
1576                                         continue;
1577                                 }
1578                         }
1579
1580                         KASSERT(
1581                             p->object == backing_object,
1582                             ("vm_object_backing_scan: object mismatch")
1583                         );
1584
1585                         /*
1586                          * Destroy any associated swap
1587                          */
1588                         if (backing_object->type == OBJT_SWAP) {
1589                                 swap_pager_freespace(
1590                                     backing_object, 
1591                                     p->pindex,
1592                                     1
1593                                 );
1594                         }
1595
1596                         if (
1597                             p->pindex < backing_offset_index ||
1598                             new_pindex >= object->size
1599                         ) {
1600                                 /*
1601                                  * The page is outside the parent object's range;
1602                                  * we can simply destroy it.
1603                                  */
1604                                 vm_page_lock_queues();
1605                                 KASSERT(!pmap_page_is_mapped(p),
1606                                     ("freeing mapped page %p", p));
1607                                 if (p->wire_count == 0)
1608                                         vm_page_free(p);
1609                                 else
1610                                         vm_page_remove(p);
1611                                 vm_page_unlock_queues();
1612                                 p = next;
1613                                 continue;
1614                         }
1615
1616                         pp = vm_page_lookup(object, new_pindex);
1617                         if (
1618                             pp != NULL ||
1619                             vm_pager_has_page(object, new_pindex, NULL, NULL)
1620                         ) {
1621                                 /*
1622                                  * The page already exists in the parent, or swap
1623                                  * exists for this location in the parent.  Destroy
1624                                  * the original page from the backing object.
1625                                  *
1626                                  * Leave the parent's page alone.
1627                                  */
1628                                 vm_page_lock_queues();
1629                                 KASSERT(!pmap_page_is_mapped(p),
1630                                     ("freeing mapped page %p", p));
1631                                 if (p->wire_count == 0)
1632                                         vm_page_free(p);
1633                                 else
1634                                         vm_page_remove(p);
1635                                 vm_page_unlock_queues();
1636                                 p = next;
1637                                 continue;
1638                         }
1639
1640 #if VM_NRESERVLEVEL > 0
1641                         /*
1642                          * Rename the reservation.
1643                          */
1644                         vm_reserv_rename(p, object, backing_object,
1645                             backing_offset_index);
1646 #endif
1647
1648                         /*
1649                          * Page does not exist in parent, rename the
1650                          * page from the backing object to the main object. 
1651                          *
1652                          * If the page was mapped to a process, it can remain 
1653                          * mapped through the rename.
1654                          */
1655                         vm_page_lock_queues();
1656                         vm_page_rename(p, object, new_pindex);
1657                         vm_page_unlock_queues();
1658                         /* page automatically made dirty by rename */
1659                 }
1660                 p = next;
1661         }
1662         return (r);
1663 }
1664
1665
1666 /*
1667  * This version of collapse allows the operation to occur earlier and
1668  * while paging_in_progress is true for an object.  It is not a complete
1669  * operation, but should plug 99.9% of the rest of the leaks.
1670  */
1671 static void
1672 vm_object_qcollapse(vm_object_t object)
1673 {
1674         vm_object_t backing_object = object->backing_object;
1675
1676         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1677         VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
1678
1679         if (backing_object->ref_count != 1)
1680                 return;
1681
1682         vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
1683 }
1684
1685 /*
1686  *      vm_object_collapse:
1687  *
1688  *      Collapse an object with the object backing it.
1689  *      Pages in the backing object are moved into the
1690  *      parent, and the backing object is deallocated.
1691  */
1692 void
1693 vm_object_collapse(vm_object_t object)
1694 {
1695         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1696         
1697         while (TRUE) {
1698                 vm_object_t backing_object;
1699
1700                 /*
1701                  * Verify that the conditions are right for collapse:
1702                  *
1703                  * The object exists and the backing object exists.
1704                  */
1705                 if ((backing_object = object->backing_object) == NULL)
1706                         break;
1707
1708                 /*
1709                  * We check the backing object first, because it is most likely
1710                  * not collapsible.
1711                  */
1712                 VM_OBJECT_LOCK(backing_object);
1713                 if (backing_object->handle != NULL ||
1714                     (backing_object->type != OBJT_DEFAULT &&
1715                      backing_object->type != OBJT_SWAP) ||
1716                     (backing_object->flags & OBJ_DEAD) ||
1717                     object->handle != NULL ||
1718                     (object->type != OBJT_DEFAULT &&
1719                      object->type != OBJT_SWAP) ||
1720                     (object->flags & OBJ_DEAD)) {
1721                         VM_OBJECT_UNLOCK(backing_object);
1722                         break;
1723                 }
1724
1725                 if (
1726                     object->paging_in_progress != 0 ||
1727                     backing_object->paging_in_progress != 0
1728                 ) {
1729                         vm_object_qcollapse(object);
1730                         VM_OBJECT_UNLOCK(backing_object);
1731                         break;
1732                 }
1733                 /*
1734                  * We know that we can either collapse the backing object (if
1735                  * the parent is the only reference to it) or (perhaps) have
1736                  * the parent bypass the object if the parent happens to shadow
1737                  * all the resident pages in the entire backing object.
1738                  *
1739                  * This is ignoring pager-backed pages such as swap pages.
1740                  * vm_object_backing_scan fails the shadowing test in this
1741                  * case.
1742                  */
1743                 if (backing_object->ref_count == 1) {
1744                         /*
1745                          * If there is exactly one reference to the backing
1746                          * object, we can collapse it into the parent.  
1747                          */
1748                         vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
1749
1750 #if VM_NRESERVLEVEL > 0
1751                         /*
1752                          * Break any reservations from backing_object.
1753                          */
1754                         if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
1755                                 vm_reserv_break_all(backing_object);
1756 #endif
1757
1758                         /*
1759                          * Move the pager from backing_object to object.
1760                          */
1761                         if (backing_object->type == OBJT_SWAP) {
1762                                 /*
1763                                  * swap_pager_copy() can sleep, in which case
1764                                  * the backing_object's and object's locks are
1765                                  * released and reacquired.
1766                                  */
1767                                 swap_pager_copy(
1768                                     backing_object,
1769                                     object,
1770                                     OFF_TO_IDX(object->backing_object_offset), TRUE);
1771
1772                                 /*
1773                                  * Free any cached pages from backing_object.
1774                                  */
1775                                 if (__predict_false(backing_object->cache != NULL))
1776                                         vm_page_cache_free(backing_object, 0, 0);
1777                         }
1778                         /*
1779                          * Object now shadows whatever backing_object did.
1780                          * Note that the reference to 
1781                          * backing_object->backing_object moves from within 
1782                          * backing_object to within object.
1783                          */
1784                         LIST_REMOVE(object, shadow_list);
1785                         backing_object->shadow_count--;
1786                         backing_object->generation++;
1787                         if (backing_object->backing_object) {
1788                                 VM_OBJECT_LOCK(backing_object->backing_object);
1789                                 LIST_REMOVE(backing_object, shadow_list);
1790                                 LIST_INSERT_HEAD(
1791                                     &backing_object->backing_object->shadow_head,
1792                                     object, shadow_list);
1793                                 /*
1794                                  * The shadow_count has not changed.
1795                                  */
1796                                 backing_object->backing_object->generation++;
1797                                 VM_OBJECT_UNLOCK(backing_object->backing_object);
1798                         }
1799                         object->backing_object = backing_object->backing_object;
1800                         object->backing_object_offset +=
1801                             backing_object->backing_object_offset;
1802
1803                         /*
1804                          * Discard backing_object.
1805                          *
1806                          * Since the backing object has no pages, no pager left,
1807                          * and no object references within it, all that is
1808                          * necessary is to dispose of it.
1809                          */
1810                         KASSERT(backing_object->ref_count == 1, (
1811 "backing_object %p was somehow re-referenced during collapse!",
1812                             backing_object));
1813                         VM_OBJECT_UNLOCK(backing_object);
1814                         vm_object_destroy(backing_object);
1815
1816                         object_collapses++;
1817                 } else {
1818                         vm_object_t new_backing_object;
1819
1820                         /*
1821                          * If we do not entirely shadow the backing object,
1822                          * there is nothing we can do so we give up.
1823                          */
1824                         if (object->resident_page_count != object->size &&
1825                             vm_object_backing_scan(object,
1826                             OBSC_TEST_ALL_SHADOWED) == 0) {
1827                                 VM_OBJECT_UNLOCK(backing_object);
1828                                 break;
1829                         }
1830
1831                         /*
1832                          * Make the parent shadow the next object in the
1833                          * chain.  Deallocating backing_object will not remove
1834                          * it, since its reference count is at least 2.
1835                          */
1836                         LIST_REMOVE(object, shadow_list);
1837                         backing_object->shadow_count--;
1838                         backing_object->generation++;
1839
1840                         new_backing_object = backing_object->backing_object;
1841                         if ((object->backing_object = new_backing_object) != NULL) {
1842                                 VM_OBJECT_LOCK(new_backing_object);
1843                                 LIST_INSERT_HEAD(
1844                                     &new_backing_object->shadow_head,
1845                                     object,
1846                                     shadow_list
1847                                 );
1848                                 new_backing_object->shadow_count++;
1849                                 new_backing_object->generation++;
1850                                 vm_object_reference_locked(new_backing_object);
1851                                 VM_OBJECT_UNLOCK(new_backing_object);
1852                                 object->backing_object_offset +=
1853                                         backing_object->backing_object_offset;
1854                         }
1855
1856                         /*
1857                          * Drop the reference count on backing_object. Since
1858                          * its ref_count was at least 2, it will not vanish.
1859                          */
1860                         backing_object->ref_count--;
1861                         VM_OBJECT_UNLOCK(backing_object);
1862                         object_bypasses++;
1863                 }
1864
1865                 /*
1866                  * Try again with this object's new backing object.
1867                  */
1868         }
1869 }
1870
1871 /*
1872  *      vm_object_page_remove:
1873  *
1874  *      For the given object, either frees or invalidates each of the
1875  *      specified pages.  In general, a page is freed.  However, if a
1876  *      page is wired for any reason other than the existence of a
1877  *      managed, wired mapping, then it may be invalidated but not
1878  *      removed from the object.  Pages are specified by the given
1879  *      range ["start", "end") and Boolean "clean_only".  As a
1880  *      special case, if "end" is zero, then the range extends from
1881  *      "start" to the end of the object.  If "clean_only" is TRUE,
1882  *      then only the non-dirty pages within the specified range are
1883  *      affected.
1884  *
1885  *      In general, this operation should only be performed on objects
1886  *      that contain managed pages.  There are two exceptions.  First,
1887  *      it may be performed on the kernel and kmem objects.  Second,
1888  *      it may be used by msync(..., MS_INVALIDATE) to invalidate
1889  *      device-backed pages.
1890  *
1891  *      The object must be locked.
1892  */
1893 void
1894 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1895     boolean_t clean_only)
1896 {
1897         vm_page_t p, next;
1898         int wirings;
1899
1900         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1901         if (object->resident_page_count == 0)
1902                 goto skipmemq;
1903
1904         /*
1905          * Since physically-backed objects do not use managed pages, we can't
1906          * remove pages from the object (we must instead remove the page
1907          * references, and then destroy the object).
1908          */
1909         KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
1910             object == kmem_object,
1911             ("attempt to remove pages from a physical object"));
1912
1913         vm_object_pip_add(object, 1);
1914 again:
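        /*
         * If the first resident page lies below "start", splay the page
         * tree about "start" so that the scan begins with the first page
         * at or after that index.
         */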
1915         if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
1916                 if (p->pindex < start) {
1917                         p = vm_page_splay(start, object->root);
1918                         if ((object->root = p)->pindex < start)
1919                                 p = TAILQ_NEXT(p, listq);
1920                 }
1921         }
1922         vm_page_lock_queues();
1923         /*
1924          * Assert: the variable p is either (1) the page with the
1925          * least pindex greater than or equal to the parameter pindex
1926          * or (2) NULL.
1927          */
1928         for (;
1929              p != NULL && (p->pindex < end || end == 0);
1930              p = next) {
1931                 next = TAILQ_NEXT(p, listq);
1932
1933                 /*
1934                  * If the page is wired for any reason besides the
1935                  * existence of managed, wired mappings, then it cannot
1936                  * be freed.  For example, fictitious pages, which
1937                  * represent device memory, are inherently wired and
1938                  * cannot be freed.  They can, however, be invalidated
1939                  * if "clean_only" is FALSE.
1940                  */
1941                 if ((wirings = p->wire_count) != 0 &&
1942                     (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
1943                         /* Fictitious pages do not have managed mappings. */
1944                         if ((p->flags & PG_FICTITIOUS) == 0)
1945                                 pmap_remove_all(p);
1946                         /* Account for removal of managed, wired mappings. */
1947                         p->wire_count -= wirings;
1948                         if (!clean_only) {
1949                                 p->valid = 0;
1950                                 vm_page_undirty(p);
1951                         }
1952                         continue;
1953                 }
1954                 if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
1955                         goto again;
1956                 KASSERT((p->flags & PG_FICTITIOUS) == 0,
1957                     ("vm_object_page_remove: page %p is fictitious", p));
1958                 if (clean_only && p->valid) {
1959                         pmap_remove_write(p);
1960                         if (p->dirty)
1961                                 continue;
1962                 }
1963                 pmap_remove_all(p);
1964                 /* Account for removal of managed, wired mappings. */
1965                 if (wirings != 0)
1966                         p->wire_count -= wirings;
1967                 vm_page_free(p);
1968         }
1969         vm_page_unlock_queues();
1970         vm_object_pip_wakeup(object);
1971 skipmemq:
1972         if (__predict_false(object->cache != NULL))
1973                 vm_page_cache_free(object, start, end);
1974 }
1975
1976 /*
1977  *      Populate the specified range of the object with valid pages.  Returns
1978  *      TRUE if the range is successfully populated and FALSE otherwise.
1979  *
1980  *      Note: This function should be optimized to pass a larger array of
1981  *      pages to vm_pager_get_pages() before it is applied to a non-
1982  *      OBJT_DEVICE object.
1983  *
1984  *      The object must be locked.
1985  */
1986 boolean_t
1987 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1988 {
1989         vm_page_t m, ma[1];
1990         vm_pindex_t pindex;
1991         int rv;
1992
1993         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
1994         for (pindex = start; pindex < end; pindex++) {
1995                 m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
1996                     VM_ALLOC_RETRY);
1997                 if (m->valid != VM_PAGE_BITS_ALL) {
1998                         ma[0] = m;
1999                         rv = vm_pager_get_pages(object, ma, 1, 0);
2000                         m = vm_page_lookup(object, pindex);
2001                         if (m == NULL)
2002                                 break;
2003                         if (rv != VM_PAGER_OK) {
2004                                 vm_page_lock_queues();
2005                                 vm_page_free(m);
2006                                 vm_page_unlock_queues();
2007                                 break;
2008                         }
2009                 }
2010                 /*
2011                  * Keep "m" busy because a subsequent iteration may unlock
2012                  * the object.
2013                  */
2014         }
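        /*
         * The pages grabbed above were left busy; unbusy each page that was
         * successfully populated, even if the full range was not satisfied.
         */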
2015         if (pindex > start) {
2016                 m = vm_page_lookup(object, start);
2017                 while (m != NULL && m->pindex < pindex) {
2018                         vm_page_wakeup(m);
2019                         m = TAILQ_NEXT(m, listq);
2020                 }
2021         }
2022         return (pindex == end);
2023 }
2024
2025 /*
2026  *      Routine:        vm_object_coalesce
2027  *      Function:       Coalesces two objects backing up adjoining
2028  *                      regions of memory into a single object.
2029  *
2030  *      returns TRUE if objects were combined.
2031  *
2032  *      NOTE:   Only works at the moment if the second object is NULL -
2033  *              if it's not, which object do we lock first?
2034  *
2035  *      Parameters:
2036  *              prev_object     First object to coalesce
2037  *              prev_offset     Offset into prev_object
2038  *              prev_size       Size of reference to prev_object
2039  *              next_size       Size of reference to the second object
2040  *              reserved        Indicator that extension region has
2041  *                              swap accounted for
2042  *
2043  *      Conditions:
2044  *      The object must *not* be locked.
2045  */
2046 boolean_t
2047 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
2048     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
2049 {
2050         vm_pindex_t next_pindex;
2051
2052         if (prev_object == NULL)
2053                 return (TRUE);
2054         VM_OBJECT_LOCK(prev_object);
2055         if (prev_object->type != OBJT_DEFAULT &&
2056             prev_object->type != OBJT_SWAP) {
2057                 VM_OBJECT_UNLOCK(prev_object);
2058                 return (FALSE);
2059         }
2060
2061         /*
2062          * Try to collapse the object first
2063          */
2064         vm_object_collapse(prev_object);
2065
2066         /*
2067          * Can't coalesce if: more than one reference, paged out, shadows
2068          * another object, or has a copy elsewhere (any of which mean that
2069          * the pages not mapped to prev_entry may be in use anyway).
2070          */
2071         if (prev_object->backing_object != NULL) {
2072                 VM_OBJECT_UNLOCK(prev_object);
2073                 return (FALSE);
2074         }
2075
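        /*
         * Convert the byte sizes to page counts and compute the page index
         * in prev_object just past the region covered by this mapping.
         */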
2076         prev_size >>= PAGE_SHIFT;
2077         next_size >>= PAGE_SHIFT;
2078         next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
2079
2080         if ((prev_object->ref_count > 1) &&
2081             (prev_object->size != next_pindex)) {
2082                 VM_OBJECT_UNLOCK(prev_object);
2083                 return (FALSE);
2084         }
2085
2086         /*
2087          * Account for the charge.
2088          */
2089         if (prev_object->uip != NULL) {
2090                 /*
2091                  * If prev_object was charged, then this mapping,
2092                  * although not charged now, may become writable
2093                  * later.  A non-NULL uip in the object would prevent
2094                  * swap reservation during enabling of the write
2095                  * access, so reserve swap now.  A failed reservation
2096                  * causes allocation of a separate object for the map
2097                  * entry, and swap reservation for that entry is
2098                  * managed at the appropriate time.
2099                  */
2100                 if (!reserved && !swap_reserve_by_uid(ptoa(next_size),
2101                     prev_object->uip)) {
2102                         VM_OBJECT_UNLOCK(prev_object);
2103                         return (FALSE);
2104                 }
2105                 prev_object->charge += ptoa(next_size);
2106         }
2107
2108         /*
2109          * Remove any pages that may still be in the object from a previous
2110          * deallocation.
2111          */
2112         if (next_pindex < prev_object->size) {
2113                 vm_object_page_remove(prev_object,
2114                                       next_pindex,
2115                                       next_pindex + next_size, FALSE);
2116                 if (prev_object->type == OBJT_SWAP)
2117                         swap_pager_freespace(prev_object,
2118                                              next_pindex, next_size);
2119 #if 0
2120                 if (prev_object->uip != NULL) {
2121                         KASSERT(prev_object->charge >=
2122                             ptoa(prev_object->size - next_pindex),
2123                             ("object %p overcharged 1 %jx %jx", prev_object,
2124                                 (uintmax_t)next_pindex, (uintmax_t)next_size));
2125                         prev_object->charge -= ptoa(prev_object->size -
2126                             next_pindex);
2127                 }
2128 #endif
2129         }
2130
2131         /*
2132          * Extend the object if necessary.
2133          */
2134         if (next_pindex + next_size > prev_object->size)
2135                 prev_object->size = next_pindex + next_size;
2136
2137         VM_OBJECT_UNLOCK(prev_object);
2138         return (TRUE);
2139 }
2140
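/*
 *      vm_object_set_writeable_dirty:
 *
 *      Record that a vnode-backed object may contain dirty pages by
 *      setting OBJ_MIGHTBEDIRTY.  Objects of other types are ignored.
 */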
2141 void
2142 vm_object_set_writeable_dirty(vm_object_t object)
2143 {
2144
2145         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2146         if (object->type != OBJT_VNODE ||
2147             (object->flags & OBJ_MIGHTBEDIRTY) != 0)
2148                 return;
2149         vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
2150 }
2151
2152 #include "opt_ddb.h"
2153 #ifdef DDB
2154 #include <sys/kernel.h>
2155
2156 #include <sys/cons.h>
2157
2158 #include <ddb/ddb.h>
2159
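/*
 * Helper for the DDB commands below: determine whether the given object
 * backs any entry of the given map, recursing into submaps and following
 * backing object chains.
 */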
2160 static int
2161 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2162 {
2163         vm_map_t tmpm;
2164         vm_map_entry_t tmpe;
2165         vm_object_t obj;
2166         int entcount;
2167
2168         if (map == 0)
2169                 return 0;
2170
2171         if (entry == 0) {
2172                 tmpe = map->header.next;
2173                 entcount = map->nentries;
2174                 while (entcount-- && (tmpe != &map->header)) {
2175                         if (_vm_object_in_map(map, object, tmpe)) {
2176                                 return 1;
2177                         }
2178                         tmpe = tmpe->next;
2179                 }
2180         } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
2181                 tmpm = entry->object.sub_map;
2182                 tmpe = tmpm->header.next;
2183                 entcount = tmpm->nentries;
2184                 while (entcount-- && tmpe != &tmpm->header) {
2185                         if (_vm_object_in_map(tmpm, object, tmpe)) {
2186                                 return 1;
2187                         }
2188                         tmpe = tmpe->next;
2189                 }
2190         } else if ((obj = entry->object.vm_object) != NULL) {
2191                 for (; obj; obj = obj->backing_object)
2192                         if (obj == object) {
2193                                 return 1;
2194                         }
2195         }
2196         return 0;
2197 }
2198
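/*
 * Return non-zero if the object is reachable from any process's vmspace
 * or from one of the kernel's standard maps.
 */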
2199 static int
2200 vm_object_in_map(vm_object_t object)
2201 {
2202         struct proc *p;
2203
2204         /* sx_slock(&allproc_lock); */
2205         FOREACH_PROC_IN_SYSTEM(p) {
2206                 if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
2207                         continue;
2208                 if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
2209                         /* sx_sunlock(&allproc_lock); */
2210                         return 1;
2211                 }
2212         }
2213         /* sx_sunlock(&allproc_lock); */
2214         if (_vm_object_in_map(kernel_map, object, 0))
2215                 return 1;
2216         if (_vm_object_in_map(kmem_map, object, 0))
2217                 return 1;
2218         if (_vm_object_in_map(pager_map, object, 0))
2219                 return 1;
2220         if (_vm_object_in_map(buffer_map, object, 0))
2221                 return 1;
2222         return 0;
2223 }
2224
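/*
 * DDB "show vmochk": report any anonymous (unnamed default or swap) object
 * that has a zero reference count or is not reachable from any map.
 */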
2225 DB_SHOW_COMMAND(vmochk, vm_object_check)
2226 {
2227         vm_object_t object;
2228
2229         /*
2230          * make sure that internal objs are in a map somewhere
2231          * and none have zero ref counts.
2232          */
2233         TAILQ_FOREACH(object, &vm_object_list, object_list) {
2234                 if (object->handle == NULL &&
2235                     (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2236                         if (object->ref_count == 0) {
2237                                 db_printf("vmochk: internal obj has zero ref count: %ld\n",
2238                                         (long)object->size);
2239                         }
2240                         if (!vm_object_in_map(object)) {
2241                                 db_printf(
2242                         "vmochk: internal obj is not in a map: "
2243                         "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
2244                                     object->ref_count, (u_long)object->size, 
2245                                     (u_long)object->size,
2246                                     (void *)object->backing_object);
2247                         }
2248                 }
2249         }
2250 }
2251
2252 /*
2253  *      vm_object_print:        [ debug ]
2254  */
2255 DB_SHOW_COMMAND(object, vm_object_print_static)
2256 {
2257         /* XXX convert args. */
2258         vm_object_t object = (vm_object_t)addr;
2259         boolean_t full = have_addr;
2260
2261         vm_page_t p;
2262
2263         /* XXX count is an (unused) arg.  Avoid shadowing it. */
2264 #define count   was_count
2265
2266         int count;
2267
2268         if (object == NULL)
2269                 return;
2270
2271         db_iprintf(
2272             "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x uip %d charge %jx\n",
2273             object, (int)object->type, (uintmax_t)object->size,
2274             object->resident_page_count, object->ref_count, object->flags,
2275             object->uip ? object->uip->ui_uid : -1, (uintmax_t)object->charge);
2276         db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
2277             object->shadow_count, 
2278             object->backing_object ? object->backing_object->ref_count : 0,
2279             object->backing_object, (uintmax_t)object->backing_object_offset);
2280
2281         if (!full)
2282                 return;
2283
2284         db_indent += 2;
2285         count = 0;
2286         TAILQ_FOREACH(p, &object->memq, listq) {
2287                 if (count == 0)
2288                         db_iprintf("memory:=");
2289                 else if (count == 6) {
2290                         db_printf("\n");
2291                         db_iprintf(" ...");
2292                         count = 0;
2293                 } else
2294                         db_printf(",");
2295                 count++;
2296
2297                 db_printf("(off=0x%jx,page=0x%jx)",
2298                     (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
2299         }
2300         if (count != 0)
2301                 db_printf("\n");
2302         db_indent -= 2;
2303 }
2304
2305 /* XXX. */
2306 #undef count
2307
2308 /* XXX need this non-static entry for calling from vm_map_print. */
2309 void
2310 vm_object_print(
2311         /* db_expr_t */ long addr,
2312         boolean_t have_addr,
2313         /* db_expr_t */ long count,
2314         char *modif)
2315 {
2316         vm_object_print_static(addr, have_addr, count, modif);
2317 }
2318
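/*
 * DDB "show vmopag": for each object, print runs of resident pages that are
 * contiguous both by page index and by physical address.
 */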
2319 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
2320 {
2321         vm_object_t object;
2322         vm_pindex_t fidx;
2323         vm_paddr_t pa;
2324         vm_page_t m, prev_m;
2325         int rcount, nl, c;
2326
2327         nl = 0;
2328         TAILQ_FOREACH(object, &vm_object_list, object_list) {
2329                 db_printf("new object: %p\n", (void *)object);
2330                 if (nl > 18) {
2331                         c = cngetc();
2332                         if (c != ' ')
2333                                 return;
2334                         nl = 0;
2335                 }
2336                 nl++;
2337                 rcount = 0;
2338                 fidx = 0;
2339                 pa = -1;
2340                 TAILQ_FOREACH(m, &object->memq, listq) {
2341                         if (m->pindex > 128)
2342                                 break;
2343                         if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
2344                             prev_m->pindex + 1 != m->pindex) {
2345                                 if (rcount) {
2346                                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2347                                                 (long)fidx, rcount, (long)pa);
2348                                         if (nl > 18) {
2349                                                 c = cngetc();
2350                                                 if (c != ' ')
2351                                                         return;
2352                                                 nl = 0;
2353                                         }
2354                                         nl++;
2355                                         rcount = 0;
2356                                 }
2357                         }                               
2358                         if (rcount &&
2359                                 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
2360                                 ++rcount;
2361                                 continue;
2362                         }
2363                         if (rcount) {
2364                                 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2365                                         (long)fidx, rcount, (long)pa);
2366                                 if (nl > 18) {
2367                                         c = cngetc();
2368                                         if (c != ' ')
2369                                                 return;
2370                                         nl = 0;
2371                                 }
2372                                 nl++;
2373                         }
2374                         fidx = m->pindex;
2375                         pa = VM_PAGE_TO_PHYS(m);
2376                         rcount = 1;
2377                 }
2378                 if (rcount) {
2379                         db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2380                                 (long)fidx, rcount, (long)pa);
2381                         if (nl > 18) {
2382                                 c = cngetc();
2383                                 if (c != ' ')
2384                                         return;
2385                                 nl = 0;
2386                         }
2387                         nl++;
2388                 }
2389         }
2390 }
2391 #endif /* DDB */