1 /*-
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  */
60
61 /*
62  *      Virtual memory mapping module.
63  */
64
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/kernel.h>
71 #include <sys/ktr.h>
72 #include <sys/lock.h>
73 #include <sys/mutex.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/mman.h>
77 #include <sys/vnode.h>
78 #include <sys/racct.h>
79 #include <sys/resourcevar.h>
80 #include <sys/rwlock.h>
81 #include <sys/file.h>
82 #include <sys/sysctl.h>
83 #include <sys/sysent.h>
84 #include <sys/shm.h>
85
86 #include <vm/vm.h>
87 #include <vm/vm_param.h>
88 #include <vm/pmap.h>
89 #include <vm/vm_map.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_object.h>
92 #include <vm/vm_pager.h>
93 #include <vm/vm_kern.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vnode_pager.h>
96 #include <vm/swap_pager.h>
97 #include <vm/uma.h>
98
99 /*
100  *      Virtual memory maps provide for the mapping, protection,
101  *      and sharing of virtual memory objects.  In addition,
102  *      this module provides for an efficient virtual copy of
103  *      memory from one map to another.
104  *
105  *      Synchronization is required prior to most operations.
106  *
107  *      Maps consist of an ordered doubly-linked list of simple
108  *      entries; a self-adjusting binary search tree of these
109  *      entries is used to speed up lookups.
110  *
111  *      Since portions of maps are specified by start/end addresses,
112  *      which may not align with existing map entries, all
113  *      routines merely "clip" entries to these start/end values.
114  *      [That is, an entry is split into two, bordering at a
115  *      start or end value.]  Note that these clippings may not
116  *      always be necessary (as the two resulting entries are then
117  *      not changed); however, the clipping is done for convenience.
118  *
119  *      As mentioned above, virtual copy operations are performed
120  *      by copying VM object references from one map to
121  *      another, and then marking both regions as copy-on-write.
122  */
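/*
 * Illustrative sketch (not compiled): the usual calling pattern for the
 * routines below is to acquire the map lock, perform one or more entry
 * operations, and release the lock.  The addresses and protections shown
 * here are invented for illustration only.
 *
 *	vm_map_lock(map);
 *	(void)vm_map_insert(map, object, offset, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *	vm_map_unlock(map);
 */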
123
124 static struct mtx map_sleep_mtx;
125 static uma_zone_t mapentzone;
126 static uma_zone_t kmapentzone;
127 static uma_zone_t mapzone;
128 static uma_zone_t vmspace_zone;
129 static int vmspace_zinit(void *mem, int size, int flags);
130 static int vm_map_zinit(void *mem, int size, int flags);
131 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
132     vm_offset_t max);
133 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
134 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
135 #ifdef INVARIANTS
136 static void vm_map_zdtor(void *mem, int size, void *arg);
137 static void vmspace_zdtor(void *mem, int size, void *arg);
138 #endif
139 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
140     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
141     int cow);
142
143 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
144     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
145      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
146
147 /* 
148  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
149  * stable.
150  */
151 #define PROC_VMSPACE_LOCK(p) do { } while (0)
152 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
153
154 /*
155  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
156  *
157  *      Asserts that the starting and ending region
158  *      addresses fall within the valid range of the map.
159  */
160 #define VM_MAP_RANGE_CHECK(map, start, end)             \
161                 {                                       \
162                 if (start < vm_map_min(map))            \
163                         start = vm_map_min(map);        \
164                 if (end > vm_map_max(map))              \
165                         end = vm_map_max(map);          \
166                 if (start > end)                        \
167                         start = end;                    \
168                 }
169
170 /*
171  *      vm_map_startup:
172  *
173  *      Initialize the vm_map module.  Must be called before
174  *      any other vm_map routines.
175  *
176  *      Map and entry structures are allocated from the general
177  *      purpose memory pool with some exceptions:
178  *
179  *      - The kernel map and kmem submap are allocated statically.
180  *      - Kernel map entries are allocated out of a static pool.
181  *
182  *      These restrictions are necessary since malloc() uses the
183  *      maps and requires map entries.
184  */
185
186 void
187 vm_map_startup(void)
188 {
189         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
190         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
191 #ifdef INVARIANTS
192             vm_map_zdtor,
193 #else
194             NULL,
195 #endif
196             vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
197         uma_prealloc(mapzone, MAX_KMAP);
198         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
199             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
200             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
201         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
202             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
203         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
204 #ifdef INVARIANTS
205             vmspace_zdtor,
206 #else
207             NULL,
208 #endif
209             vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
210 }
211
212 static int
213 vmspace_zinit(void *mem, int size, int flags)
214 {
215         struct vmspace *vm;
216
217         vm = (struct vmspace *)mem;
218
219         vm->vm_map.pmap = NULL;
220         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
221         PMAP_LOCK_INIT(vmspace_pmap(vm));
222         return (0);
223 }
224
225 static int
226 vm_map_zinit(void *mem, int size, int flags)
227 {
228         vm_map_t map;
229
230         map = (vm_map_t)mem;
231         memset(map, 0, sizeof(*map));
232         mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
233         sx_init(&map->lock, "vm map (user)");
234         return (0);
235 }
236
237 #ifdef INVARIANTS
238 static void
239 vmspace_zdtor(void *mem, int size, void *arg)
240 {
241         struct vmspace *vm;
242
243         vm = (struct vmspace *)mem;
244
245         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
246 }
247 static void
248 vm_map_zdtor(void *mem, int size, void *arg)
249 {
250         vm_map_t map;
251
252         map = (vm_map_t)mem;
253         KASSERT(map->nentries == 0,
254             ("map %p nentries == %d on free.",
255             map, map->nentries));
256         KASSERT(map->size == 0,
257             ("map %p size == %lu on free.",
258             map, (unsigned long)map->size));
259 }
260 #endif  /* INVARIANTS */
261
262 /*
263  * Allocate a vmspace structure, including a vm_map and pmap,
264  * and initialize those structures.  The refcnt is set to 1.
265  *
266  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
267  */
268 struct vmspace *
269 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
270 {
271         struct vmspace *vm;
272
273         vm = uma_zalloc(vmspace_zone, M_WAITOK);
274
275         KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
276
277         if (pinit == NULL)
278                 pinit = &pmap_pinit;
279
280         if (!pinit(vmspace_pmap(vm))) {
281                 uma_zfree(vmspace_zone, vm);
282                 return (NULL);
283         }
284         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
285         _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
286         vm->vm_refcnt = 1;
287         vm->vm_shm = NULL;
288         vm->vm_swrss = 0;
289         vm->vm_tsize = 0;
290         vm->vm_dsize = 0;
291         vm->vm_ssize = 0;
292         vm->vm_taddr = 0;
293         vm->vm_daddr = 0;
294         vm->vm_maxsaddr = 0;
295         return (vm);
296 }
297
298 static void
299 vmspace_container_reset(struct proc *p)
300 {
301
302 #ifdef RACCT
303         PROC_LOCK(p);
304         racct_set(p, RACCT_DATA, 0);
305         racct_set(p, RACCT_STACK, 0);
306         racct_set(p, RACCT_RSS, 0);
307         racct_set(p, RACCT_MEMLOCK, 0);
308         racct_set(p, RACCT_VMEM, 0);
309         PROC_UNLOCK(p);
310 #endif
311 }
312
313 static inline void
314 vmspace_dofree(struct vmspace *vm)
315 {
316
317         CTR1(KTR_VM, "vmspace_free: %p", vm);
318
319         /*
320          * Make sure any SysV shm is freed, it might not have been in
321          * exit1().
322          */
323         shmexit(vm);
324
325         /*
326          * Lock the map, to wait out all other references to it.
327          * Delete all of the mappings and pages they hold, then call
328          * the pmap module to reclaim anything left.
329          */
330         (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
331             vm->vm_map.max_offset);
332
333         pmap_release(vmspace_pmap(vm));
334         vm->vm_map.pmap = NULL;
335         uma_zfree(vmspace_zone, vm);
336 }
337
338 void
339 vmspace_free(struct vmspace *vm)
340 {
341
342         if (vm->vm_refcnt == 0)
343                 panic("vmspace_free: attempt to free already freed vmspace");
344
345         if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
346                 vmspace_dofree(vm);
347 }
348
349 void
350 vmspace_exitfree(struct proc *p)
351 {
352         struct vmspace *vm;
353
354         PROC_VMSPACE_LOCK(p);
355         vm = p->p_vmspace;
356         p->p_vmspace = NULL;
357         PROC_VMSPACE_UNLOCK(p);
358         KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
359         vmspace_free(vm);
360 }
361
362 void
363 vmspace_exit(struct thread *td)
364 {
365         int refcnt;
366         struct vmspace *vm;
367         struct proc *p;
368
369         /*
370          * Release user portion of address space.
371          * This releases references to vnodes,
372          * which could cause I/O if the file has been unlinked.
373          * Need to do this early enough that we can still sleep.
374          *
375          * The last exiting process to reach this point releases as
376          * much of the environment as it can. vmspace_dofree() is the
377          * slower fallback in case another process had a temporary
378          * reference to the vmspace.
379          */
380
381         p = td->td_proc;
382         vm = p->p_vmspace;
383         atomic_add_int(&vmspace0.vm_refcnt, 1);
384         do {
385                 refcnt = vm->vm_refcnt;
386                 if (refcnt > 1 && p->p_vmspace != &vmspace0) {
387                         /* Switch now since other proc might free vmspace */
388                         PROC_VMSPACE_LOCK(p);
389                         p->p_vmspace = &vmspace0;
390                         PROC_VMSPACE_UNLOCK(p);
391                         pmap_activate(td);
392                 }
393         } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
394         if (refcnt == 1) {
395                 if (p->p_vmspace != vm) {
396                         /* vmspace not yet freed, switch back */
397                         PROC_VMSPACE_LOCK(p);
398                         p->p_vmspace = vm;
399                         PROC_VMSPACE_UNLOCK(p);
400                         pmap_activate(td);
401                 }
402                 pmap_remove_pages(vmspace_pmap(vm));
403                 /* Switch now since this proc will free vmspace */
404                 PROC_VMSPACE_LOCK(p);
405                 p->p_vmspace = &vmspace0;
406                 PROC_VMSPACE_UNLOCK(p);
407                 pmap_activate(td);
408                 vmspace_dofree(vm);
409         }
410         vmspace_container_reset(p);
411 }
412
413 /* Acquire reference to vmspace owned by another process. */
414
415 struct vmspace *
416 vmspace_acquire_ref(struct proc *p)
417 {
418         struct vmspace *vm;
419         int refcnt;
420
421         PROC_VMSPACE_LOCK(p);
422         vm = p->p_vmspace;
423         if (vm == NULL) {
424                 PROC_VMSPACE_UNLOCK(p);
425                 return (NULL);
426         }
427         do {
428                 refcnt = vm->vm_refcnt;
429                 if (refcnt <= 0) {      /* Avoid 0->1 transition */
430                         PROC_VMSPACE_UNLOCK(p);
431                         return (NULL);
432                 }
433         } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
434         if (vm != p->p_vmspace) {
435                 PROC_VMSPACE_UNLOCK(p);
436                 vmspace_free(vm);
437                 return (NULL);
438         }
439         PROC_VMSPACE_UNLOCK(p);
440         return (vm);
441 }
442
443 void
444 _vm_map_lock(vm_map_t map, const char *file, int line)
445 {
446
447         if (map->system_map)
448                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
449         else
450                 sx_xlock_(&map->lock, file, line);
451         map->timestamp++;
452 }
453
454 static void
455 vm_map_process_deferred(void)
456 {
457         struct thread *td;
458         vm_map_entry_t entry, next;
459         vm_object_t object;
460
461         td = curthread;
462         entry = td->td_map_def_user;
463         td->td_map_def_user = NULL;
464         while (entry != NULL) {
465                 next = entry->next;
466                 if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
467                         /*
468                          * Decrement the object's writemappings and
469                          * possibly the vnode's v_writecount.
470                          */
471                         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
472                             ("Submap with writecount"));
473                         object = entry->object.vm_object;
474                         KASSERT(object != NULL, ("No object for writecount"));
475                         vnode_pager_release_writecount(object, entry->start,
476                             entry->end);
477                 }
478                 vm_map_entry_deallocate(entry, FALSE);
479                 entry = next;
480         }
481 }
482
483 void
484 _vm_map_unlock(vm_map_t map, const char *file, int line)
485 {
486
487         if (map->system_map)
488                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
489         else {
490                 sx_xunlock_(&map->lock, file, line);
491                 vm_map_process_deferred();
492         }
493 }
494
495 void
496 _vm_map_lock_read(vm_map_t map, const char *file, int line)
497 {
498
499         if (map->system_map)
500                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
501         else
502                 sx_slock_(&map->lock, file, line);
503 }
504
505 void
506 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
507 {
508
509         if (map->system_map)
510                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
511         else {
512                 sx_sunlock_(&map->lock, file, line);
513                 vm_map_process_deferred();
514         }
515 }
516
517 int
518 _vm_map_trylock(vm_map_t map, const char *file, int line)
519 {
520         int error;
521
522         error = map->system_map ?
523             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
524             !sx_try_xlock_(&map->lock, file, line);
525         if (error == 0)
526                 map->timestamp++;
527         return (error == 0);
528 }
529
530 int
531 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
532 {
533         int error;
534
535         error = map->system_map ?
536             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
537             !sx_try_slock_(&map->lock, file, line);
538         return (error == 0);
539 }
540
541 /*
542  *      _vm_map_lock_upgrade:   [ internal use only ]
543  *
544  *      Tries to upgrade a read (shared) lock on the specified map to a write
545  *      (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
546  *      non-zero value if the upgrade fails.  If the upgrade fails, the map is
547  *      returned without a read or write lock held.
548  *
549  *      Requires that the map be read locked.
550  */
551 int
552 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
553 {
554         unsigned int last_timestamp;
555
556         if (map->system_map) {
557                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
558         } else {
559                 if (!sx_try_upgrade_(&map->lock, file, line)) {
560                         last_timestamp = map->timestamp;
561                         sx_sunlock_(&map->lock, file, line);
562                         vm_map_process_deferred();
563                         /*
564                          * If the map's timestamp does not change while the
565                          * map is unlocked, then the upgrade succeeds.
566                          */
567                         sx_xlock_(&map->lock, file, line);
568                         if (last_timestamp != map->timestamp) {
569                                 sx_xunlock_(&map->lock, file, line);
570                                 return (1);
571                         }
572                 }
573         }
574         map->timestamp++;
575         return (0);
576 }
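/*
 * Illustrative sketch (not compiled): a typical caller of
 * vm_map_lock_upgrade() must be prepared for failure, after which no lock
 * is held and any cached entry pointers must be revalidated.
 *
 *	vm_map_lock_read(map);
 *	...
 *	if (vm_map_lock_upgrade(map) != 0) {
 *		(upgrade failed; the map is now unlocked)
 *		vm_map_lock(map);
 *		(re-lookup any entries before continuing)
 *	}
 */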
577
578 void
579 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
580 {
581
582         if (map->system_map) {
583                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
584         } else
585                 sx_downgrade_(&map->lock, file, line);
586 }
587
588 /*
589  *      vm_map_locked:
590  *
591  *      Returns a non-zero value if the caller holds a write (exclusive) lock
592  *      on the specified map and the value "0" otherwise.
593  */
594 int
595 vm_map_locked(vm_map_t map)
596 {
597
598         if (map->system_map)
599                 return (mtx_owned(&map->system_mtx));
600         else
601                 return (sx_xlocked(&map->lock));
602 }
603
604 #ifdef INVARIANTS
605 static void
606 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
607 {
608
609         if (map->system_map)
610                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
611         else
612                 sx_assert_(&map->lock, SA_XLOCKED, file, line);
613 }
614
615 #define VM_MAP_ASSERT_LOCKED(map) \
616     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
617 #else
618 #define VM_MAP_ASSERT_LOCKED(map)
619 #endif
620
621 /*
622  *      _vm_map_unlock_and_wait:
623  *
624  *      Atomically releases the lock on the specified map and puts the calling
625  *      thread to sleep.  The calling thread will remain asleep until either
626  *      vm_map_wakeup() is performed on the map or the specified timeout is
627  *      exceeded.
628  *
629  *      WARNING!  This function does not perform deferred deallocations of
630  *      objects and map entries.  Therefore, the calling thread is expected to
631  *      reacquire the map lock after reawakening and later perform an ordinary
632  *      unlock operation, such as vm_map_unlock(), before completing its
633  *      operation on the map.
634  */
635 int
636 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
637 {
638
639         mtx_lock(&map_sleep_mtx);
640         if (map->system_map)
641                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
642         else
643                 sx_xunlock_(&map->lock, file, line);
644         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
645             timo));
646 }
647
648 /*
649  *      vm_map_wakeup:
650  *
651  *      Awaken any threads that have slept on the map using
652  *      vm_map_unlock_and_wait().
653  */
654 void
655 vm_map_wakeup(vm_map_t map)
656 {
657
658         /*
659          * Acquire and release map_sleep_mtx to prevent a wakeup()
660          * from being performed (and lost) between the map unlock
661          * and the msleep() in _vm_map_unlock_and_wait().
662          */
663         mtx_lock(&map_sleep_mtx);
664         mtx_unlock(&map_sleep_mtx);
665         wakeup(&map->root);
666 }
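/*
 * Illustrative sketch (not compiled): the sleep/wakeup facility above is
 * normally used as a condition loop.  Because vm_map_unlock_and_wait()
 * skips deferred deallocations, the caller re-locks the map after waking
 * and later performs an ordinary unlock.  The predicate below is
 * hypothetical.
 *
 *	vm_map_lock(map);
 *	while (condition_not_yet_true(map)) {
 *		(void)vm_map_unlock_and_wait(map, 0);
 *		vm_map_lock(map);
 *	}
 *	...
 *	vm_map_unlock(map);
 *
 * The thread that changes the state does so under the map lock and then
 * calls vm_map_wakeup(map).
 */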
667
668 void
669 vm_map_busy(vm_map_t map)
670 {
671
672         VM_MAP_ASSERT_LOCKED(map);
673         map->busy++;
674 }
675
676 void
677 vm_map_unbusy(vm_map_t map)
678 {
679
680         VM_MAP_ASSERT_LOCKED(map);
681         KASSERT(map->busy, ("vm_map_unbusy: not busy"));
682         if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
683                 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
684                 wakeup(&map->busy);
685         }
686 }
687
688 void 
689 vm_map_wait_busy(vm_map_t map)
690 {
691
692         VM_MAP_ASSERT_LOCKED(map);
693         while (map->busy) {
694                 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
695                 if (map->system_map)
696                         msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
697                 else
698                         sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
699         }
700         map->timestamp++;
701 }
702
703 long
704 vmspace_resident_count(struct vmspace *vmspace)
705 {
706         return pmap_resident_count(vmspace_pmap(vmspace));
707 }
708
709 /*
710  *      vm_map_create:
711  *
712  *      Creates and returns a new empty VM map with
713  *      the given physical map structure, and having
714  *      the given lower and upper address bounds.
715  */
716 vm_map_t
717 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
718 {
719         vm_map_t result;
720
721         result = uma_zalloc(mapzone, M_WAITOK);
722         CTR1(KTR_VM, "vm_map_create: %p", result);
723         _vm_map_init(result, pmap, min, max);
724         return (result);
725 }
726
727 /*
728  * Initialize an existing vm_map structure
729  * such as that in the vmspace structure.
730  */
731 static void
732 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
733 {
734
735         map->header.next = map->header.prev = &map->header;
736         map->needs_wakeup = FALSE;
737         map->system_map = 0;
738         map->pmap = pmap;
739         map->min_offset = min;
740         map->max_offset = max;
741         map->flags = 0;
742         map->root = NULL;
743         map->timestamp = 0;
744         map->busy = 0;
745 }
746
747 void
748 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
749 {
750
751         _vm_map_init(map, pmap, min, max);
752         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
753         sx_init(&map->lock, "user map");
754 }
755
756 /*
757  *      vm_map_entry_dispose:   [ internal use only ]
758  *
759  *      Inverse of vm_map_entry_create.
760  */
761 static void
762 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
763 {
764         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
765 }
766
767 /*
768  *      vm_map_entry_create:    [ internal use only ]
769  *
770  *      Allocates a VM map entry for insertion.
771  *      No entry fields are filled in.
772  */
773 static vm_map_entry_t
774 vm_map_entry_create(vm_map_t map)
775 {
776         vm_map_entry_t new_entry;
777
778         if (map->system_map)
779                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
780         else
781                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
782         if (new_entry == NULL)
783                 panic("vm_map_entry_create: kernel resources exhausted");
784         return (new_entry);
785 }
786
787 /*
788  *      vm_map_entry_set_behavior:
789  *
790  *      Set the expected access behavior, either normal, random, or
791  *      sequential.
792  */
793 static inline void
794 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
795 {
796         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
797             (behavior & MAP_ENTRY_BEHAV_MASK);
798 }
799
800 /*
801  *      vm_map_entry_set_max_free:
802  *
803  *      Set the max_free field in a vm_map_entry.
804  */
805 static inline void
806 vm_map_entry_set_max_free(vm_map_entry_t entry)
807 {
808
809         entry->max_free = entry->adj_free;
810         if (entry->left != NULL && entry->left->max_free > entry->max_free)
811                 entry->max_free = entry->left->max_free;
812         if (entry->right != NULL && entry->right->max_free > entry->max_free)
813                 entry->max_free = entry->right->max_free;
814 }
815
816 /*
817  *      vm_map_entry_splay:
818  *
819  *      The Sleator and Tarjan top-down splay algorithm with the
820  *      following variation.  Max_free must be computed bottom-up, so
821  *      on the downward pass, maintain the left and right spines in
822  *      reverse order.  Then, make a second pass up each side to fix
823  *      the pointers and compute max_free.  The time bound is O(log n)
824  *      amortized.
825  *
826  *      The new root is the vm_map_entry containing "addr", or else an
827  *      adjacent entry (lower or higher) if addr is not in the tree.
828  *
829  *      The map must be locked, and leaves it so.
830  *
831  *      Returns: the new root.
832  */
833 static vm_map_entry_t
834 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
835 {
836         vm_map_entry_t llist, rlist;
837         vm_map_entry_t ltree, rtree;
838         vm_map_entry_t y;
839
840         /* Special case of empty tree. */
841         if (root == NULL)
842                 return (root);
843
844         /*
845          * Pass One: Splay down the tree until we find addr or a NULL
846          * pointer where addr would go.  llist and rlist are the two
847          * sides in reverse order (bottom-up), with llist linked by
848          * the right pointer and rlist linked by the left pointer in
849          * the vm_map_entry.  Wait until Pass Two to set max_free on
850          * the two spines.
851          */
852         llist = NULL;
853         rlist = NULL;
854         for (;;) {
855                 /* root is never NULL in here. */
856                 if (addr < root->start) {
857                         y = root->left;
858                         if (y == NULL)
859                                 break;
860                         if (addr < y->start && y->left != NULL) {
861                                 /* Rotate right and put y on rlist. */
862                                 root->left = y->right;
863                                 y->right = root;
864                                 vm_map_entry_set_max_free(root);
865                                 root = y->left;
866                                 y->left = rlist;
867                                 rlist = y;
868                         } else {
869                                 /* Put root on rlist. */
870                                 root->left = rlist;
871                                 rlist = root;
872                                 root = y;
873                         }
874                 } else if (addr >= root->end) {
875                         y = root->right;
876                         if (y == NULL)
877                                 break;
878                         if (addr >= y->end && y->right != NULL) {
879                                 /* Rotate left and put y on llist. */
880                                 root->right = y->left;
881                                 y->left = root;
882                                 vm_map_entry_set_max_free(root);
883                                 root = y->right;
884                                 y->right = llist;
885                                 llist = y;
886                         } else {
887                                 /* Put root on llist. */
888                                 root->right = llist;
889                                 llist = root;
890                                 root = y;
891                         }
892                 } else
893                         break;
894         }
895
896         /*
897          * Pass Two: Walk back up the two spines, flip the pointers
898          * and set max_free.  The subtrees of the root go at the
899          * bottom of llist and rlist.
900          */
901         ltree = root->left;
902         while (llist != NULL) {
903                 y = llist->right;
904                 llist->right = ltree;
905                 vm_map_entry_set_max_free(llist);
906                 ltree = llist;
907                 llist = y;
908         }
909         rtree = root->right;
910         while (rlist != NULL) {
911                 y = rlist->left;
912                 rlist->left = rtree;
913                 vm_map_entry_set_max_free(rlist);
914                 rtree = rlist;
915                 rlist = y;
916         }
917
918         /*
919          * Final assembly: add ltree and rtree as subtrees of root.
920          */
921         root->left = ltree;
922         root->right = rtree;
923         vm_map_entry_set_max_free(root);
924
925         return (root);
926 }
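/*
 * Worked example (values invented): for an entry E with E->adj_free of
 * 2 pages, a left child whose max_free is 8 pages, and a right child
 * whose max_free is 4 pages, vm_map_entry_set_max_free() leaves
 * E->max_free at 8 pages, the largest gap anywhere in E's subtree.
 * vm_map_findspace() relies on this invariant to prune whole subtrees
 * that cannot satisfy an allocation request.
 */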
927
928 /*
929  *      vm_map_entry_{un,}link:
930  *
931  *      Insert/remove entries from maps.
932  */
933 static void
934 vm_map_entry_link(vm_map_t map,
935                   vm_map_entry_t after_where,
936                   vm_map_entry_t entry)
937 {
938
939         CTR4(KTR_VM,
940             "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
941             map->nentries, entry, after_where);
942         VM_MAP_ASSERT_LOCKED(map);
943         KASSERT(after_where == &map->header ||
944             after_where->end <= entry->start,
945             ("vm_map_entry_link: prev end %jx new start %jx overlap",
946             (uintmax_t)after_where->end, (uintmax_t)entry->start));
947         KASSERT(after_where->next == &map->header ||
948             entry->end <= after_where->next->start,
949             ("vm_map_entry_link: new end %jx next start %jx overlap",
950             (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
951
952         map->nentries++;
953         entry->prev = after_where;
954         entry->next = after_where->next;
955         entry->next->prev = entry;
956         after_where->next = entry;
957
958         if (after_where != &map->header) {
959                 if (after_where != map->root)
960                         vm_map_entry_splay(after_where->start, map->root);
961                 entry->right = after_where->right;
962                 entry->left = after_where;
963                 after_where->right = NULL;
964                 after_where->adj_free = entry->start - after_where->end;
965                 vm_map_entry_set_max_free(after_where);
966         } else {
967                 entry->right = map->root;
968                 entry->left = NULL;
969         }
970         entry->adj_free = (entry->next == &map->header ? map->max_offset :
971             entry->next->start) - entry->end;
972         vm_map_entry_set_max_free(entry);
973         map->root = entry;
974 }
975
976 static void
977 vm_map_entry_unlink(vm_map_t map,
978                     vm_map_entry_t entry)
979 {
980         vm_map_entry_t next, prev, root;
981
982         VM_MAP_ASSERT_LOCKED(map);
983         if (entry != map->root)
984                 vm_map_entry_splay(entry->start, map->root);
985         if (entry->left == NULL)
986                 root = entry->right;
987         else {
988                 root = vm_map_entry_splay(entry->start, entry->left);
989                 root->right = entry->right;
990                 root->adj_free = (entry->next == &map->header ? map->max_offset :
991                     entry->next->start) - root->end;
992                 vm_map_entry_set_max_free(root);
993         }
994         map->root = root;
995
996         prev = entry->prev;
997         next = entry->next;
998         next->prev = prev;
999         prev->next = next;
1000         map->nentries--;
1001         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1002             map->nentries, entry);
1003 }
1004
1005 /*
1006  *      vm_map_entry_resize_free:
1007  *
1008  *      Recompute the amount of free space following a vm_map_entry
1009  *      and propagate that value up the tree.  Call this function after
1010  *      resizing a map entry in-place, that is, without a call to
1011  *      vm_map_entry_link() or _unlink().
1012  *
1013  *      The map must be locked, and leaves it so.
1014  */
1015 static void
1016 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1017 {
1018
1019         /*
1020          * Using splay trees without parent pointers, propagating
1021          * max_free up the tree is done by moving the entry to the
1022          * root and making the change there.
1023          */
1024         if (entry != map->root)
1025                 map->root = vm_map_entry_splay(entry->start, map->root);
1026
1027         entry->adj_free = (entry->next == &map->header ? map->max_offset :
1028             entry->next->start) - entry->end;
1029         vm_map_entry_set_max_free(entry);
1030 }
1031
1032 /*
1033  *      vm_map_lookup_entry:    [ internal use only ]
1034  *
1035  *      Finds the map entry containing (or
1036  *      immediately preceding) the specified address
1037  *      in the given map; the entry is returned
1038  *      in the "entry" parameter.  The boolean
1039  *      result indicates whether the address is
1040  *      actually contained in the map.
1041  */
1042 boolean_t
1043 vm_map_lookup_entry(
1044         vm_map_t map,
1045         vm_offset_t address,
1046         vm_map_entry_t *entry)  /* OUT */
1047 {
1048         vm_map_entry_t cur;
1049         boolean_t locked;
1050
1051         /*
1052          * If the map is empty, then the map entry immediately preceding
1053          * "address" is the map's header.
1054          */
1055         cur = map->root;
1056         if (cur == NULL)
1057                 *entry = &map->header;
1058         else if (address >= cur->start && cur->end > address) {
1059                 *entry = cur;
1060                 return (TRUE);
1061         } else if ((locked = vm_map_locked(map)) ||
1062             sx_try_upgrade(&map->lock)) {
1063                 /*
1064                  * Splay requires a write lock on the map.  However, it only
1065                  * restructures the binary search tree; it does not otherwise
1066                  * change the map.  Thus, the map's timestamp need not change
1067                  * on a temporary upgrade.
1068                  */
1069                 map->root = cur = vm_map_entry_splay(address, cur);
1070                 if (!locked)
1071                         sx_downgrade(&map->lock);
1072
1073                 /*
1074                  * If "address" is contained within a map entry, the new root
1075                  * is that map entry.  Otherwise, the new root is a map entry
1076                  * immediately before or after "address".
1077                  */
1078                 if (address >= cur->start) {
1079                         *entry = cur;
1080                         if (cur->end > address)
1081                                 return (TRUE);
1082                 } else
1083                         *entry = cur->prev;
1084         } else
1085                 /*
1086                  * Since the map is only locked for read access, perform a
1087                  * standard binary search tree lookup for "address".
1088                  */
1089                 for (;;) {
1090                         if (address < cur->start) {
1091                                 if (cur->left == NULL) {
1092                                         *entry = cur->prev;
1093                                         break;
1094                                 }
1095                                 cur = cur->left;
1096                         } else if (cur->end > address) {
1097                                 *entry = cur;
1098                                 return (TRUE);
1099                         } else {
1100                                 if (cur->right == NULL) {
1101                                         *entry = cur;
1102                                         break;
1103                                 }
1104                                 cur = cur->right;
1105                         }
1106                 }
1107         return (FALSE);
1108 }
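/*
 * Illustrative sketch (not compiled): a typical lookup holds at least a
 * read lock and checks the boolean result before using the entry.
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock_read(map);
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		("addr" lies within [entry->start, entry->end))
 *	} else {
 *		("entry" immediately precedes "addr", possibly &map->header)
 *	}
 *	vm_map_unlock_read(map);
 */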
1109
1110 /*
1111  *      vm_map_insert:
1112  *
1113  *      Inserts the given whole VM object into the target
1114  *      map at the specified address range.  The object's
1115  *      size should match that of the address range.
1116  *
1117  *      Requires that the map be locked, and leaves it so.
1118  *
1119  *      If object is non-NULL, ref count must be bumped by caller
1120  *      prior to making call to account for the new entry.
1121  */
1122 int
1123 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1124     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1125 {
1126         vm_map_entry_t new_entry, prev_entry, temp_entry;
1127         vm_eflags_t protoeflags;
1128         struct ucred *cred;
1129         vm_inherit_t inheritance;
1130
1131         VM_MAP_ASSERT_LOCKED(map);
1132         KASSERT((object != kmem_object && object != kernel_object) ||
1133             (cow & MAP_COPY_ON_WRITE) == 0,
1134             ("vm_map_insert: kmem or kernel object and COW"));
1135         KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1136             ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1137
1138         /*
1139          * Check that the start and end points are not bogus.
1140          */
1141         if ((start < map->min_offset) || (end > map->max_offset) ||
1142             (start >= end))
1143                 return (KERN_INVALID_ADDRESS);
1144
1145         /*
1146          * Find the entry prior to the proposed starting address; if it's part
1147          * of an existing entry, this range is bogus.
1148          */
1149         if (vm_map_lookup_entry(map, start, &temp_entry))
1150                 return (KERN_NO_SPACE);
1151
1152         prev_entry = temp_entry;
1153
1154         /*
1155          * Assert that the next entry doesn't overlap the end point.
1156          */
1157         if ((prev_entry->next != &map->header) &&
1158             (prev_entry->next->start < end))
1159                 return (KERN_NO_SPACE);
1160
1161         protoeflags = 0;
1162         if (cow & MAP_COPY_ON_WRITE)
1163                 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1164         if (cow & MAP_NOFAULT)
1165                 protoeflags |= MAP_ENTRY_NOFAULT;
1166         if (cow & MAP_DISABLE_SYNCER)
1167                 protoeflags |= MAP_ENTRY_NOSYNC;
1168         if (cow & MAP_DISABLE_COREDUMP)
1169                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1170         if (cow & MAP_STACK_GROWS_DOWN)
1171                 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1172         if (cow & MAP_STACK_GROWS_UP)
1173                 protoeflags |= MAP_ENTRY_GROWS_UP;
1174         if (cow & MAP_VN_WRITECOUNT)
1175                 protoeflags |= MAP_ENTRY_VN_WRITECNT;
1176         if (cow & MAP_INHERIT_SHARE)
1177                 inheritance = VM_INHERIT_SHARE;
1178         else
1179                 inheritance = VM_INHERIT_DEFAULT;
1180
1181         cred = NULL;
1182         if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
1183                 goto charged;
1184         if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1185             ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1186                 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1187                         return (KERN_RESOURCE_SHORTAGE);
1188                 KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) ||
1189                     object->cred == NULL,
1190                     ("OVERCOMMIT: vm_map_insert o %p", object));
1191                 cred = curthread->td_ucred;
1192         }
1193
1194 charged:
1195         /* Expand the kernel pmap, if necessary. */
1196         if (map == kernel_map && end > kernel_vm_end)
1197                 pmap_growkernel(end);
1198         if (object != NULL) {
1199                 /*
1200                  * OBJ_ONEMAPPING must be cleared unless this mapping
1201                  * is trivially proven to be the only mapping for any
1202                  * of the object's pages.  (Object granularity
1203                  * reference counting is insufficient to recognize
1204                  * aliases with precision.)
1205                  */
1206                 VM_OBJECT_WLOCK(object);
1207                 if (object->ref_count > 1 || object->shadow_count != 0)
1208                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1209                 VM_OBJECT_WUNLOCK(object);
1210         }
1211         else if ((prev_entry != &map->header) &&
1212                  (prev_entry->eflags == protoeflags) &&
1213                  (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1214                  (prev_entry->end == start) &&
1215                  (prev_entry->wired_count == 0) &&
1216                  (prev_entry->cred == cred ||
1217                   (prev_entry->object.vm_object != NULL &&
1218                    (prev_entry->object.vm_object->cred == cred))) &&
1219                    vm_object_coalesce(prev_entry->object.vm_object,
1220                        prev_entry->offset,
1221                        (vm_size_t)(prev_entry->end - prev_entry->start),
1222                        (vm_size_t)(end - prev_entry->end), cred != NULL &&
1223                        (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1224                 /*
1225                  * We were able to extend the object.  Determine if we
1226                  * can extend the previous map entry to include the
1227                  * new range as well.
1228                  */
1229                 if ((prev_entry->inheritance == inheritance) &&
1230                     (prev_entry->protection == prot) &&
1231                     (prev_entry->max_protection == max)) {
1232                         map->size += (end - prev_entry->end);
1233                         prev_entry->end = end;
1234                         vm_map_entry_resize_free(map, prev_entry);
1235                         vm_map_simplify_entry(map, prev_entry);
1236                         return (KERN_SUCCESS);
1237                 }
1238
1239                 /*
1240                  * If we can extend the object but cannot extend the
1241                  * map entry, we have to create a new map entry.  We
1242                  * must bump the ref count on the extended object to
1243                  * account for it.  object may be NULL.
1244                  */
1245                 object = prev_entry->object.vm_object;
1246                 offset = prev_entry->offset +
1247                         (prev_entry->end - prev_entry->start);
1248                 vm_object_reference(object);
1249                 if (cred != NULL && object != NULL && object->cred != NULL &&
1250                     !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1251                         /* Object already accounts for this uid. */
1252                         cred = NULL;
1253                 }
1254         }
1255         if (cred != NULL)
1256                 crhold(cred);
1257
1258         /*
1259          * Create a new entry
1260          */
1261         new_entry = vm_map_entry_create(map);
1262         new_entry->start = start;
1263         new_entry->end = end;
1264         new_entry->cred = NULL;
1265
1266         new_entry->eflags = protoeflags;
1267         new_entry->object.vm_object = object;
1268         new_entry->offset = offset;
1269         new_entry->avail_ssize = 0;
1270
1271         new_entry->inheritance = inheritance;
1272         new_entry->protection = prot;
1273         new_entry->max_protection = max;
1274         new_entry->wired_count = 0;
1275         new_entry->wiring_thread = NULL;
1276         new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1277         new_entry->next_read = OFF_TO_IDX(offset);
1278
1279         KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1280             ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
1281         new_entry->cred = cred;
1282
1283         /*
1284          * Insert the new entry into the list
1285          */
1286         vm_map_entry_link(map, prev_entry, new_entry);
1287         map->size += new_entry->end - new_entry->start;
1288
1289         /*
1290          * Try to coalesce the new entry with both the previous and next
1291          * entries in the list.  Previously, we only attempted to coalesce
1292          * with the previous entry when object is NULL.  Here, we handle the
1293          * other cases, which are less common.
1294          */
1295         vm_map_simplify_entry(map, new_entry);
1296
1297         if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
1298                 vm_map_pmap_enter(map, start, prot,
1299                                     object, OFF_TO_IDX(offset), end - start,
1300                                     cow & MAP_PREFAULT_PARTIAL);
1301         }
1302
1303         return (KERN_SUCCESS);
1304 }
1305
1306 /*
1307  *      vm_map_findspace:
1308  *
1309  *      Find the first fit (lowest VM address) for "length" free bytes
1310  *      beginning at address >= start in the given map.
1311  *
1312  *      In a vm_map_entry, "adj_free" is the amount of free space
1313  *      adjacent (higher address) to this entry, and "max_free" is the
1314  *      maximum amount of contiguous free space in its subtree.  This
1315  *      allows finding a free region in one path down the tree, so
1316  *      O(log n) amortized with splay trees.
1317  *
1318  *      The map must be locked, and leaves it so.
1319  *
1320  *      Returns: 0 on success, and starting address in *addr,
1321  *               1 if insufficient space.
1322  */
1323 int
1324 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1325     vm_offset_t *addr)  /* OUT */
1326 {
1327         vm_map_entry_t entry;
1328         vm_offset_t st;
1329
1330         /*
1331          * Request must fit within min/max VM address and must avoid
1332          * address wrap.
1333          */
1334         if (start < map->min_offset)
1335                 start = map->min_offset;
1336         if (start + length > map->max_offset || start + length < start)
1337                 return (1);
1338
1339         /* Empty tree means wide open address space. */
1340         if (map->root == NULL) {
1341                 *addr = start;
1342                 return (0);
1343         }
1344
1345         /*
1346          * After splay, if start comes before root node, then there
1347          * must be a gap from start to the root.
1348          */
1349         map->root = vm_map_entry_splay(start, map->root);
1350         if (start + length <= map->root->start) {
1351                 *addr = start;
1352                 return (0);
1353         }
1354
1355         /*
1356          * Root is the last node that might begin its gap before
1357          * start, and this is the last comparison where address
1358          * wrap might be a problem.
1359          */
1360         st = (start > map->root->end) ? start : map->root->end;
1361         if (length <= map->root->end + map->root->adj_free - st) {
1362                 *addr = st;
1363                 return (0);
1364         }
1365
1366         /* With max_free, can immediately tell if no solution. */
1367         entry = map->root->right;
1368         if (entry == NULL || length > entry->max_free)
1369                 return (1);
1370
1371         /*
1372          * Search the right subtree in the order: left subtree, root,
1373          * right subtree (first fit).  The previous splay implies that
1374          * all regions in the right subtree have addresses > start.
1375          */
1376         while (entry != NULL) {
1377                 if (entry->left != NULL && entry->left->max_free >= length)
1378                         entry = entry->left;
1379                 else if (entry->adj_free >= length) {
1380                         *addr = entry->end;
1381                         return (0);
1382                 } else
1383                         entry = entry->right;
1384         }
1385
1386         /* Can't get here, so panic if we do. */
1387         panic("vm_map_findspace: max_free corrupt");
1388 }
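/*
 * Illustrative sketch (not compiled): vm_map_findspace() only locates a
 * gap; the caller inserts into it during the same lock hold, much as
 * vm_map_find() below does.  "prot" and "max" stand for whatever
 * protections the caller wants.
 *
 *	vm_map_lock(map);
 *	if (vm_map_findspace(map, start, length, &addr) == 0)
 *		rv = vm_map_insert(map, NULL, 0, addr, addr + length,
 *		    prot, max, 0);
 *	vm_map_unlock(map);
 */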
1389
1390 int
1391 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1392     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1393     vm_prot_t max, int cow)
1394 {
1395         vm_offset_t end;
1396         int result;
1397
1398         end = start + length;
1399         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1400             object == NULL,
1401             ("vm_map_fixed: non-NULL backing object for stack"));
1402         vm_map_lock(map);
1403         VM_MAP_RANGE_CHECK(map, start, end);
1404         if ((cow & MAP_CHECK_EXCL) == 0)
1405                 vm_map_delete(map, start, end);
1406         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1407                 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1408                     prot, max, cow);
1409         } else {
1410                 result = vm_map_insert(map, object, offset, start, end,
1411                     prot, max, cow);
1412         }
1413         vm_map_unlock(map);
1414         return (result);
1415 }
1416
1417 /*
1418  *      vm_map_find finds an unallocated region in the target address
1419  *      map with the given length.  The search is defined to be
1420  *      first-fit from the specified address; the region found is
1421  *      returned in the same parameter.
1422  *
1423  *      If object is non-NULL, ref count must be bumped by caller
1424  *      prior to making call to account for the new entry.
1425  */
1426 int
1427 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1428             vm_offset_t *addr,  /* IN/OUT */
1429             vm_size_t length, vm_offset_t max_addr, int find_space,
1430             vm_prot_t prot, vm_prot_t max, int cow)
1431 {
1432         vm_offset_t alignment, initial_addr, start;
1433         int result;
1434
1435         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1436             object == NULL,
1437             ("vm_map_find: non-NULL backing object for stack"));
1438         if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1439             (object->flags & OBJ_COLORED) == 0))
1440                 find_space = VMFS_ANY_SPACE;
1441         if (find_space >> 8 != 0) {
1442                 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1443                 alignment = (vm_offset_t)1 << (find_space >> 8);
1444         } else
1445                 alignment = 0;
1446         initial_addr = *addr;
1447 again:
1448         start = initial_addr;
1449         vm_map_lock(map);
1450         do {
1451                 if (find_space != VMFS_NO_SPACE) {
1452                         if (vm_map_findspace(map, start, length, addr) ||
1453                             (max_addr != 0 && *addr + length > max_addr)) {
1454                                 vm_map_unlock(map);
1455                                 if (find_space == VMFS_OPTIMAL_SPACE) {
1456                                         find_space = VMFS_ANY_SPACE;
1457                                         goto again;
1458                                 }
1459                                 return (KERN_NO_SPACE);
1460                         }
1461                         switch (find_space) {
1462                         case VMFS_SUPER_SPACE:
1463                         case VMFS_OPTIMAL_SPACE:
1464                                 pmap_align_superpage(object, offset, addr,
1465                                     length);
1466                                 break;
1467                         case VMFS_ANY_SPACE:
1468                                 break;
1469                         default:
1470                                 if ((*addr & (alignment - 1)) != 0) {
1471                                         *addr &= ~(alignment - 1);
1472                                         *addr += alignment;
1473                                 }
1474                                 break;
1475                         }
1476
1477                         start = *addr;
1478                 }
1479                 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1480                         result = vm_map_stack_locked(map, start, length,
1481                             sgrowsiz, prot, max, cow);
1482                 } else {
1483                         result = vm_map_insert(map, object, offset, start,
1484                             start + length, prot, max, cow);
1485                 }
1486         } while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
1487             find_space != VMFS_ANY_SPACE);
1488         vm_map_unlock(map);
1489         return (result);
1490 }
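/*
 * Illustrative sketch (not compiled): a minimal anonymous allocation via
 * vm_map_find(), letting the map choose any suitable address at or above
 * the initial "*addr".  The protections shown are only an example.
 *
 *	rv = vm_map_find(map, NULL, 0, &addr, size, 0, VMFS_ANY_SPACE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *	if (rv != KERN_SUCCESS)
 *		(no space or resource shortage; handle the error)
 */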
1491
1492 /*
1493  *      vm_map_simplify_entry:
1494  *
1495  *      Simplify the given map entry by merging with either neighbor.  This
1496  *      routine also has the ability to merge with both neighbors.
1497  *
1498  *      The map must be locked.
1499  *
1500  *      This routine guarantees that the passed entry remains valid (though
1501  *      possibly extended).  When merging, this routine may delete one or
1502  *      both neighbors.
1503  */
1504 void
1505 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1506 {
1507         vm_map_entry_t next, prev;
1508         vm_size_t prevsize, esize;
1509
1510         if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
1511             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
1512                 return;
1513
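        /*
         * A neighbor can be merged into this entry only if it maps the
         * same object contiguously and matches every attribute checked
         * below: eflags, protection, max_protection, inheritance,
         * wired_count, and cred.
         */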
1514         prev = entry->prev;
1515         if (prev != &map->header) {
1516                 prevsize = prev->end - prev->start;
1517                 if ( (prev->end == entry->start) &&
1518                      (prev->object.vm_object == entry->object.vm_object) &&
1519                      (!prev->object.vm_object ||
1520                         (prev->offset + prevsize == entry->offset)) &&
1521                      (prev->eflags == entry->eflags) &&
1522                      (prev->protection == entry->protection) &&
1523                      (prev->max_protection == entry->max_protection) &&
1524                      (prev->inheritance == entry->inheritance) &&
1525                      (prev->wired_count == entry->wired_count) &&
1526                      (prev->cred == entry->cred)) {
1527                         vm_map_entry_unlink(map, prev);
1528                         entry->start = prev->start;
1529                         entry->offset = prev->offset;
1530                         if (entry->prev != &map->header)
1531                                 vm_map_entry_resize_free(map, entry->prev);
1532
1533                         /*
1534                          * If the backing object is a vnode object,
1535                          * vm_object_deallocate() calls vrele().
1536                          * However, vrele() does not lock the vnode
1537                          * because the vnode has additional
1538                          * references.  Thus, the map lock can be kept
1539                          * without causing a lock-order reversal with
1540                          * the vnode lock.
1541                          *
1542                          * Since we count the number of virtual page
1543                          * mappings in object->un_pager.vnp.writemappings,
1544                          * the writemappings value should not be adjusted
1545                          * when the entry is disposed of.
1546                          */
1547                         if (prev->object.vm_object)
1548                                 vm_object_deallocate(prev->object.vm_object);
1549                         if (prev->cred != NULL)
1550                                 crfree(prev->cred);
1551                         vm_map_entry_dispose(map, prev);
1552                 }
1553         }
1554
1555         next = entry->next;
1556         if (next != &map->header) {
1557                 esize = entry->end - entry->start;
1558                 if ((entry->end == next->start) &&
1559                     (next->object.vm_object == entry->object.vm_object) &&
1560                      (!entry->object.vm_object ||
1561                         (entry->offset + esize == next->offset)) &&
1562                     (next->eflags == entry->eflags) &&
1563                     (next->protection == entry->protection) &&
1564                     (next->max_protection == entry->max_protection) &&
1565                     (next->inheritance == entry->inheritance) &&
1566                     (next->wired_count == entry->wired_count) &&
1567                     (next->cred == entry->cred)) {
1568                         vm_map_entry_unlink(map, next);
1569                         entry->end = next->end;
1570                         vm_map_entry_resize_free(map, entry);
1571
1572                         /*
1573                          * See comment above.
1574                          */
1575                         if (next->object.vm_object)
1576                                 vm_object_deallocate(next->object.vm_object);
1577                         if (next->cred != NULL)
1578                                 crfree(next->cred);
1579                         vm_map_entry_dispose(map, next);
1580                 }
1581         }
1582 }
1583 /*
1584  *      vm_map_clip_start:      [ internal use only ]
1585  *
1586  *      Asserts that the given entry begins at or after
1587  *      the specified address; if necessary,
1588  *      it splits the entry into two.
1589  */
1590 #define vm_map_clip_start(map, entry, startaddr) \
1591 { \
1592         if ((startaddr) > (entry->start)) \
1593                 _vm_map_clip_start((map), (entry), (startaddr)); \
1594 }
1595
1596 /*
1597  *      This routine is called only when it is known that
1598  *      the entry must be split.
1599  */
1600 static void
1601 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1602 {
1603         vm_map_entry_t new_entry;
1604
1605         VM_MAP_ASSERT_LOCKED(map);
1606
1607         /*
1608          * Split off the front portion -- note that we must insert the new
1609          * entry BEFORE this one, so that this entry has the specified
1610          * starting address.
1611          */
1612         vm_map_simplify_entry(map, entry);
1613
1614         /*
1615          * If there is no object backing this entry, we might as well create
1616          * one now.  If we defer it, an object can get created after the map
1617          * is clipped, and individual objects will be created for the split-up
1618          * map.  This is a bit of a hack, but is also about the best place to
1619          * put this improvement.
1620          */
1621         if (entry->object.vm_object == NULL && !map->system_map) {
1622                 vm_object_t object;
1623                 object = vm_object_allocate(OBJT_DEFAULT,
1624                                 atop(entry->end - entry->start));
1625                 entry->object.vm_object = object;
1626                 entry->offset = 0;
1627                 if (entry->cred != NULL) {
1628                         object->cred = entry->cred;
1629                         object->charge = entry->end - entry->start;
1630                         entry->cred = NULL;
1631                 }
1632         } else if (entry->object.vm_object != NULL &&
1633                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1634                    entry->cred != NULL) {
1635                 VM_OBJECT_WLOCK(entry->object.vm_object);
1636                 KASSERT(entry->object.vm_object->cred == NULL,
1637                     ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
1638                 entry->object.vm_object->cred = entry->cred;
1639                 entry->object.vm_object->charge = entry->end - entry->start;
1640                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
1641                 entry->cred = NULL;
1642         }
1643
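        /*
         * new_entry takes the front portion [entry->start, start); the
         * original entry is advanced to cover [start, entry->end).
         */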
1644         new_entry = vm_map_entry_create(map);
1645         *new_entry = *entry;
1646
1647         new_entry->end = start;
1648         entry->offset += (start - entry->start);
1649         entry->start = start;
1650         if (new_entry->cred != NULL)
1651                 crhold(entry->cred);
1652
1653         vm_map_entry_link(map, entry->prev, new_entry);
1654
1655         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1656                 vm_object_reference(new_entry->object.vm_object);
1657                 /*
1658                  * The object->un_pager.vnp.writemappings for the
1659                  * object of a MAP_ENTRY_VN_WRITECNT entry is kept
1660                  * as is here.  The virtual pages are redistributed
1661                  * among the clipped entries, so the sum is left
1662                  * unchanged.
1663                  */
1664         }
1665 }
1666
1667 /*
1668  *      vm_map_clip_end:        [ internal use only ]
1669  *
1670  *      Asserts that the given entry ends at or before
1671  *      the specified address; if necessary,
1672  *      it splits the entry into two.
1673  */
1674 #define vm_map_clip_end(map, entry, endaddr) \
1675 { \
1676         if ((endaddr) < (entry->end)) \
1677                 _vm_map_clip_end((map), (entry), (endaddr)); \
1678 }
1679
1680 /*
1681  *      This routine is called only when it is known that
1682  *      the entry must be split.
1683  */
1684 static void
1685 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1686 {
1687         vm_map_entry_t new_entry;
1688
1689         VM_MAP_ASSERT_LOCKED(map);
1690
1691         /*
1692          * If there is no object backing this entry, we might as well create
1693          * one now.  If we defer it, an object can get created after the map
1694          * is clipped, and individual objects will be created for the split-up
1695          * map.  This is a bit of a hack, but is also about the best place to
1696          * put this improvement.
1697          */
1698         if (entry->object.vm_object == NULL && !map->system_map) {
1699                 vm_object_t object;
1700                 object = vm_object_allocate(OBJT_DEFAULT,
1701                                 atop(entry->end - entry->start));
1702                 entry->object.vm_object = object;
1703                 entry->offset = 0;
1704                 if (entry->cred != NULL) {
1705                         object->cred = entry->cred;
1706                         object->charge = entry->end - entry->start;
1707                         entry->cred = NULL;
1708                 }
1709         } else if (entry->object.vm_object != NULL &&
1710                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1711                    entry->cred != NULL) {
1712                 VM_OBJECT_WLOCK(entry->object.vm_object);
1713                 KASSERT(entry->object.vm_object->cred == NULL,
1714                     ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
1715                 entry->object.vm_object->cred = entry->cred;
1716                 entry->object.vm_object->charge = entry->end - entry->start;
1717                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
1718                 entry->cred = NULL;
1719         }
1720
1721         /*
1722          * Create a new entry and insert it AFTER the specified entry.
1723          */
1724         new_entry = vm_map_entry_create(map);
1725         *new_entry = *entry;
1726
1727         new_entry->start = entry->end = end;
1728         new_entry->offset += (end - entry->start);
1729         if (new_entry->cred != NULL)
1730                 crhold(entry->cred);
1731
1732         vm_map_entry_link(map, entry, new_entry);
1733
1734         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1735                 vm_object_reference(new_entry->object.vm_object);
1736         }
1737 }
1738
1739 /*
1740  *      vm_map_submap:          [ kernel use only ]
1741  *
1742  *      Mark the given range as handled by a subordinate map.
1743  *
1744  *      This range must have been created with vm_map_find,
1745  *      and no other operations may have been performed on this
1746  *      range prior to calling vm_map_submap.
1747  *
1748  *      Only a limited number of operations can be performed
1749  *      within this range after calling vm_map_submap:
1750  *              vm_fault
1751  *      [Don't try vm_map_copy!]
1752  *
1753  *      To remove a submapping, one must first remove the
1754  *      range from the superior map, and then destroy the
1755  *      submap (if desired).  [Better yet, don't try it.]
1756  */
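/*
 *      A sketch of the intended call sequence (illustrative only; parent,
 *      submap, start, and size are placeholders):
 *
 *              (void)vm_map_find(parent, NULL, 0, &start, size, 0,
 *                  VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
 *              (void)vm_map_submap(parent, start, start + size, submap);
 */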
1757 int
1758 vm_map_submap(
1759         vm_map_t map,
1760         vm_offset_t start,
1761         vm_offset_t end,
1762         vm_map_t submap)
1763 {
1764         vm_map_entry_t entry;
1765         int result = KERN_INVALID_ARGUMENT;
1766
1767         vm_map_lock(map);
1768
1769         VM_MAP_RANGE_CHECK(map, start, end);
1770
1771         if (vm_map_lookup_entry(map, start, &entry)) {
1772                 vm_map_clip_start(map, entry, start);
1773         } else
1774                 entry = entry->next;
1775
1776         vm_map_clip_end(map, entry, end);
1777
1778         if ((entry->start == start) && (entry->end == end) &&
1779             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1780             (entry->object.vm_object == NULL)) {
1781                 entry->object.sub_map = submap;
1782                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1783                 result = KERN_SUCCESS;
1784         }
1785         vm_map_unlock(map);
1786
1787         return (result);
1788 }
1789
1790 /*
1791  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
1792  */
1793 #define MAX_INIT_PT     96
1794
1795 /*
1796  *      vm_map_pmap_enter:
1797  *
1798  *      Preload the specified map's pmap with mappings to the specified
1799  *      object's memory-resident pages.  No further physical pages are
1800  *      allocated, and no further virtual pages are retrieved from secondary
1801  *      storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
1802  *      limited number of page mappings are created at the low-end of the
1803  *      specified address range.  (For this purpose, a superpage mapping
1804  *      counts as one page mapping.)  Otherwise, all resident pages within
1805  *      the specified address range are mapped.  Because these mappings are
1806  *      being created speculatively, cached pages are not reactivated and
1807  *      mapped.
1808  */
1809 void
1810 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1811     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1812 {
1813         vm_offset_t start;
1814         vm_page_t p, p_start;
1815         vm_pindex_t mask, psize, threshold, tmpidx;
1816
1817         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1818                 return;
1819         VM_OBJECT_RLOCK(object);
1820         if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1821                 VM_OBJECT_RUNLOCK(object);
1822                 VM_OBJECT_WLOCK(object);
1823                 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1824                         pmap_object_init_pt(map->pmap, addr, object, pindex,
1825                             size);
1826                         VM_OBJECT_WUNLOCK(object);
1827                         return;
1828                 }
1829                 VM_OBJECT_LOCK_DOWNGRADE(object);
1830         }
1831
1832         psize = atop(size);
1833         if (psize + pindex > object->size) {
1834                 if (object->size < pindex) {
1835                         VM_OBJECT_RUNLOCK(object);
1836                         return;
1837                 }
1838                 psize = object->size - pindex;
1839         }
1840
1841         start = 0;
1842         p_start = NULL;
1843         threshold = MAX_INIT_PT;
1844
1845         p = vm_page_find_least(object, pindex);
1846         /*
1847          * Assert: the variable p is either (1) the page with the
1848          * least pindex greater than or equal to the parameter pindex
1849          * or (2) NULL.
1850          */
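        /*
         * Walk the resident pages, accumulating runs of consecutive,
         * fully valid pages and entering each run into the pmap with a
         * single pmap_enter_object() call.
         */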
1851         for (;
1852              p != NULL && (tmpidx = p->pindex - pindex) < psize;
1853              p = TAILQ_NEXT(p, listq)) {
1854                 /*
1855                  * Don't allow madvise to blow away our really free
1856                  * pages by allocating pv entries.
1857                  */
1858                 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
1859                     vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
1860                     ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
1861                     tmpidx >= threshold)) {
1862                         psize = tmpidx;
1863                         break;
1864                 }
1865                 if (p->valid == VM_PAGE_BITS_ALL) {
1866                         if (p_start == NULL) {
1867                                 start = addr + ptoa(tmpidx);
1868                                 p_start = p;
1869                         }
1870                         /* Jump ahead if a superpage mapping is possible. */
1871                         if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
1872                             (pagesizes[p->psind] - 1)) == 0) {
1873                                 mask = atop(pagesizes[p->psind]) - 1;
1874                                 if (tmpidx + mask < psize &&
1875                                     vm_page_ps_is_valid(p)) {
1876                                         p += mask;
1877                                         threshold += mask;
1878                                 }
1879                         }
1880                 } else if (p_start != NULL) {
1881                         pmap_enter_object(map->pmap, start, addr +
1882                             ptoa(tmpidx), p_start, prot);
1883                         p_start = NULL;
1884                 }
1885         }
1886         if (p_start != NULL)
1887                 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
1888                     p_start, prot);
1889         VM_OBJECT_RUNLOCK(object);
1890 }
1891
1892 /*
1893  *      vm_map_protect:
1894  *
1895  *      Sets the protection of the specified address
1896  *      region in the target map.  If "set_max" is
1897  *      specified, the maximum protection is to be set;
1898  *      otherwise, only the current protection is affected.
1899  */
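/*
 *      For example, the mprotect(2) path reduces to a call of this form
 *      (sketch; p, addr, size, and prot are placeholders and argument
 *      validation is elided):
 *
 *              rv = vm_map_protect(&p->p_vmspace->vm_map, addr,
 *                  addr + size, prot, FALSE);
 */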
1900 int
1901 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1902                vm_prot_t new_prot, boolean_t set_max)
1903 {
1904         vm_map_entry_t current, entry;
1905         vm_object_t obj;
1906         struct ucred *cred;
1907         vm_prot_t old_prot;
1908
1909         if (start == end)
1910                 return (KERN_SUCCESS);
1911
1912         vm_map_lock(map);
1913
1914         VM_MAP_RANGE_CHECK(map, start, end);
1915
1916         if (vm_map_lookup_entry(map, start, &entry)) {
1917                 vm_map_clip_start(map, entry, start);
1918         } else {
1919                 entry = entry->next;
1920         }
1921
1922         /*
1923          * Make a first pass to check for protection violations.
1924          */
1925         current = entry;
1926         while ((current != &map->header) && (current->start < end)) {
1927                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1928                         vm_map_unlock(map);
1929                         return (KERN_INVALID_ARGUMENT);
1930                 }
1931                 if ((new_prot & current->max_protection) != new_prot) {
1932                         vm_map_unlock(map);
1933                         return (KERN_PROTECTION_FAILURE);
1934                 }
1935                 current = current->next;
1936         }
1937
1939         /*
1940          * Do an accounting pass for private read-only mappings that
1941          * will now do COW due to the newly allowed write access
1942          * (e.g., a debugger setting a breakpoint in a text segment).
1943          */
1944         for (current = entry; (current != &map->header) &&
1945              (current->start < end); current = current->next) {
1946
1947                 vm_map_clip_end(map, current, end);
1948
1949                 if (set_max ||
1950                     ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
1951                     ENTRY_CHARGED(current)) {
1952                         continue;
1953                 }
1954
1955                 cred = curthread->td_ucred;
1956                 obj = current->object.vm_object;
1957
1958                 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
1959                         if (!swap_reserve(current->end - current->start)) {
1960                                 vm_map_unlock(map);
1961                                 return (KERN_RESOURCE_SHORTAGE);
1962                         }
1963                         crhold(cred);
1964                         current->cred = cred;
1965                         continue;
1966                 }
1967
1968                 VM_OBJECT_WLOCK(obj);
1969                 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
1970                         VM_OBJECT_WUNLOCK(obj);
1971                         continue;
1972                 }
1973
1974                 /*
1975                  * Charge for the whole object allocation now, since
1976                  * we cannot distinguish between non-charged and
1977                  * charged clipped mapping of the same object later.
1978                  */
1979                 KASSERT(obj->charge == 0,
1980                     ("vm_map_protect: object %p overcharged (entry %p)",
1981                     obj, current));
1982                 if (!swap_reserve(ptoa(obj->size))) {
1983                         VM_OBJECT_WUNLOCK(obj);
1984                         vm_map_unlock(map);
1985                         return (KERN_RESOURCE_SHORTAGE);
1986                 }
1987
1988                 crhold(cred);
1989                 obj->cred = cred;
1990                 obj->charge = ptoa(obj->size);
1991                 VM_OBJECT_WUNLOCK(obj);
1992         }
1993
1994         /*
1995          * Go back and fix up protections. [Note that clipping is not
1996          * necessary the second time.]
1997          */
1998         current = entry;
1999         while ((current != &map->header) && (current->start < end)) {
2000                 old_prot = current->protection;
2001
2002                 if (set_max)
2003                         current->protection =
2004                             (current->max_protection = new_prot) &
2005                             old_prot;
2006                 else
2007                         current->protection = new_prot;
2008
2009                 /*
2010                  * For user wired map entries, the normal lazy evaluation of
2011                  * write access upgrades through soft page faults is
2012                  * undesirable.  Instead, immediately copy any pages that are
2013                  * copy-on-write and enable write access in the physical map.
2014                  */
2015                 if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2016                     (current->protection & VM_PROT_WRITE) != 0 &&
2017                     (old_prot & VM_PROT_WRITE) == 0)
2018                         vm_fault_copy_entry(map, map, current, current, NULL);
2019
2020                 /*
2021                  * When restricting access, update the physical map.  Worry
2022                  * about copy-on-write here.
2023                  */
2024                 if ((old_prot & ~current->protection) != 0) {
2025 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2026                                                         VM_PROT_ALL)
2027                         pmap_protect(map->pmap, current->start,
2028                             current->end,
2029                             current->protection & MASK(current));
2030 #undef  MASK
2031                 }
2032                 vm_map_simplify_entry(map, current);
2033                 current = current->next;
2034         }
2035         vm_map_unlock(map);
2036         return (KERN_SUCCESS);
2037 }
2038
2039 /*
2040  *      vm_map_madvise:
2041  *
2042  *      This routine traverses a process's map, handling the madvise
2043  *      system call.  Advisories are classified as either those affecting
2044  *      the vm_map_entry structure or those affecting the underlying
2045  *      objects.
2046  */
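/*
 *      madvise(2) ultimately amounts to a call of this form (sketch;
 *      p, start, end, and behav are placeholders):
 *
 *              error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
 *                  behav);
 */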
2047 int
2048 vm_map_madvise(
2049         vm_map_t map,
2050         vm_offset_t start,
2051         vm_offset_t end,
2052         int behav)
2053 {
2054         vm_map_entry_t current, entry;
2055         int modify_map = 0;
2056
2057         /*
2058          * Some madvise calls directly modify the vm_map_entry, in which case
2059          * we need to use an exclusive lock on the map and we need to perform
2060          * various clipping operations.  Otherwise we only need a read-lock
2061          * on the map.
2062          */
2063         switch(behav) {
2064         case MADV_NORMAL:
2065         case MADV_SEQUENTIAL:
2066         case MADV_RANDOM:
2067         case MADV_NOSYNC:
2068         case MADV_AUTOSYNC:
2069         case MADV_NOCORE:
2070         case MADV_CORE:
2071                 if (start == end)
2072                         return (KERN_SUCCESS);
2073                 modify_map = 1;
2074                 vm_map_lock(map);
2075                 break;
2076         case MADV_WILLNEED:
2077         case MADV_DONTNEED:
2078         case MADV_FREE:
2079                 if (start == end)
2080                         return (KERN_SUCCESS);
2081                 vm_map_lock_read(map);
2082                 break;
2083         default:
2084                 return (KERN_INVALID_ARGUMENT);
2085         }
2086
2087         /*
2088          * Locate starting entry and clip if necessary.
2089          */
2090         VM_MAP_RANGE_CHECK(map, start, end);
2091
2092         if (vm_map_lookup_entry(map, start, &entry)) {
2093                 if (modify_map)
2094                         vm_map_clip_start(map, entry, start);
2095         } else {
2096                 entry = entry->next;
2097         }
2098
2099         if (modify_map) {
2100                 /*
2101                  * madvise behaviors that are implemented in the vm_map_entry.
2102                  *
2103                  * We clip the vm_map_entry so that behavioral changes are
2104                  * limited to the specified address range.
2105                  */
2106                 for (current = entry;
2107                      (current != &map->header) && (current->start < end);
2108                      current = current->next
2109                 ) {
2110                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2111                                 continue;
2112
2113                         vm_map_clip_end(map, current, end);
2114
2115                         switch (behav) {
2116                         case MADV_NORMAL:
2117                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2118                                 break;
2119                         case MADV_SEQUENTIAL:
2120                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2121                                 break;
2122                         case MADV_RANDOM:
2123                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2124                                 break;
2125                         case MADV_NOSYNC:
2126                                 current->eflags |= MAP_ENTRY_NOSYNC;
2127                                 break;
2128                         case MADV_AUTOSYNC:
2129                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2130                                 break;
2131                         case MADV_NOCORE:
2132                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2133                                 break;
2134                         case MADV_CORE:
2135                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2136                                 break;
2137                         default:
2138                                 break;
2139                         }
2140                         vm_map_simplify_entry(map, current);
2141                 }
2142                 vm_map_unlock(map);
2143         } else {
2144                 vm_pindex_t pstart, pend;
2145
2146                 /*
2147                  * madvise behaviors that are implemented in the underlying
2148                  * vm_object.
2149                  *
2150                  * Since we don't clip the vm_map_entry, we have to clip
2151                  * the vm_object pindex and count.
2152                  */
2153                 for (current = entry;
2154                      (current != &map->header) && (current->start < end);
2155                      current = current->next
2156                 ) {
2157                         vm_offset_t useEnd, useStart;
2158
2159                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2160                                 continue;
2161
2162                         pstart = OFF_TO_IDX(current->offset);
2163                         pend = pstart + atop(current->end - current->start);
2164                         useStart = current->start;
2165                         useEnd = current->end;
2166
2167                         if (current->start < start) {
2168                                 pstart += atop(start - current->start);
2169                                 useStart = start;
2170                         }
2171                         if (current->end > end) {
2172                                 pend -= atop(current->end - end);
2173                                 useEnd = end;
2174                         }
2175
2176                         if (pstart >= pend)
2177                                 continue;
2178
2179                         /*
2180                          * Perform the pmap_advise() before clearing
2181                          * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2182                          * concurrent pmap operation, such as pmap_remove(),
2183                          * could clear a reference in the pmap and set
2184                          * PGA_REFERENCED on the page before the pmap_advise()
2185                          * had completed.  Consequently, the page would appear
2186                          * referenced based upon an old reference that
2187                          * occurred before this pmap_advise() ran.
2188                          */
2189                         if (behav == MADV_DONTNEED || behav == MADV_FREE)
2190                                 pmap_advise(map->pmap, useStart, useEnd,
2191                                     behav);
2192
2193                         vm_object_madvise(current->object.vm_object, pstart,
2194                             pend, behav);
2195                         if (behav == MADV_WILLNEED) {
2196                                 vm_map_pmap_enter(map,
2197                                     useStart,
2198                                     current->protection,
2199                                     current->object.vm_object,
2200                                     pstart,
2201                                     ptoa(pend - pstart),
2202                                     MAP_PREFAULT_MADVISE
2203                                 );
2204                         }
2205                 }
2206                 vm_map_unlock_read(map);
2207         }
2208         return (0);
2209 }
2210
2211
2212 /*
2213  *      vm_map_inherit:
2214  *
2215  *      Sets the inheritance of the specified address
2216  *      range in the target map.  Inheritance
2217  *      affects how the map will be shared with
2218  *      child maps at the time of vmspace_fork.
2219  */
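/*
 *      VM_INHERIT_SHARE causes the child to share the parent's mapping,
 *      VM_INHERIT_COPY gives the child a copy-on-write copy, and
 *      VM_INHERIT_NONE leaves the range unmapped in the child.
 */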
2220 int
2221 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2222                vm_inherit_t new_inheritance)
2223 {
2224         vm_map_entry_t entry;
2225         vm_map_entry_t temp_entry;
2226
2227         switch (new_inheritance) {
2228         case VM_INHERIT_NONE:
2229         case VM_INHERIT_COPY:
2230         case VM_INHERIT_SHARE:
2231                 break;
2232         default:
2233                 return (KERN_INVALID_ARGUMENT);
2234         }
2235         if (start == end)
2236                 return (KERN_SUCCESS);
2237         vm_map_lock(map);
2238         VM_MAP_RANGE_CHECK(map, start, end);
2239         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2240                 entry = temp_entry;
2241                 vm_map_clip_start(map, entry, start);
2242         } else
2243                 entry = temp_entry->next;
2244         while ((entry != &map->header) && (entry->start < end)) {
2245                 vm_map_clip_end(map, entry, end);
2246                 entry->inheritance = new_inheritance;
2247                 vm_map_simplify_entry(map, entry);
2248                 entry = entry->next;
2249         }
2250         vm_map_unlock(map);
2251         return (KERN_SUCCESS);
2252 }
2253
2254 /*
2255  *      vm_map_unwire:
2256  *
2257  *      Implements both kernel and user unwiring.
2258  */
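/*
 *      The loop below marks each entry in the range MAP_ENTRY_IN_TRANSITION
 *      (waiting out any other thread that already has it in transition),
 *      and the cleanup pass after "done:" drops the wiring, clears the
 *      in-transition marks, and wakes up any waiters.
 */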
2259 int
2260 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2261     int flags)
2262 {
2263         vm_map_entry_t entry, first_entry, tmp_entry;
2264         vm_offset_t saved_start;
2265         unsigned int last_timestamp;
2266         int rv;
2267         boolean_t need_wakeup, result, user_unwire;
2268
2269         if (start == end)
2270                 return (KERN_SUCCESS);
2271         user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2272         vm_map_lock(map);
2273         VM_MAP_RANGE_CHECK(map, start, end);
2274         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2275                 if (flags & VM_MAP_WIRE_HOLESOK)
2276                         first_entry = first_entry->next;
2277                 else {
2278                         vm_map_unlock(map);
2279                         return (KERN_INVALID_ADDRESS);
2280                 }
2281         }
2282         last_timestamp = map->timestamp;
2283         entry = first_entry;
2284         while (entry != &map->header && entry->start < end) {
2285                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2286                         /*
2287                          * We have not yet clipped the entry.
2288                          */
2289                         saved_start = (start >= entry->start) ? start :
2290                             entry->start;
2291                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2292                         if (vm_map_unlock_and_wait(map, 0)) {
2293                                 /*
2294                                  * Allow interruption of user unwiring?
2295                                  */
2296                         }
2297                         vm_map_lock(map);
2298                         if (last_timestamp+1 != map->timestamp) {
2299                                 /*
2300                                  * Look again for the entry because the map was
2301                                  * modified while it was unlocked.
2302                                  * Specifically, the entry may have been
2303                                  * clipped, merged, or deleted.
2304                                  */
2305                                 if (!vm_map_lookup_entry(map, saved_start,
2306                                     &tmp_entry)) {
2307                                         if (flags & VM_MAP_WIRE_HOLESOK)
2308                                                 tmp_entry = tmp_entry->next;
2309                                         else {
2310                                                 if (saved_start == start) {
2311                                                         /*
2312                                                          * First_entry has been deleted.
2313                                                          */
2314                                                         vm_map_unlock(map);
2315                                                         return (KERN_INVALID_ADDRESS);
2316                                                 }
2317                                                 end = saved_start;
2318                                                 rv = KERN_INVALID_ADDRESS;
2319                                                 goto done;
2320                                         }
2321                                 }
2322                                 if (entry == first_entry)
2323                                         first_entry = tmp_entry;
2324                                 else
2325                                         first_entry = NULL;
2326                                 entry = tmp_entry;
2327                         }
2328                         last_timestamp = map->timestamp;
2329                         continue;
2330                 }
2331                 vm_map_clip_start(map, entry, start);
2332                 vm_map_clip_end(map, entry, end);
2333                 /*
2334                  * Mark the entry in case the map lock is released.  (See
2335                  * above.)
2336                  */
2337                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2338                     entry->wiring_thread == NULL,
2339                     ("owned map entry %p", entry));
2340                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2341                 entry->wiring_thread = curthread;
2342                 /*
2343                  * Check the map for holes in the specified region.
2344                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2345                  */
2346                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2347                     (entry->end < end && (entry->next == &map->header ||
2348                     entry->next->start > entry->end))) {
2349                         end = entry->end;
2350                         rv = KERN_INVALID_ADDRESS;
2351                         goto done;
2352                 }
2353                 /*
2354                  * If system unwiring, require that the entry is system wired.
2355                  */
2356                 if (!user_unwire &&
2357                     vm_map_entry_system_wired_count(entry) == 0) {
2358                         end = entry->end;
2359                         rv = KERN_INVALID_ARGUMENT;
2360                         goto done;
2361                 }
2362                 entry = entry->next;
2363         }
2364         rv = KERN_SUCCESS;
2365 done:
2366         need_wakeup = FALSE;
2367         if (first_entry == NULL) {
2368                 result = vm_map_lookup_entry(map, start, &first_entry);
2369                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2370                         first_entry = first_entry->next;
2371                 else
2372                         KASSERT(result, ("vm_map_unwire: lookup failed"));
2373         }
2374         for (entry = first_entry; entry != &map->header && entry->start < end;
2375             entry = entry->next) {
2376                 /*
2377                  * If VM_MAP_WIRE_HOLESOK was specified, an empty
2378                  * space in the unwired region could have been mapped
2379                  * while the map lock was dropped for draining
2380                  * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2381                  * could be simultaneously wiring this new mapping
2382                  * entry.  Detect these cases and skip any entries
2383                  * marked as in transition by us.
2384                  */
2385                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2386                     entry->wiring_thread != curthread) {
2387                         KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2388                             ("vm_map_unwire: !HOLESOK and new/changed entry"));
2389                         continue;
2390                 }
2391
2392                 if (rv == KERN_SUCCESS && (!user_unwire ||
2393                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2394                         if (user_unwire)
2395                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2396                         entry->wired_count--;
2397                         if (entry->wired_count == 0) {
2398                                 /*
2399                                  * Retain the map lock.
2400                                  */
2401                                 vm_fault_unwire(map, entry->start, entry->end,
2402                                     entry->object.vm_object != NULL &&
2403                                     (entry->object.vm_object->flags &
2404                                     OBJ_FICTITIOUS) != 0);
2405                         }
2406                 }
2407                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2408                     ("vm_map_unwire: in-transition flag missing %p", entry));
2409                 KASSERT(entry->wiring_thread == curthread,
2410                     ("vm_map_unwire: alien wire %p", entry));
2411                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2412                 entry->wiring_thread = NULL;
2413                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2414                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2415                         need_wakeup = TRUE;
2416                 }
2417                 vm_map_simplify_entry(map, entry);
2418         }
2419         vm_map_unlock(map);
2420         if (need_wakeup)
2421                 vm_map_wakeup(map);
2422         return (rv);
2423 }
2424
2425 /*
2426  *      vm_map_wire:
2427  *
2428  *      Implements both kernel and user wiring.
2429  */
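/*
 *      As in vm_map_unwire(), entries are marked MAP_ENTRY_IN_TRANSITION
 *      while the map lock may be dropped.  A transient wired_count of -1
 *      marks an entry whose wiring attempt failed so that the cleanup
 *      pass after "done:" resets it to 0 instead of unwiring it.
 */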
2430 int
2431 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2432     int flags)
2433 {
2434         vm_map_entry_t entry, first_entry, tmp_entry;
2435         vm_offset_t saved_end, saved_start;
2436         unsigned int last_timestamp;
2437         int rv;
2438         boolean_t fictitious, need_wakeup, result, user_wire;
2439         vm_prot_t prot;
2440
2441         if (start == end)
2442                 return (KERN_SUCCESS);
2443         prot = 0;
2444         if (flags & VM_MAP_WIRE_WRITE)
2445                 prot |= VM_PROT_WRITE;
2446         user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2447         vm_map_lock(map);
2448         VM_MAP_RANGE_CHECK(map, start, end);
2449         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2450                 if (flags & VM_MAP_WIRE_HOLESOK)
2451                         first_entry = first_entry->next;
2452                 else {
2453                         vm_map_unlock(map);
2454                         return (KERN_INVALID_ADDRESS);
2455                 }
2456         }
2457         last_timestamp = map->timestamp;
2458         entry = first_entry;
2459         while (entry != &map->header && entry->start < end) {
2460                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2461                         /*
2462                          * We have not yet clipped the entry.
2463                          */
2464                         saved_start = (start >= entry->start) ? start :
2465                             entry->start;
2466                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2467                         if (vm_map_unlock_and_wait(map, 0)) {
2468                                 /*
2469                                  * Allow interruption of user wiring?
2470                                  */
2471                         }
2472                         vm_map_lock(map);
2473                         if (last_timestamp + 1 != map->timestamp) {
2474                                 /*
2475                                  * Look again for the entry because the map was
2476                                  * modified while it was unlocked.
2477                                  * Specifically, the entry may have been
2478                                  * clipped, merged, or deleted.
2479                                  */
2480                                 if (!vm_map_lookup_entry(map, saved_start,
2481                                     &tmp_entry)) {
2482                                         if (flags & VM_MAP_WIRE_HOLESOK)
2483                                                 tmp_entry = tmp_entry->next;
2484                                         else {
2485                                                 if (saved_start == start) {
2486                                                         /*
2487                                                          * first_entry has been deleted.
2488                                                          */
2489                                                         vm_map_unlock(map);
2490                                                         return (KERN_INVALID_ADDRESS);
2491                                                 }
2492                                                 end = saved_start;
2493                                                 rv = KERN_INVALID_ADDRESS;
2494                                                 goto done;
2495                                         }
2496                                 }
2497                                 if (entry == first_entry)
2498                                         first_entry = tmp_entry;
2499                                 else
2500                                         first_entry = NULL;
2501                                 entry = tmp_entry;
2502                         }
2503                         last_timestamp = map->timestamp;
2504                         continue;
2505                 }
2506                 vm_map_clip_start(map, entry, start);
2507                 vm_map_clip_end(map, entry, end);
2508                 /*
2509                  * Mark the entry in case the map lock is released.  (See
2510                  * above.)
2511                  */
2512                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2513                     entry->wiring_thread == NULL,
2514                     ("owned map entry %p", entry));
2515                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2516                 entry->wiring_thread = curthread;
2517                 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
2518                     || (entry->protection & prot) != prot) {
2519                         entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2520                         if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2521                                 end = entry->end;
2522                                 rv = KERN_INVALID_ADDRESS;
2523                                 goto done;
2524                         }
2525                         goto next_entry;
2526                 }
2527                 if (entry->wired_count == 0) {
2528                         entry->wired_count++;
2529                         saved_start = entry->start;
2530                         saved_end = entry->end;
2531                         fictitious = entry->object.vm_object != NULL &&
2532                             (entry->object.vm_object->flags &
2533                             OBJ_FICTITIOUS) != 0;
2534                         /*
2535                          * Release the map lock, relying on the in-transition
2536                          * mark.  Mark the map busy for fork.
2537                          */
2538                         vm_map_busy(map);
2539                         vm_map_unlock(map);
2540                         rv = vm_fault_wire(map, saved_start, saved_end,
2541                             fictitious);
2542                         vm_map_lock(map);
2543                         vm_map_unbusy(map);
2544                         if (last_timestamp + 1 != map->timestamp) {
2545                                 /*
2546                                  * Look again for the entry because the map was
2547                                  * modified while it was unlocked.  The entry
2548                                  * may have been clipped, but NOT merged or
2549                                  * deleted.
2550                                  */
2551                                 result = vm_map_lookup_entry(map, saved_start,
2552                                     &tmp_entry);
2553                                 KASSERT(result, ("vm_map_wire: lookup failed"));
2554                                 if (entry == first_entry)
2555                                         first_entry = tmp_entry;
2556                                 else
2557                                         first_entry = NULL;
2558                                 entry = tmp_entry;
2559                                 while (entry->end < saved_end) {
2560                                         if (rv != KERN_SUCCESS) {
2561                                                 KASSERT(entry->wired_count == 1,
2562                                                     ("vm_map_wire: bad count"));
2563                                                 entry->wired_count = -1;
2564                                         }
2565                                         entry = entry->next;
2566                                 }
2567                         }
2568                         last_timestamp = map->timestamp;
2569                         if (rv != KERN_SUCCESS) {
2570                                 KASSERT(entry->wired_count == 1,
2571                                     ("vm_map_wire: bad count"));
2572                                 /*
2573                                  * Assign an out-of-range value to represent
2574                                  * the failure to wire this entry.
2575                                  */
2576                                 entry->wired_count = -1;
2577                                 end = entry->end;
2578                                 goto done;
2579                         }
2580                 } else if (!user_wire ||
2581                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2582                         entry->wired_count++;
2583                 }
2584                 /*
2585                  * Check the map for holes in the specified region.
2586                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2587                  */
2588         next_entry:
2589                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2590                     (entry->end < end && (entry->next == &map->header ||
2591                     entry->next->start > entry->end))) {
2592                         end = entry->end;
2593                         rv = KERN_INVALID_ADDRESS;
2594                         goto done;
2595                 }
2596                 entry = entry->next;
2597         }
2598         rv = KERN_SUCCESS;
2599 done:
2600         need_wakeup = FALSE;
2601         if (first_entry == NULL) {
2602                 result = vm_map_lookup_entry(map, start, &first_entry);
2603                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2604                         first_entry = first_entry->next;
2605                 else
2606                         KASSERT(result, ("vm_map_wire: lookup failed"));
2607         }
2608         for (entry = first_entry; entry != &map->header && entry->start < end;
2609             entry = entry->next) {
2610                 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
2611                         goto next_entry_done;
2612
2613                 /*
2614                  * If VM_MAP_WIRE_HOLESOK was specified, an empty
2615                  * space in the unwired region could have been mapped
2616                  * while the map lock was dropped for faulting in the
2617                  * pages or draining MAP_ENTRY_IN_TRANSITION.
2618                  * Moreover, another thread could be simultaneously
2619                  * wiring this new mapping entry.  Detect these cases
2620                  * and skip any entries marked as in transition by us.
2621                  */
2622                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2623                     entry->wiring_thread != curthread) {
2624                         KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2625                             ("vm_map_wire: !HOLESOK and new/changed entry"));
2626                         continue;
2627                 }
2628
2629                 if (rv == KERN_SUCCESS) {
2630                         if (user_wire)
2631                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2632                 } else if (entry->wired_count == -1) {
2633                         /*
2634                          * Wiring failed on this entry.  Thus, unwiring is
2635                          * unnecessary.
2636                          */
2637                         entry->wired_count = 0;
2638                 } else {
2639                         if (!user_wire ||
2640                             (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
2641                                 entry->wired_count--;
2642                         if (entry->wired_count == 0) {
2643                                 /*
2644                                  * Retain the map lock.
2645                                  */
2646                                 vm_fault_unwire(map, entry->start, entry->end,
2647                                     entry->object.vm_object != NULL &&
2648                                     (entry->object.vm_object->flags &
2649                                     OBJ_FICTITIOUS) != 0);
2650                         }
2651                 }
2652         next_entry_done:
2653                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2654                     ("vm_map_wire: in-transition flag missing %p", entry));
2655                 KASSERT(entry->wiring_thread == curthread,
2656                     ("vm_map_wire: alien wire %p", entry));
2657                 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
2658                     MAP_ENTRY_WIRE_SKIPPED);
2659                 entry->wiring_thread = NULL;
2660                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2661                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2662                         need_wakeup = TRUE;
2663                 }
2664                 vm_map_simplify_entry(map, entry);
2665         }
2666         vm_map_unlock(map);
2667         if (need_wakeup)
2668                 vm_map_wakeup(map);
2669         return (rv);
2670 }
2671
2672 /*
2673  * vm_map_sync
2674  *
2675  * Push any dirty cached pages in the address range to their pager.
2676  * If syncio is TRUE, dirty pages are written synchronously.
2677  * If invalidate is TRUE, any cached pages are freed as well.
2678  *
2679  * If the size of the region from start to end is zero, we are
2680  * supposed to flush all modified pages within the region containing
2681  * start.  Unfortunately, a region can be split or coalesced with
2682  * neighboring regions, making it difficult to determine what the
2683  * original region was.  Therefore, we approximate this requirement by
2684  * flushing the current region containing start.
2685  *
2686  * Returns an error if any part of the specified range is not mapped.
2687  */
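/*
 * For example, the msync(2) path boils down to a call of this form
 * (sketch; syncio and invalidate are derived from the MS_* flags):
 *
 *      rv = vm_map_sync(&p->p_vmspace->vm_map, addr, addr + size,
 *          syncio, invalidate);
 */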
2688 int
2689 vm_map_sync(
2690         vm_map_t map,
2691         vm_offset_t start,
2692         vm_offset_t end,
2693         boolean_t syncio,
2694         boolean_t invalidate)
2695 {
2696         vm_map_entry_t current;
2697         vm_map_entry_t entry;
2698         vm_size_t size;
2699         vm_object_t object;
2700         vm_ooffset_t offset;
2701         unsigned int last_timestamp;
2702         boolean_t failed;
2703
2704         vm_map_lock_read(map);
2705         VM_MAP_RANGE_CHECK(map, start, end);
2706         if (!vm_map_lookup_entry(map, start, &entry)) {
2707                 vm_map_unlock_read(map);
2708                 return (KERN_INVALID_ADDRESS);
2709         } else if (start == end) {
2710                 start = entry->start;
2711                 end = entry->end;
2712         }
2713         /*
2714          * Make a first pass to check for user-wired memory and holes.
2715          */
2716         for (current = entry; current != &map->header && current->start < end;
2717             current = current->next) {
2718                 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2719                         vm_map_unlock_read(map);
2720                         return (KERN_INVALID_ARGUMENT);
2721                 }
2722                 if (end > current->end &&
2723                     (current->next == &map->header ||
2724                         current->end != current->next->start)) {
2725                         vm_map_unlock_read(map);
2726                         return (KERN_INVALID_ADDRESS);
2727                 }
2728         }
2729
2730         if (invalidate)
2731                 pmap_remove(map->pmap, start, end);
2732         failed = FALSE;
2733
2734         /*
2735          * Make a second pass, cleaning/uncaching pages from the indicated
2736          * objects as we go.
2737          */
2738         for (current = entry; current != &map->header && current->start < end;) {
2739                 offset = current->offset + (start - current->start);
2740                 size = (end <= current->end ? end : current->end) - start;
2741                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2742                         vm_map_t smap;
2743                         vm_map_entry_t tentry;
2744                         vm_size_t tsize;
2745
2746                         smap = current->object.sub_map;
2747                         vm_map_lock_read(smap);
2748                         (void) vm_map_lookup_entry(smap, offset, &tentry);
2749                         tsize = tentry->end - offset;
2750                         if (tsize < size)
2751                                 size = tsize;
2752                         object = tentry->object.vm_object;
2753                         offset = tentry->offset + (offset - tentry->start);
2754                         vm_map_unlock_read(smap);
2755                 } else {
2756                         object = current->object.vm_object;
2757                 }
2758                 vm_object_reference(object);
2759                 last_timestamp = map->timestamp;
2760                 vm_map_unlock_read(map);
2761                 if (!vm_object_sync(object, offset, size, syncio, invalidate))
2762                         failed = TRUE;
2763                 start += size;
2764                 vm_object_deallocate(object);
2765                 vm_map_lock_read(map);
2766                 if (last_timestamp == map->timestamp ||
2767                     !vm_map_lookup_entry(map, start, &current))
2768                         current = current->next;
2769         }
2770
2771         vm_map_unlock_read(map);
2772         return (failed ? KERN_FAILURE : KERN_SUCCESS);
2773 }
2774
2775 /*
2776  *      vm_map_entry_unwire:    [ internal use only ]
2777  *
2778  *      Make the region specified by this entry pageable.
2779  *
2780  *      The map in question should be locked.
2781  *      [This is the reason for this routine's existence.]
2782  */
2783 static void
2784 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2785 {
2786         vm_fault_unwire(map, entry->start, entry->end,
2787             entry->object.vm_object != NULL &&
2788             (entry->object.vm_object->flags & OBJ_FICTITIOUS) != 0);
2789         entry->wired_count = 0;
2790 }
2791
2792 static void
2793 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
2794 {
2795
2796         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
2797                 vm_object_deallocate(entry->object.vm_object);
2798         uma_zfree(system_map ? kmapentzone : mapentzone, entry);
2799 }
2800
2801 /*
2802  *      vm_map_entry_delete:    [ internal use only ]
2803  *
2804  *      Deallocate the given entry from the target map.
2805  */
2806 static void
2807 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2808 {
2809         vm_object_t object;
2810         vm_pindex_t offidxstart, offidxend, count, size1;
2811         vm_ooffset_t size;
2812
2813         vm_map_entry_unlink(map, entry);
2814         object = entry->object.vm_object;
2815         size = entry->end - entry->start;
2816         map->size -= size;
2817
2818         if (entry->cred != NULL) {
2819                 swap_release_by_cred(size, entry->cred);
2820                 crfree(entry->cred);
2821         }
2822
2823         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2824             (object != NULL)) {
2825                 KASSERT(entry->cred == NULL || object->cred == NULL ||
2826                     (entry->eflags & MAP_ENTRY_NEEDS_COPY),
2827                     ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
2828                 count = OFF_TO_IDX(size);
2829                 offidxstart = OFF_TO_IDX(entry->offset);
2830                 offidxend = offidxstart + count;
2831                 VM_OBJECT_WLOCK(object);
2832                 if (object->ref_count != 1 &&
2833                     ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
2834                     object == kernel_object || object == kmem_object)) {
2835                         vm_object_collapse(object);
2836
2837                         /*
2838                          * The option OBJPR_NOTMAPPED can be passed here
2839                          * because vm_map_delete() already performed
2840                          * pmap_remove() on the only mapping to this range
2841                          * of pages. 
2842                          */
2843                         vm_object_page_remove(object, offidxstart, offidxend,
2844                             OBJPR_NOTMAPPED);
2845                         if (object->type == OBJT_SWAP)
2846                                 swap_pager_freespace(object, offidxstart, count);
2847                         if (offidxend >= object->size &&
2848                             offidxstart < object->size) {
2849                                 size1 = object->size;
2850                                 object->size = offidxstart;
2851                                 if (object->cred != NULL) {
2852                                         size1 -= object->size;
2853                                         KASSERT(object->charge >= ptoa(size1),
2854                                             ("vm_map_entry_delete: object->charge < 0"));
2855                                         swap_release_by_cred(ptoa(size1), object->cred);
2856                                         object->charge -= ptoa(size1);
2857                                 }
2858                         }
2859                 }
2860                 VM_OBJECT_WUNLOCK(object);
2861         } else
2862                 entry->object.vm_object = NULL;
2863         if (map->system_map)
2864                 vm_map_entry_deallocate(entry, TRUE);
2865         else {
2866                 entry->next = curthread->td_map_def_user;
2867                 curthread->td_map_def_user = entry;
2868         }
2869 }
2870
2871 /*
2872  *      vm_map_delete:  [ internal use only ]
2873  *
2874  *      Deallocates the given address range from the target
2875  *      map.
2876  */
2877 int
2878 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2879 {
2880         vm_map_entry_t entry;
2881         vm_map_entry_t first_entry;
2882
2883         VM_MAP_ASSERT_LOCKED(map);
2884         if (start == end)
2885                 return (KERN_SUCCESS);
2886
2887         /*
2888          * Find the start of the region, and clip it
2889          */
2890         if (!vm_map_lookup_entry(map, start, &first_entry))
2891                 entry = first_entry->next;
2892         else {
2893                 entry = first_entry;
2894                 vm_map_clip_start(map, entry, start);
2895         }
2896
2897         /*
2898          * Step through all entries in this region
2899          */
2900         while ((entry != &map->header) && (entry->start < end)) {
2901                 vm_map_entry_t next;
2902
2903                 /*
2904                  * Wait for wiring or unwiring of an entry to complete.
2905                  * Also wait for any system wirings to disappear on
2906                  * user maps.
2907                  */
2908                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
2909                     (vm_map_pmap(map) != kernel_pmap &&
2910                     vm_map_entry_system_wired_count(entry) != 0)) {
2911                         unsigned int last_timestamp;
2912                         vm_offset_t saved_start;
2913                         vm_map_entry_t tmp_entry;
2914
2915                         saved_start = entry->start;
2916                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2917                         last_timestamp = map->timestamp;
2918                         (void) vm_map_unlock_and_wait(map, 0);
2919                         vm_map_lock(map);
2920                         if (last_timestamp + 1 != map->timestamp) {
2921                                 /*
2922                                  * Look again for the entry because the map was
2923                                  * modified while it was unlocked.
2924                                  * Specifically, the entry may have been
2925                                  * clipped, merged, or deleted.
2926                                  */
2927                                 if (!vm_map_lookup_entry(map, saved_start,
2928                                                          &tmp_entry))
2929                                         entry = tmp_entry->next;
2930                                 else {
2931                                         entry = tmp_entry;
2932                                         vm_map_clip_start(map, entry,
2933                                                           saved_start);
2934                                 }
2935                         }
2936                         continue;
2937                 }
2938                 vm_map_clip_end(map, entry, end);
2939
2940                 next = entry->next;
2941
2942                 /*
2943                  * Unwire before removing addresses from the pmap; otherwise,
2944                  * unwiring will put the entries back in the pmap.
2945                  */
2946                 if (entry->wired_count != 0) {
2947                         vm_map_entry_unwire(map, entry);
2948                 }
2949
2950                 pmap_remove(map->pmap, entry->start, entry->end);
2951
2952                 /*
2953                  * Delete the entry only after removing all pmap
2954                  * entries pointing to its pages.  (Otherwise, its
2955                  * page frames may be reallocated, and any modify bits
2956                  * will be set in the wrong object!)
2957                  */
2958                 vm_map_entry_delete(map, entry);
2959                 entry = next;
2960         }
2961         return (KERN_SUCCESS);
2962 }
2963
2964 /*
2965  *      vm_map_remove:
2966  *
2967  *      Remove the given address range from the target map.
2968  *      This is the exported form of vm_map_delete.
2969  */
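/*
 * Illustrative sketch (not part of the original source): code that does not
 * already hold the map lock uses vm_map_remove(), while code already running
 * under vm_map_lock() calls vm_map_delete() directly.  "map", "start" and
 * "end" are placeholders for page-aligned addresses.
 *
 *      (void)vm_map_remove(map, start, end);
 *
 *      vm_map_lock(map);
 *      (void)vm_map_delete(map, start, end);
 *      vm_map_unlock(map);
 */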
2970 int
2971 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2972 {
2973         int result;
2974
2975         vm_map_lock(map);
2976         VM_MAP_RANGE_CHECK(map, start, end);
2977         result = vm_map_delete(map, start, end);
2978         vm_map_unlock(map);
2979         return (result);
2980 }
2981
2982 /*
2983  *      vm_map_check_protection:
2984  *
2985  *      Assert that the target map allows the specified privilege on the
2986  *      entire address region given.  The entire region must be allocated.
2987  *
2988  *      WARNING!  This code does not and should not check whether the
2989  *      contents of the region are accessible.  For example, a smaller file
2990  *      might be mapped into a larger address space.
2991  *
2992  *      NOTE!  This code is also called by munmap().
2993  *
2994  *      The map must be locked.  A read lock is sufficient.
2995  */
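/*
 * Illustrative sketch (not part of the original source): verifying that an
 * entire range is readable before operating on it.  "map", "start" and
 * "end" are placeholders.
 *
 *      vm_map_lock_read(map);
 *      ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
 *      vm_map_unlock_read(map);
 *      if (!ok)
 *              return (EFAULT);
 */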
2996 boolean_t
2997 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2998                         vm_prot_t protection)
2999 {
3000         vm_map_entry_t entry;
3001         vm_map_entry_t tmp_entry;
3002
3003         if (!vm_map_lookup_entry(map, start, &tmp_entry))
3004                 return (FALSE);
3005         entry = tmp_entry;
3006
3007         while (start < end) {
3008                 if (entry == &map->header)
3009                         return (FALSE);
3010                 /*
3011                  * No holes allowed!
3012                  */
3013                 if (start < entry->start)
3014                         return (FALSE);
3015                 /*
3016                  * Check protection associated with entry.
3017                  */
3018                 if ((entry->protection & protection) != protection)
3019                         return (FALSE);
3020                 /* go to next entry */
3021                 start = entry->end;
3022                 entry = entry->next;
3023         }
3024         return (TRUE);
3025 }
3026
3027 /*
3028  *      vm_map_copy_entry:
3029  *
3030  *      Copies the contents of the source entry to the destination
3031  *      entry.  The entries *must* be aligned properly.
3032  */
3033 static void
3034 vm_map_copy_entry(
3035         vm_map_t src_map,
3036         vm_map_t dst_map,
3037         vm_map_entry_t src_entry,
3038         vm_map_entry_t dst_entry,
3039         vm_ooffset_t *fork_charge)
3040 {
3041         vm_object_t src_object;
3042         vm_map_entry_t fake_entry;
3043         vm_offset_t size;
3044         struct ucred *cred;
3045         int charged;
3046
3047         VM_MAP_ASSERT_LOCKED(dst_map);
3048
3049         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3050                 return;
3051
3052         if (src_entry->wired_count == 0 ||
3053             (src_entry->protection & VM_PROT_WRITE) == 0) {
3054                 /*
3055                  * If the source entry is marked needs_copy, it is already
3056                  * write-protected.
3057                  */
3058                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3059                     (src_entry->protection & VM_PROT_WRITE) != 0) {
3060                         pmap_protect(src_map->pmap,
3061                             src_entry->start,
3062                             src_entry->end,
3063                             src_entry->protection & ~VM_PROT_WRITE);
3064                 }
3065
3066                 /*
3067                  * Make a copy of the object.
3068                  */
3069                 size = src_entry->end - src_entry->start;
3070                 if ((src_object = src_entry->object.vm_object) != NULL) {
3071                         VM_OBJECT_WLOCK(src_object);
3072                         charged = ENTRY_CHARGED(src_entry);
3073                         if ((src_object->handle == NULL) &&
3074                                 (src_object->type == OBJT_DEFAULT ||
3075                                  src_object->type == OBJT_SWAP)) {
3076                                 vm_object_collapse(src_object);
3077                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3078                                         vm_object_split(src_entry);
3079                                         src_object = src_entry->object.vm_object;
3080                                 }
3081                         }
3082                         vm_object_reference_locked(src_object);
3083                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3084                         if (src_entry->cred != NULL &&
3085                             !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3086                                 KASSERT(src_object->cred == NULL,
3087                                     ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3088                                      src_object));
3089                                 src_object->cred = src_entry->cred;
3090                                 src_object->charge = size;
3091                         }
3092                         VM_OBJECT_WUNLOCK(src_object);
3093                         dst_entry->object.vm_object = src_object;
3094                         if (charged) {
3095                                 cred = curthread->td_ucred;
3096                                 crhold(cred);
3097                                 dst_entry->cred = cred;
3098                                 *fork_charge += size;
3099                                 if (!(src_entry->eflags &
3100                                       MAP_ENTRY_NEEDS_COPY)) {
3101                                         crhold(cred);
3102                                         src_entry->cred = cred;
3103                                         *fork_charge += size;
3104                                 }
3105                         }
3106                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
3107                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
3108                         dst_entry->offset = src_entry->offset;
3109                         if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3110                                 /*
3111                                  * MAP_ENTRY_VN_WRITECNT cannot
3112                                  * indicate write reference from
3113                                  * src_entry, since the entry is
3114                                  * marked as needs copy.  Allocate a
3115                                  * fake entry that is used to
3116                                  * decrement object->un_pager.vnp.writecount
3117                                  * at the appropriate time.  Attach
3118                                  * fake_entry to the deferred list.
3119                                  */
3120                                 fake_entry = vm_map_entry_create(dst_map);
3121                                 fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3122                                 src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3123                                 vm_object_reference(src_object);
3124                                 fake_entry->object.vm_object = src_object;
3125                                 fake_entry->start = src_entry->start;
3126                                 fake_entry->end = src_entry->end;
3127                                 fake_entry->next = curthread->td_map_def_user;
3128                                 curthread->td_map_def_user = fake_entry;
3129                         }
3130                 } else {
3131                         dst_entry->object.vm_object = NULL;
3132                         dst_entry->offset = 0;
3133                         if (src_entry->cred != NULL) {
3134                                 dst_entry->cred = curthread->td_ucred;
3135                                 crhold(dst_entry->cred);
3136                                 *fork_charge += size;
3137                         }
3138                 }
3139
3140                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
3141                     dst_entry->end - dst_entry->start, src_entry->start);
3142         } else {
3143                 /*
3144                  * We don't want to make writeable wired pages copy-on-write.
3145                  * Immediately copy these pages into the new map by simulating
3146                  * page faults.  The new pages are pageable.
3147                  */
3148                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3149                     fork_charge);
3150         }
3151 }
3152
3153 /*
3154  * vmspace_map_entry_forked:
3155  * Update the newly-forked vmspace each time a map entry is inherited
3156  * or copied.  The values for vm_dsize and vm_tsize are approximate
3157  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3158  */
3159 static void
3160 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3161     vm_map_entry_t entry)
3162 {
3163         vm_size_t entrysize;
3164         vm_offset_t newend;
3165
3166         entrysize = entry->end - entry->start;
3167         vm2->vm_map.size += entrysize;
3168         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3169                 vm2->vm_ssize += btoc(entrysize);
3170         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3171             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3172                 newend = MIN(entry->end,
3173                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3174                 vm2->vm_dsize += btoc(newend - entry->start);
3175         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3176             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3177                 newend = MIN(entry->end,
3178                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3179                 vm2->vm_tsize += btoc(newend - entry->start);
3180         }
3181 }
3182
3183 /*
3184  * vmspace_fork:
3185  * Create a new process vmspace structure and vm_map
3186  * based on those of an existing process.  The new map
3187  * is based on the old map, according to the inheritance
3188  * values on the regions in that map.
3189  *
3190  * XXX It might be worth coalescing the entries added to the new vmspace.
3191  *
3192  * The source map must not be locked.
3193  */
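/*
 * Illustrative sketch (not part of the original source): a fork-style caller
 * receives the copy-on-write charge in fork_charge and must reserve that
 * much swap against a suitable credential before using the new vmspace,
 * much as vmspace_unshare() below does.  "p1" is a placeholder.
 *
 *      fork_charge = 0;
 *      vm2 = vmspace_fork(p1->p_vmspace, &fork_charge);
 *      if (vm2 == NULL)
 *              return (ENOMEM);
 *      if (!swap_reserve_by_cred(fork_charge, p1->p_ucred)) {
 *              vmspace_free(vm2);
 *              return (ENOMEM);
 *      }
 */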
3194 struct vmspace *
3195 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3196 {
3197         struct vmspace *vm2;
3198         vm_map_t new_map, old_map;
3199         vm_map_entry_t new_entry, old_entry;
3200         vm_object_t object;
3201         int locked;
3202
3203         old_map = &vm1->vm_map;
3204         /* Copy immutable fields of vm1 to vm2. */
3205         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
3206         if (vm2 == NULL)
3207                 return (NULL);
3208         vm2->vm_taddr = vm1->vm_taddr;
3209         vm2->vm_daddr = vm1->vm_daddr;
3210         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3211         vm_map_lock(old_map);
3212         if (old_map->busy)
3213                 vm_map_wait_busy(old_map);
3214         new_map = &vm2->vm_map;
3215         locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3216         KASSERT(locked, ("vmspace_fork: lock failed"));
3217
3218         old_entry = old_map->header.next;
3219
3220         while (old_entry != &old_map->header) {
3221                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3222                         panic("vm_map_fork: encountered a submap");
3223
3224                 switch (old_entry->inheritance) {
3225                 case VM_INHERIT_NONE:
3226                         break;
3227
3228                 case VM_INHERIT_SHARE:
3229                         /*
3230                          * Clone the entry, creating the shared object if necessary.
3231                          */
3232                         object = old_entry->object.vm_object;
3233                         if (object == NULL) {
3234                                 object = vm_object_allocate(OBJT_DEFAULT,
3235                                         atop(old_entry->end - old_entry->start));
3236                                 old_entry->object.vm_object = object;
3237                                 old_entry->offset = 0;
3238                                 if (old_entry->cred != NULL) {
3239                                         object->cred = old_entry->cred;
3240                                         object->charge = old_entry->end -
3241                                             old_entry->start;
3242                                         old_entry->cred = NULL;
3243                                 }
3244                         }
3245
3246                         /*
3247                          * Add the reference before calling vm_object_shadow
3248                          * to ensure that a shadow object is created.
3249                          */
3250                         vm_object_reference(object);
3251                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3252                                 vm_object_shadow(&old_entry->object.vm_object,
3253                                     &old_entry->offset,
3254                                     old_entry->end - old_entry->start);
3255                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3256                                 /* Transfer the second reference too. */
3257                                 vm_object_reference(
3258                                     old_entry->object.vm_object);
3259
3260                                 /*
3261                                  * As in vm_map_simplify_entry(), the
3262                                  * vnode lock will not be acquired in
3263                                  * this call to vm_object_deallocate().
3264                                  */
3265                                 vm_object_deallocate(object);
3266                                 object = old_entry->object.vm_object;
3267                         }
3268                         VM_OBJECT_WLOCK(object);
3269                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3270                         if (old_entry->cred != NULL) {
3271                                 KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3272                                 object->cred = old_entry->cred;
3273                                 object->charge = old_entry->end - old_entry->start;
3274                                 old_entry->cred = NULL;
3275                         }
3276
3277                         /*
3278                          * Assert the correct state of the vnode
3279                          * v_writecount while the object is locked, so
3280                          * that it does not need to be relocked later
3281                          * just for the assertion.
3282                          */
3283                         if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3284                             object->type == OBJT_VNODE) {
3285                                 KASSERT(((struct vnode *)object->handle)->
3286                                     v_writecount > 0,
3287                                     ("vmspace_fork: v_writecount %p", object));
3288                                 KASSERT(object->un_pager.vnp.writemappings > 0,
3289                                     ("vmspace_fork: vnp.writecount %p",
3290                                     object));
3291                         }
3292                         VM_OBJECT_WUNLOCK(object);
3293
3294                         /*
3295                          * Clone the entry, referencing the shared object.
3296                          */
3297                         new_entry = vm_map_entry_create(new_map);
3298                         *new_entry = *old_entry;
3299                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3300                             MAP_ENTRY_IN_TRANSITION);
3301                         new_entry->wiring_thread = NULL;
3302                         new_entry->wired_count = 0;
3303                         if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3304                                 vnode_pager_update_writecount(object,
3305                                     new_entry->start, new_entry->end);
3306                         }
3307
3308                         /*
3309                          * Insert the entry into the new map -- we know we're
3310                          * inserting at the end of the new map.
3311                          */
3312                         vm_map_entry_link(new_map, new_map->header.prev,
3313                             new_entry);
3314                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3315
3316                         /*
3317                          * Update the physical map
3318                          */
3319                         pmap_copy(new_map->pmap, old_map->pmap,
3320                             new_entry->start,
3321                             (old_entry->end - old_entry->start),
3322                             old_entry->start);
3323                         break;
3324
3325                 case VM_INHERIT_COPY:
3326                         /*
3327                          * Clone the entry and link into the map.
3328                          */
3329                         new_entry = vm_map_entry_create(new_map);
3330                         *new_entry = *old_entry;
3331                         /*
3332                          * Copied entry is COW over the old object.
3333                          */
3334                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3335                             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3336                         new_entry->wiring_thread = NULL;
3337                         new_entry->wired_count = 0;
3338                         new_entry->object.vm_object = NULL;
3339                         new_entry->cred = NULL;
3340                         vm_map_entry_link(new_map, new_map->header.prev,
3341                             new_entry);
3342                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3343                         vm_map_copy_entry(old_map, new_map, old_entry,
3344                             new_entry, fork_charge);
3345                         break;
3346                 }
3347                 old_entry = old_entry->next;
3348         }
3349         /*
3350          * Use inlined vm_map_unlock() to postpone handling the deferred
3351          * map entries, which cannot be done until both old_map and
3352          * new_map locks are released.
3353          */
3354         sx_xunlock(&old_map->lock);
3355         sx_xunlock(&new_map->lock);
3356         vm_map_process_deferred();
3357
3358         return (vm2);
3359 }
3360
3361 int
3362 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3363     vm_prot_t prot, vm_prot_t max, int cow)
3364 {
3365         vm_size_t growsize, init_ssize;
3366         rlim_t lmemlim, vmemlim;
3367         int rv;
3368
3369         growsize = sgrowsiz;
3370         init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3371         vm_map_lock(map);
3372         PROC_LOCK(curproc);
3373         lmemlim = lim_cur(curproc, RLIMIT_MEMLOCK);
3374         vmemlim = lim_cur(curproc, RLIMIT_VMEM);
3375         PROC_UNLOCK(curproc);
3376         if (!old_mlock && map->flags & MAP_WIREFUTURE) {
3377                 if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) {
3378                         rv = KERN_NO_SPACE;
3379                         goto out;
3380                 }
3381         }
3382         /* If we would blow our VMEM resource limit, no go */
3383         if (map->size + init_ssize > vmemlim) {
3384                 rv = KERN_NO_SPACE;
3385                 goto out;
3386         }
3387         rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3388             max, cow);
3389 out:
3390         vm_map_unlock(map);
3391         return (rv);
3392 }
3393
3394 static int
3395 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3396     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3397 {
3398         vm_map_entry_t new_entry, prev_entry;
3399         vm_offset_t bot, top;
3400         vm_size_t init_ssize;
3401         int orient, rv;
3402
3403         /*
3404          * The stack orientation is piggybacked with the cow argument.
3405          * Extract it into orient and mask the cow argument so that we
3406          * don't pass it around further.
3407          * NOTE: We explicitly allow bi-directional stacks.
3408          */
3409         orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
3410         KASSERT(orient != 0, ("No stack grow direction"));
3411
3412         if (addrbos < vm_map_min(map) ||
3413             addrbos > vm_map_max(map) ||
3414             addrbos + max_ssize < addrbos)
3415                 return (KERN_NO_SPACE);
3416
3417         init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3418
3419         /* If addr is already mapped, no go */
3420         if (vm_map_lookup_entry(map, addrbos, &prev_entry))
3421                 return (KERN_NO_SPACE);
3422
3423         /*
3424          * If we can't accommodate max_ssize in the current mapping, no go.
3425          * However, we need to be aware that subsequent user mappings might
3426          * map into the space we have reserved for stack, and currently this
3427          * space is not protected.
3428          *
3429          * Hopefully we will at least detect this condition when we try to
3430          * grow the stack.
3431          */
3432         if ((prev_entry->next != &map->header) &&
3433             (prev_entry->next->start < addrbos + max_ssize))
3434                 return (KERN_NO_SPACE);
3435
3436         /*
3437          * We initially map a stack of only init_ssize.  We will grow as
3438          * needed later.  Depending on the orientation of the stack (i.e.
3439          * the grow direction) we either map at the top of the range, the
3440          * bottom of the range or in the middle.
3441          *
3442          * Note: we would normally expect prot and max to be VM_PROT_ALL,
3443          * and cow to be 0.  Possibly we should eliminate these as input
3444          * parameters, and just pass these values here in the insert call.
3445          */
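        /*
         * Worked example (illustrative numbers, not from the original
         * source): with addrbos = 0x20000000, max_ssize = 8 MB and
         * growsize = 128 KB, init_ssize is 128 KB.  For
         * MAP_STACK_GROWS_DOWN this gives bot = 0x20000000 + 0x800000 -
         * 0x20000 = 0x207e0000 and top = 0x20800000, leaving the
         * 0x7e0000 bytes below bot as avail_ssize for later growth.
         */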
3446         if (orient == MAP_STACK_GROWS_DOWN)
3447                 bot = addrbos + max_ssize - init_ssize;
3448         else if (orient == MAP_STACK_GROWS_UP)
3449                 bot = addrbos;
3450         else
3451                 bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
3452         top = bot + init_ssize;
3453         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
3454
3455         /* Now set the avail_ssize amount. */
3456         if (rv == KERN_SUCCESS) {
3457                 new_entry = prev_entry->next;
3458                 if (new_entry->end != top || new_entry->start != bot)
3459                         panic("Bad entry start/end for new stack entry");
3460
3461                 new_entry->avail_ssize = max_ssize - init_ssize;
3462                 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
3463                     (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
3464                     ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3465                 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
3466                     (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
3467                     ("new entry lacks MAP_ENTRY_GROWS_UP"));
3468         }
3469
3470         return (rv);
3471 }
3472
3473 static int stack_guard_page = 0;
3474 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3475     &stack_guard_page, 0,
3476     "Insert stack guard page ahead of the growable segments.");
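/*
 * Illustrative usage (not part of the original source): since the knob is
 * declared CTLFLAG_RWTUN it can be set both as a loader tunable and at run
 * time, e.g.
 *
 *      security.bsd.stack_guard_page=1         (in loader.conf)
 *      sysctl security.bsd.stack_guard_page=1  (on a running system)
 */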
3477
3478 /*
3479  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the desired
3480  * address is already mapped, or if we successfully grow the stack.  Also
3481  * returns KERN_SUCCESS if addr is outside the stack range (this is strange,
3482  * but preserves compatibility with the grow function in vm_machdep.c).
3483  */
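/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * fault-handling caller would attempt to extend the stack before giving up
 * on a faulting user address.  "p" and "fault_addr" are placeholders.
 *
 *      rv = vm_map_growstack(p, fault_addr);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 *      (retry the faulting access)
 */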
3484 int
3485 vm_map_growstack(struct proc *p, vm_offset_t addr)
3486 {
3487         vm_map_entry_t next_entry, prev_entry;
3488         vm_map_entry_t new_entry, stack_entry;
3489         struct vmspace *vm = p->p_vmspace;
3490         vm_map_t map = &vm->vm_map;
3491         vm_offset_t end;
3492         vm_size_t growsize;
3493         size_t grow_amount, max_grow;
3494         rlim_t lmemlim, stacklim, vmemlim;
3495         int is_procstack, rv;
3496         struct ucred *cred;
3497 #ifdef notyet
3498         uint64_t limit;
3499 #endif
3500 #ifdef RACCT
3501         int error;
3502 #endif
3503
3504 Retry:
3505         PROC_LOCK(p);
3506         lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
3507         stacklim = lim_cur(p, RLIMIT_STACK);
3508         vmemlim = lim_cur(p, RLIMIT_VMEM);
3509         PROC_UNLOCK(p);
3510
3511         vm_map_lock_read(map);
3512
3513         /* If addr is already in the entry range, no need to grow. */
3514         if (vm_map_lookup_entry(map, addr, &prev_entry)) {
3515                 vm_map_unlock_read(map);
3516                 return (KERN_SUCCESS);
3517         }
3518
3519         next_entry = prev_entry->next;
3520         if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
3521                 /*
3522                  * This entry does not grow upwards. Since the address lies
3523                  * beyond this entry, the next entry (if one exists) has to
3524                  * be a downward growable entry. The entry list header is
3525                  * never a growable entry, so it suffices to check the flags.
3526                  */
3527                 if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
3528                         vm_map_unlock_read(map);
3529                         return (KERN_SUCCESS);
3530                 }
3531                 stack_entry = next_entry;
3532         } else {
3533                 /*
3534                  * This entry grows upward. If the next entry does not at
3535                  * least grow downwards, this is the entry we need to grow.
3536                  * Otherwise, we have two possible choices and we have to
3537                  * select one.
3538                  */
3539                 if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
3540                         /*
3541                          * We have two choices: grow the entry closest to
3542                          * the address to minimize the amount of growth.
3543                          */
3544                         if (addr - prev_entry->end <= next_entry->start - addr)
3545                                 stack_entry = prev_entry;
3546                         else
3547                                 stack_entry = next_entry;
3548                 } else
3549                         stack_entry = prev_entry;
3550         }
3551
3552         if (stack_entry == next_entry) {
3553                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
3554                 KASSERT(addr < stack_entry->start, ("foo"));
3555                 end = (prev_entry != &map->header) ? prev_entry->end :
3556                     stack_entry->start - stack_entry->avail_ssize;
3557                 grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
3558                 max_grow = stack_entry->start - end;
3559         } else {
3560                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
3561                 KASSERT(addr >= stack_entry->end, ("foo"));
3562                 end = (next_entry != &map->header) ? next_entry->start :
3563                     stack_entry->end + stack_entry->avail_ssize;
3564                 grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
3565                 max_grow = end - stack_entry->end;
3566         }
3567
3568         if (grow_amount > stack_entry->avail_ssize) {
3569                 vm_map_unlock_read(map);
3570                 return (KERN_NO_SPACE);
3571         }
3572
3573         /*
3574          * If there is no longer enough space between the entries, fail and
3575          * adjust the available space.  Note: this should only happen if the
3576          * user has mapped into the stack area after the stack was created,
3577          * and is probably an error.
3578          *
3579          * This also effectively destroys any guard page the user might have
3580          * intended by limiting the stack size.
3581          */
3582         if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) {
3583                 if (vm_map_lock_upgrade(map))
3584                         goto Retry;
3585
3586                 stack_entry->avail_ssize = max_grow;
3587
3588                 vm_map_unlock(map);
3589                 return (KERN_NO_SPACE);
3590         }
3591
3592         is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
3593
3594         /*
3595          * If this is the main process stack, see if we're over the stack
3596          * limit.
3597          */
3598         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3599                 vm_map_unlock_read(map);
3600                 return (KERN_NO_SPACE);
3601         }
3602 #ifdef RACCT
3603         PROC_LOCK(p);
3604         if (is_procstack &&
3605             racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) {
3606                 PROC_UNLOCK(p);
3607                 vm_map_unlock_read(map);
3608                 return (KERN_NO_SPACE);
3609         }
3610         PROC_UNLOCK(p);
3611 #endif
3612
3613         /* Round up the grow amount to a multiple of sgrowsiz. */
3614         growsize = sgrowsiz;
3615         grow_amount = roundup(grow_amount, growsize);
3616         if (grow_amount > stack_entry->avail_ssize)
3617                 grow_amount = stack_entry->avail_ssize;
3618         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3619                 grow_amount = trunc_page((vm_size_t)stacklim) -
3620                     ctob(vm->vm_ssize);
3621         }
3622 #ifdef notyet
3623         PROC_LOCK(p);
3624         limit = racct_get_available(p, RACCT_STACK);
3625         PROC_UNLOCK(p);
3626         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
3627                 grow_amount = limit - ctob(vm->vm_ssize);
3628 #endif
3629         if (!old_mlock && map->flags & MAP_WIREFUTURE) {
3630                 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
3631                         vm_map_unlock_read(map);
3632                         rv = KERN_NO_SPACE;
3633                         goto out;
3634                 }
3635 #ifdef RACCT
3636                 PROC_LOCK(p);
3637                 if (racct_set(p, RACCT_MEMLOCK,
3638                     ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
3639                         PROC_UNLOCK(p);
3640                         vm_map_unlock_read(map);
3641                         rv = KERN_NO_SPACE;
3642                         goto out;
3643                 }
3644                 PROC_UNLOCK(p);
3645 #endif
3646         }
3647         /* If we would blow our VMEM resource limit, no go */
3648         if (map->size + grow_amount > vmemlim) {
3649                 vm_map_unlock_read(map);
3650                 rv = KERN_NO_SPACE;
3651                 goto out;
3652         }
3653 #ifdef RACCT
3654         PROC_LOCK(p);
3655         if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
3656                 PROC_UNLOCK(p);
3657                 vm_map_unlock_read(map);
3658                 rv = KERN_NO_SPACE;
3659                 goto out;
3660         }
3661         PROC_UNLOCK(p);
3662 #endif
3663
3664         if (vm_map_lock_upgrade(map))
3665                 goto Retry;
3666
3667         if (stack_entry == next_entry) {
3668                 /*
3669                  * Growing downward.
3670                  */
3671                 /* Get the preliminary new entry start value */
3672                 addr = stack_entry->start - grow_amount;
3673
3674                 /*
3675                  * If this puts us into the previous entry, cut back our
3676                  * growth to the available space. Also, see the note above.
3677                  */
3678                 if (addr < end) {
3679                         stack_entry->avail_ssize = max_grow;
3680                         addr = end;
3681                         if (stack_guard_page)
3682                                 addr += PAGE_SIZE;
3683                 }
3684
3685                 rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
3686                     next_entry->protection, next_entry->max_protection,
3687                     MAP_STACK_GROWS_DOWN);
3688
3689                 /* Adjust the available stack space by the amount we grew. */
3690                 if (rv == KERN_SUCCESS) {
3691                         new_entry = prev_entry->next;
3692                         KASSERT(new_entry == stack_entry->prev, ("foo"));
3693                         KASSERT(new_entry->end == stack_entry->start, ("foo"));
3694                         KASSERT(new_entry->start == addr, ("foo"));
3695                         KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) !=
3696                             0, ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3697                         grow_amount = new_entry->end - new_entry->start;
3698                         new_entry->avail_ssize = stack_entry->avail_ssize -
3699                             grow_amount;
3700                         stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
3701                 }
3702         } else {
3703                 /*
3704                  * Growing upward.
3705                  */
3706                 addr = stack_entry->end + grow_amount;
3707
3708                 /*
3709                  * If this puts us into the next entry, cut back our growth
3710                  * to the available space. Also, see the note above.
3711                  */
3712                 if (addr > end) {
3713                         stack_entry->avail_ssize = end - stack_entry->end;
3714                         addr = end;
3715                         if (stack_guard_page)
3716                                 addr -= PAGE_SIZE;
3717                 }
3718
3719                 grow_amount = addr - stack_entry->end;
3720                 cred = stack_entry->cred;
3721                 if (cred == NULL && stack_entry->object.vm_object != NULL)
3722                         cred = stack_entry->object.vm_object->cred;
3723                 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
3724                         rv = KERN_NO_SPACE;
3725                 /* Grow the underlying object if applicable. */
3726                 else if (stack_entry->object.vm_object == NULL ||
3727                          vm_object_coalesce(stack_entry->object.vm_object,
3728                          stack_entry->offset,
3729                          (vm_size_t)(stack_entry->end - stack_entry->start),
3730                          (vm_size_t)grow_amount, cred != NULL)) {
3731                         map->size += (addr - stack_entry->end);
3732                         /* Update the current entry. */
3733                         stack_entry->end = addr;
3734                         stack_entry->avail_ssize -= grow_amount;
3735                         vm_map_entry_resize_free(map, stack_entry);
3736                         rv = KERN_SUCCESS;
3737                 } else
3738                         rv = KERN_FAILURE;
3739         }
3740
3741         if (rv == KERN_SUCCESS && is_procstack)
3742                 vm->vm_ssize += btoc(grow_amount);
3743
3744         vm_map_unlock(map);
3745
3746         /*
3747          * Heed the MAP_WIREFUTURE flag if it was set for this process.
3748          */
3749         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
3750                 vm_map_wire(map,
3751                     (stack_entry == next_entry) ? addr : addr - grow_amount,
3752                     (stack_entry == next_entry) ? stack_entry->start : addr,
3753                     (p->p_flag & P_SYSTEM)
3754                     ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
3755                     : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
3756         }
3757
3758 out:
3759 #ifdef RACCT
3760         if (rv != KERN_SUCCESS) {
3761                 PROC_LOCK(p);
3762                 error = racct_set(p, RACCT_VMEM, map->size);
3763                 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
3764                 if (!old_mlock) {
3765                         error = racct_set(p, RACCT_MEMLOCK,
3766                             ptoa(pmap_wired_count(map->pmap)));
3767                         KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
3768                 }
3769                 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
3770                 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
3771                 PROC_UNLOCK(p);
3772         }
3773 #endif
3774
3775         return (rv);
3776 }
3777
3778 /*
3779  * Unshare the specified VM space for exec.  If other processes share
3780  * it, then create a new one.  The new vmspace is empty.
3781  */
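/*
 * Illustrative sketch (not part of the original source): an exec-style
 * caller switches to the fresh vmspace and, once the exec path no longer
 * touches the old address space, clears TDP_EXECVMSPC and drops its
 * reference.  "p", "sv_minuser", "sv_maxuser" and "oldvmspace" are
 * placeholders.
 *
 *      oldvmspace = p->p_vmspace;
 *      error = vmspace_exec(p, sv_minuser, sv_maxuser);
 *      if (error != 0)
 *              return (error);
 *      (exec continues using the new vmspace)
 *      curthread->td_pflags &= ~TDP_EXECVMSPC;
 *      vmspace_free(oldvmspace);
 */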
3782 int
3783 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
3784 {
3785         struct vmspace *oldvmspace = p->p_vmspace;
3786         struct vmspace *newvmspace;
3787
3788         KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
3789             ("vmspace_exec recursed"));
3790         newvmspace = vmspace_alloc(minuser, maxuser, NULL);
3791         if (newvmspace == NULL)
3792                 return (ENOMEM);
3793         newvmspace->vm_swrss = oldvmspace->vm_swrss;
3794         /*
3795          * This code is written like this for prototype purposes.  The
3796          * goal is to avoid running down the vmspace here, but to let the
3797          * other processes that are still using the vmspace finally
3798          * run it down.  Even though there is little or no chance of blocking
3799          * here, it is a good idea to keep this form for future mods.
3800          */
3801         PROC_VMSPACE_LOCK(p);
3802         p->p_vmspace = newvmspace;
3803         PROC_VMSPACE_UNLOCK(p);
3804         if (p == curthread->td_proc)
3805                 pmap_activate(curthread);
3806         curthread->td_pflags |= TDP_EXECVMSPC;
3807         return (0);
3808 }
3809
3810 /*
3811  * Unshare the specified VM space for forcing COW.  This
3812  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3813  */
3814 int
3815 vmspace_unshare(struct proc *p)
3816 {
3817         struct vmspace *oldvmspace = p->p_vmspace;
3818         struct vmspace *newvmspace;
3819         vm_ooffset_t fork_charge;
3820
3821         if (oldvmspace->vm_refcnt == 1)
3822                 return (0);
3823         fork_charge = 0;
3824         newvmspace = vmspace_fork(oldvmspace, &fork_charge);
3825         if (newvmspace == NULL)
3826                 return (ENOMEM);
3827         if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
3828                 vmspace_free(newvmspace);
3829                 return (ENOMEM);
3830         }
3831         PROC_VMSPACE_LOCK(p);
3832         p->p_vmspace = newvmspace;
3833         PROC_VMSPACE_UNLOCK(p);
3834         if (p == curthread->td_proc)
3835                 pmap_activate(curthread);
3836         vmspace_free(oldvmspace);
3837         return (0);
3838 }
3839
3840 /*
3841  *      vm_map_lookup:
3842  *
3843  *      Finds the VM object, offset, and
3844  *      protection for a given virtual address in the
3845  *      specified map, assuming a page fault of the
3846  *      type specified.
3847  *
3848  *      Leaves the map in question locked for read; return
3849  *      values are guaranteed until a vm_map_lookup_done
3850  *      call is performed.  Note that the map argument
3851  *      is in/out; the returned map must be used in
3852  *      the call to vm_map_lookup_done.
3853  *
3854  *      A handle (out_entry) is returned for use in
3855  *      vm_map_lookup_done, to make that fast.
3856  *
3857  *      If a lookup is requested with "write protection"
3858  *      specified, the map may be changed to perform virtual
3859  *      copying operations, although the data referenced will
3860  *      remain the same.
3861  */
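/*
 * Illustrative sketch (not part of the original source): a fault-handler
 * style caller keeps the returned map read-locked until it is done with the
 * object/pindex pair.  All variable names below are placeholders.
 *
 *      rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *          &pindex, &prot, &wired);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 *      (use object and pindex while the map stays read-locked)
 *      vm_map_lookup_done(map, entry);
 */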
3862 int
3863 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
3864               vm_offset_t vaddr,
3865               vm_prot_t fault_typea,
3866               vm_map_entry_t *out_entry,        /* OUT */
3867               vm_object_t *object,              /* OUT */
3868               vm_pindex_t *pindex,              /* OUT */
3869               vm_prot_t *out_prot,              /* OUT */
3870               boolean_t *wired)                 /* OUT */
3871 {
3872         vm_map_entry_t entry;
3873         vm_map_t map = *var_map;
3874         vm_prot_t prot;
3875         vm_prot_t fault_type = fault_typea;
3876         vm_object_t eobject;
3877         vm_size_t size;
3878         struct ucred *cred;
3879
3880 RetryLookup:;
3881
3882         vm_map_lock_read(map);
3883
3884         /*
3885          * Lookup the faulting address.
3886          */
3887         if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
3888                 vm_map_unlock_read(map);
3889                 return (KERN_INVALID_ADDRESS);
3890         }
3891
3892         entry = *out_entry;
3893
3894         /*
3895          * Handle submaps.
3896          */
3897         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3898                 vm_map_t old_map = map;
3899
3900                 *var_map = map = entry->object.sub_map;
3901                 vm_map_unlock_read(old_map);
3902                 goto RetryLookup;
3903         }
3904
3905         /*
3906          * Check whether this task is allowed to have this page.
3907          */
3908         prot = entry->protection;
3909         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3910         if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
3911                 vm_map_unlock_read(map);
3912                 return (KERN_PROTECTION_FAILURE);
3913         }
3914         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3915             (entry->eflags & MAP_ENTRY_COW) &&
3916             (fault_type & VM_PROT_WRITE)) {
3917                 vm_map_unlock_read(map);
3918                 return (KERN_PROTECTION_FAILURE);
3919         }
3920         if ((fault_typea & VM_PROT_COPY) != 0 &&
3921             (entry->max_protection & VM_PROT_WRITE) == 0 &&
3922             (entry->eflags & MAP_ENTRY_COW) == 0) {
3923                 vm_map_unlock_read(map);
3924                 return (KERN_PROTECTION_FAILURE);
3925         }
3926
3927         /*
3928          * If this page is not pageable, we have to get it for all possible
3929          * accesses.
3930          */
3931         *wired = (entry->wired_count != 0);
3932         if (*wired)
3933                 fault_type = entry->protection;
3934         size = entry->end - entry->start;
3935         /*
3936          * If the entry was copy-on-write, we either copy now or demote access.
3937          */
3938         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3939                 /*
3940                  * If we want to write the page, we may as well handle that
3941                  * now since we've got the map locked.
3942                  *
3943                  * If we don't need to write the page, we just demote the
3944                  * permissions allowed.
3945                  */
3946                 if ((fault_type & VM_PROT_WRITE) != 0 ||
3947                     (fault_typea & VM_PROT_COPY) != 0) {
3948                         /*
3949                          * Make a new object, and place it in the object
3950                          * chain.  Note that no new references have appeared
3951                          * -- one just moved from the map to the new
3952                          * object.
3953                          */
3954                         if (vm_map_lock_upgrade(map))
3955                                 goto RetryLookup;
3956
3957                         if (entry->cred == NULL) {
3958                                 /*
3959                                  * The debugger owner is charged for
3960                                  * the memory.
3961                                  */
3962                                 cred = curthread->td_ucred;
3963                                 crhold(cred);
3964                                 if (!swap_reserve_by_cred(size, cred)) {
3965                                         crfree(cred);
3966                                         vm_map_unlock(map);
3967                                         return (KERN_RESOURCE_SHORTAGE);
3968                                 }
3969                                 entry->cred = cred;
3970                         }
3971                         vm_object_shadow(&entry->object.vm_object,
3972                             &entry->offset, size);
3973                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3974                         eobject = entry->object.vm_object;
3975                         if (eobject->cred != NULL) {
3976                                 /*
3977                                  * The object was not shadowed.
3978                                  */
3979                                 swap_release_by_cred(size, entry->cred);
3980                                 crfree(entry->cred);
3981                                 entry->cred = NULL;
3982                         } else if (entry->cred != NULL) {
3983                                 VM_OBJECT_WLOCK(eobject);
3984                                 eobject->cred = entry->cred;
3985                                 eobject->charge = size;
3986                                 VM_OBJECT_WUNLOCK(eobject);
3987                                 entry->cred = NULL;
3988                         }
3989
3990                         vm_map_lock_downgrade(map);
3991                 } else {
3992                         /*
3993                          * We're attempting to read a copy-on-write page --
3994                          * don't allow writes.
3995                          */
3996                         prot &= ~VM_PROT_WRITE;
3997                 }
3998         }
3999
4000         /*
4001          * Create an object if necessary.
4002          */
4003         if (entry->object.vm_object == NULL &&
4004             !map->system_map) {
4005                 if (vm_map_lock_upgrade(map))
4006                         goto RetryLookup;
4007                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4008                     atop(size));
4009                 entry->offset = 0;
4010                 if (entry->cred != NULL) {
4011                         VM_OBJECT_WLOCK(entry->object.vm_object);
4012                         entry->object.vm_object->cred = entry->cred;
4013                         entry->object.vm_object->charge = size;
4014                         VM_OBJECT_WUNLOCK(entry->object.vm_object);
4015                         entry->cred = NULL;
4016                 }
4017                 vm_map_lock_downgrade(map);
4018         }
4019
4020         /*
4021          * Return the object/offset from this entry.  If the entry was
4022          * copy-on-write or empty, it has been fixed up.
4023          */
4024         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4025         *object = entry->object.vm_object;
4026
4027         *out_prot = prot;
4028         return (KERN_SUCCESS);
4029 }
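
/*
 * A minimal usage sketch of the lookup/done pairing, assuming a
 * fault-handler style caller; the local names and the VM_PROT_READ
 * fault type are hypothetical.  vm_map_lookup() returns with the map
 * read-locked on success, and vm_map_lookup_done() releases that lock.
 *
 *	vm_map_t map = &curproc->p_vmspace->vm_map;
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	int rv;
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	... resolve the fault against (object, pindex), limited to "prot" ...
 *	vm_map_lookup_done(map, entry);
 */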
4030
4031 /*
4032  *      vm_map_lookup_locked:
4033  *
4034  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
4035  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4036  */
4037 int
4038 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
4039                      vm_offset_t vaddr,
4040                      vm_prot_t fault_typea,
4041                      vm_map_entry_t *out_entry, /* OUT */
4042                      vm_object_t *object,       /* OUT */
4043                      vm_pindex_t *pindex,       /* OUT */
4044                      vm_prot_t *out_prot,       /* OUT */
4045                      boolean_t *wired)          /* OUT */
4046 {
4047         vm_map_entry_t entry;
4048         vm_map_t map = *var_map;
4049         vm_prot_t prot;
4050         vm_prot_t fault_type = fault_typea;
4051
4052         /*
4053          * Lookup the faulting address.
4054          */
4055         if (!vm_map_lookup_entry(map, vaddr, out_entry))
4056                 return (KERN_INVALID_ADDRESS);
4057
4058         entry = *out_entry;
4059
4060         /*
4061          * Fail if the entry refers to a submap.
4062          */
4063         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4064                 return (KERN_FAILURE);
4065
4066         /*
4067          * Check whether this task is allowed to have this page.
4068          */
4069         prot = entry->protection;
4070         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4071         if ((fault_type & prot) != fault_type)
4072                 return (KERN_PROTECTION_FAILURE);
4073         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4074             (entry->eflags & MAP_ENTRY_COW) &&
4075             (fault_type & VM_PROT_WRITE))
4076                 return (KERN_PROTECTION_FAILURE);
4077
4078         /*
4079          * If this page is wired (not pageable), we must get it for all
4080          * of the accesses that the entry permits.
4081          */
4082         *wired = (entry->wired_count != 0);
4083         if (*wired)
4084                 fault_type = entry->protection;
4085
4086         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4087                 /*
4088                  * Fail if the entry was copy-on-write for a write fault.
4089                  */
4090                 if (fault_type & VM_PROT_WRITE)
4091                         return (KERN_FAILURE);
4092                 /*
4093                  * We're attempting to read a copy-on-write page --
4094                  * don't allow writes.
4095                  */
4096                 prot &= ~VM_PROT_WRITE;
4097         }
4098
4099         /*
4100          * Fail if an object would have to be created.
4101          */
4102         if (entry->object.vm_object == NULL && !map->system_map)
4103                 return (KERN_FAILURE);
4104
4105         /*
4106          * Return the object/offset from this entry.  If the entry was
4107          * copy-on-write or empty, it has been fixed up.
4108          */
4109         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4110         *object = entry->object.vm_object;
4111
4112         *out_prot = prot;
4113         return (KERN_SUCCESS);
4114 }
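
/*
 * A minimal usage sketch, assuming a caller that already holds the map
 * lock and cannot sleep; the names below are hypothetical.  When the
 * lookup would have to block or allocate (a submap, a shadow object for
 * a copy-on-write write fault, or a missing backing object), this
 * routine returns KERN_FAILURE and the caller can then fall back to the
 * blocking vm_map_lookup() path.
 *
 *	if (vm_map_lookup_locked(&map, vaddr, VM_PROT_WRITE, &entry,
 *	    &object, &pindex, &prot, &wired) != KERN_SUCCESS) {
 *		... drop the lock and retry with vm_map_lookup() ...
 *	}
 */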
4115
4116 /*
4117  *      vm_map_lookup_done:
4118  *
4119  *      Releases locks acquired by a vm_map_lookup
4120  *      (according to the handle returned by that lookup).
4121  */
4122 void
4123 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4124 {
4125         /*
4126          * Unlock the main-level map.
4127          */
4128         vm_map_unlock_read(map);
4129 }
4130
4131 #include "opt_ddb.h"
4132 #ifdef DDB
4133 #include <sys/kernel.h>
4134
4135 #include <ddb/ddb.h>
4136
4137 static void
4138 vm_map_print(vm_map_t map)
4139 {
4140         vm_map_entry_t entry;
4141
4142         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4143             (void *)map,
4144             (void *)map->pmap, map->nentries, map->timestamp);
4145
4146         db_indent += 2;
4147         for (entry = map->header.next; entry != &map->header;
4148             entry = entry->next) {
4149                 db_iprintf("map entry %p: start=%p, end=%p\n",
4150                     (void *)entry, (void *)entry->start, (void *)entry->end);
4151                 {
4152                         static char *inheritance_name[4] =
4153                         {"share", "copy", "none", "donate_copy"};
4154
4155                         db_iprintf(" prot=%x/%x/%s",
4156                             entry->protection,
4157                             entry->max_protection,
4158                             inheritance_name[(int)(unsigned char)entry->inheritance]);
4159                         if (entry->wired_count != 0)
4160                                 db_printf(", wired");
4161                 }
4162                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4163                         db_printf(", share=%p, offset=0x%jx\n",
4164                             (void *)entry->object.sub_map,
4165                             (uintmax_t)entry->offset);
4166                         if ((entry->prev == &map->header) ||
4167                             (entry->prev->object.sub_map !=
4168                                 entry->object.sub_map)) {
4169                                 db_indent += 2;
4170                                 vm_map_print((vm_map_t)entry->object.sub_map);
4171                                 db_indent -= 2;
4172                         }
4173                 } else {
4174                         if (entry->cred != NULL)
4175                                 db_printf(", ruid %d", entry->cred->cr_ruid);
4176                         db_printf(", object=%p, offset=0x%jx",
4177                             (void *)entry->object.vm_object,
4178                             (uintmax_t)entry->offset);
4179                         if (entry->object.vm_object && entry->object.vm_object->cred)
4180                                 db_printf(", obj ruid %d charge %jx",
4181                                     entry->object.vm_object->cred->cr_ruid,
4182                                     (uintmax_t)entry->object.vm_object->charge);
4183                         if (entry->eflags & MAP_ENTRY_COW)
4184                                 db_printf(", copy (%s)",
4185                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4186                         db_printf("\n");
4187
4188                         if ((entry->prev == &map->header) ||
4189                             (entry->prev->object.vm_object !=
4190                                 entry->object.vm_object)) {
4191                                 db_indent += 2;
4192                                 vm_object_print((db_expr_t)(intptr_t)
4193                                                 entry->object.vm_object,
4194                                                 0, 0, (char *)0);
4195                                 db_indent -= 2;
4196                         }
4197                 }
4198         }
4199         db_indent -= 2;
4200 }
4201
4202 DB_SHOW_COMMAND(map, map)
4203 {
4204
4205         if (!have_addr) {
4206                 db_printf("usage: show map <addr>\n");
4207                 return;
4208         }
4209         vm_map_print((vm_map_t)addr);
4210 }
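
/*
 * Example ddb usage (the address below is made up): "show map" takes the
 * address of a struct vm_map and prints its entries via vm_map_print().
 *
 *	db> show map 0xfffff80002a1c000
 */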
4211
4212 DB_SHOW_COMMAND(procvm, procvm)
4213 {
4214         struct proc *p;
4215
4216         if (have_addr) {
4217                 p = (struct proc *) addr;
4218         } else {
4219                 p = curproc;
4220         }
4221
4222         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4223             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4224             (void *)vmspace_pmap(p->p_vmspace));
4225
4226         vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4227 }
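
/*
 * Example ddb usage (the address below is made up): with no argument,
 * "show procvm" dumps the map of curproc; with an argument, the value is
 * treated as a struct proc pointer.
 *
 *	db> show procvm
 *	db> show procvm 0xfffff80002a10528
 */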
4228
4229 #endif /* DDB */