sys/vm/vm_map.c
1 /*
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by the University of
19  *      California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD$
65  */
66
67 /*
68  *      Virtual memory mapping module.
69  */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/ktr.h>
74 #include <sys/lock.h>
75 #include <sys/mutex.h>
76 #include <sys/proc.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/resourcevar.h>
81 #include <sys/sysent.h>
82 #include <sys/stdint.h>
83
84 #include <vm/vm.h>
85 #include <vm/vm_param.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_pager.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 #include <vm/uma.h>
95
96 /*
97  *      Virtual memory maps provide for the mapping, protection,
98  *      and sharing of virtual memory objects.  In addition,
99  *      this module provides for an efficient virtual copy of
100  *      memory from one map to another.
101  *
102  *      Synchronization is required prior to most operations.
103  *
104  *      Maps consist of an ordered doubly-linked list of simple
105  *      entries; a single hint is used to speed up lookups.
106  *
107  *      Since portions of maps are specified by start/end addresses,
108  *      which may not align with existing map entries, all
109  *      routines merely "clip" entries to these start/end values.
110  *      [That is, an entry is split into two, bordering at a
111  *      start or end value.]  Note that these clippings may not
112  *      always be necessary (as the two resulting entries are then
113  *      not changed); however, the clipping is done for convenience.
114  *
115  *      As mentioned above, virtual copy operations are performed
116  *      by copying VM object references from one map to
117  *      another, and then marking both regions as copy-on-write.
118  */
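/*
 *      Illustrative sketch (editor's addition, not part of the original
 *      source): clipping in action.  Suppose a map holds one entry covering
 *      [0x1000, 0x5000) and an operation such as vm_map_protect() is applied
 *      to [0x2000, 0x3000).  The entry is clipped twice:
 *
 *              before:  [0x1000 ----------------------------------- 0x5000)
 *              after:   [0x1000, 0x2000) [0x2000, 0x3000) [0x3000, 0x5000)
 *
 *      Only the middle entry is modified; the two neighbors keep their old
 *      attributes and may later be re-merged by vm_map_simplify_entry().
 */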
119
120 /*
121  *      vm_map_startup:
122  *
123  *      Initialize the vm_map module.  Must be called before
124  *      any other vm_map routines.
125  *
126  *      Map and entry structures are allocated from the general
127  *      purpose memory pool with some exceptions:
128  *
129  *      - The kernel map and kmem submap are allocated statically.
130  *      - Kernel map entries are allocated out of a static pool.
131  *
132  *      These restrictions are necessary since malloc() uses the
133  *      maps and requires map entries.
134  */
135
136 static uma_zone_t mapentzone;
137 static uma_zone_t kmapentzone;
138 static uma_zone_t mapzone;
139 static uma_zone_t vmspace_zone;
140 static struct vm_object kmapentobj;
141 static void vmspace_zinit(void *mem, int size);
142 static void vmspace_zfini(void *mem, int size);
143 static void vm_map_zinit(void *mem, int size);
144 static void vm_map_zfini(void *mem, int size);
145 static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
146
147 #ifdef INVARIANTS
148 static void vm_map_zdtor(void *mem, int size, void *arg);
149 static void vmspace_zdtor(void *mem, int size, void *arg);
150 #endif
151
152 void
153 vm_map_startup(void)
154 {
155         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
156 #ifdef INVARIANTS
157             vm_map_zdtor,
158 #else
159             NULL,
160 #endif
161             vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
162         uma_prealloc(mapzone, MAX_KMAP);
163         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), 
164             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
165             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
166         uma_prealloc(kmapentzone, MAX_KMAPENT);
167         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), 
168             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
169         uma_prealloc(mapentzone, MAX_MAPENT);
170 }
171
172 static void
173 vmspace_zfini(void *mem, int size)
174 {
175         struct vmspace *vm;
176
177         vm = (struct vmspace *)mem;
178
179         vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
180 }
181
182 static void
183 vmspace_zinit(void *mem, int size)
184 {
185         struct vmspace *vm;
186
187         vm = (struct vmspace *)mem;
188
189         vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map));
190 }
191
192 static void
193 vm_map_zfini(void *mem, int size)
194 {
195         vm_map_t map;
196
197         map = (vm_map_t)mem;
198
199         lockdestroy(&map->lock);
200 }
201
202 static void
203 vm_map_zinit(void *mem, int size)
204 {
205         vm_map_t map;
206
207         map = (vm_map_t)mem;
208         map->nentries = 0;
209         map->size = 0;
210         map->infork = 0;
211         lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
212 }
213
214 #ifdef INVARIANTS
215 static void
216 vmspace_zdtor(void *mem, int size, void *arg)
217 {
218         struct vmspace *vm;
219
220         vm = (struct vmspace *)mem;
221
222         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
223 }
224 static void
225 vm_map_zdtor(void *mem, int size, void *arg)
226 {
227         vm_map_t map;
228
229         map = (vm_map_t)mem;
230         KASSERT(map->nentries == 0,
231             ("map %p nentries == %d on free.", 
232             map, map->nentries));
233         KASSERT(map->size == 0,
234             ("map %p size == %lu on free.",
235             map, (unsigned long)map->size));
236         KASSERT(map->infork == 0,
237             ("map %p infork == %d on free.",
238             map, map->infork));
239 }
240 #endif  /* INVARIANTS */
241
242 /*
243  * Allocate a vmspace structure, including a vm_map and pmap,
244  * and initialize those structures.  The refcnt is set to 1.
245  * The remaining fields must be initialized by the caller.
246  */
247 struct vmspace *
248 vmspace_alloc(min, max)
249         vm_offset_t min, max;
250 {
251         struct vmspace *vm;
252
253         GIANT_REQUIRED;
254         vm = uma_zalloc(vmspace_zone, M_WAITOK);
255         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
256         _vm_map_init(&vm->vm_map, min, max);
257         pmap_pinit(vmspace_pmap(vm));
258         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
259         vm->vm_refcnt = 1;
260         vm->vm_shm = NULL;
261         vm->vm_exitingcnt = 0;
262         return (vm);
263 }
264
265 void
266 vm_init2(void) 
267 {
268         uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
269             (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8);
270         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
271 #ifdef INVARIANTS
272             vmspace_zdtor,
273 #else
274             NULL,
275 #endif
276             vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
277         pmap_init2();
278 }
279
280 static __inline void
281 vmspace_dofree(struct vmspace *vm)
282 {
283         CTR1(KTR_VM, "vmspace_free: %p", vm);
284         /*
285          * Lock the map, to wait out all other references to it.
286          * Delete all of the mappings and pages they hold, then call
287          * the pmap module to reclaim anything left.
288          */
289         vm_map_lock(&vm->vm_map);
290         (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
291             vm->vm_map.max_offset);
292         vm_map_unlock(&vm->vm_map);
293
294         pmap_release(vmspace_pmap(vm));
295         uma_zfree(vmspace_zone, vm);
296 }
297
298 void
299 vmspace_free(struct vmspace *vm)
300 {
301         GIANT_REQUIRED;
302
303         if (vm->vm_refcnt == 0)
304                 panic("vmspace_free: attempt to free already freed vmspace");
305
306         if (--vm->vm_refcnt == 0 && vm->vm_exitingcnt == 0)
307                 vmspace_dofree(vm);
308 }
309
310 void
311 vmspace_exitfree(struct proc *p)
312 {
313         struct vmspace *vm;
314
315         GIANT_REQUIRED;
316         vm = p->p_vmspace;
317         p->p_vmspace = NULL;
318
319         /*
320          * cleanup by parent process wait()ing on exiting child.  vm_refcnt
321          * may not be 0 (e.g. fork() and child exits without exec()ing).
322          * exitingcnt may increment above 0 and drop back down to zero
323          * several times while vm_refcnt is held non-zero.  vm_refcnt
324          * may also increment above 0 and drop back down to zero several 
325          * times while vm_exitingcnt is held non-zero.
326          * 
327          * The last wait on the exiting child's vmspace will clean up 
328          * the remainder of the vmspace.
329          */
330         if (--vm->vm_exitingcnt == 0 && vm->vm_refcnt == 0)
331                 vmspace_dofree(vm);
332 }
333
334 /*
335  * vmspace_swap_count() - count the approximate swap usage in pages for a
336  *                        vmspace.
337  *
338  *      Swap usage is determined by taking the proportional swap used by
339  *      VM objects backing the VM map.  To make up for fractional losses,
340  *      if the VM object has any swap use at all, the associated map entries
341  *      count for at least 1 swap page.
342  */
343 int
344 vmspace_swap_count(struct vmspace *vmspace)
345 {
346         vm_map_t map = &vmspace->vm_map;
347         vm_map_entry_t cur;
348         int count = 0;
349
350         vm_map_lock_read(map);
351         for (cur = map->header.next; cur != &map->header; cur = cur->next) {
352                 vm_object_t object;
353
354                 if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
355                     (object = cur->object.vm_object) != NULL &&
356                     object->type == OBJT_SWAP
357                 ) {
358                         int n = (cur->end - cur->start) / PAGE_SIZE;
359
360                         if (object->un_pager.swp.swp_bcount) {
361                                 count += object->un_pager.swp.swp_bcount *
362                                     SWAP_META_PAGES * n / object->size + 1;
363                         }
364                 }
365         }
366         vm_map_unlock_read(map);
367         return (count);
368 }
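/*
 *      Worked example (editor's sketch, values assumed for illustration):
 *      with SWAP_META_PAGES == 16, a map entry spanning n == 32 pages of a
 *      64-page OBJT_SWAP object with swp_bcount == 2 contributes
 *
 *              2 * 16 * 32 / 64 + 1 = 17
 *
 *      pages to the estimate.  The trailing "+ 1" is what makes any entry
 *      backed by an object with non-zero swap use count for at least one
 *      swap page.
 */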
369
370 void
371 _vm_map_lock(vm_map_t map, const char *file, int line)
372 {
373         int error;
374
375         if (map->system_map)
376                 GIANT_REQUIRED;
377         error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
378         KASSERT(error == 0, ("%s: failed to get lock", __func__));
379         map->timestamp++;
380 }
381
382 void
383 _vm_map_unlock(vm_map_t map, const char *file, int line)
384 {
385
386         lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
387 }
388
389 void
390 _vm_map_lock_read(vm_map_t map, const char *file, int line)
391 {
392         int error;
393
394         if (map->system_map)
395                 GIANT_REQUIRED;
396         error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
397         KASSERT(error == 0, ("%s: failed to get lock", __func__));
398 }
399
400 void
401 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
402 {
403
404         lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
405 }
406
407 int
408 _vm_map_trylock(vm_map_t map, const char *file, int line)
409 {
410         int error;
411
412         if (map->system_map)
413                 GIANT_REQUIRED;
414         error = lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread);
415         return (error == 0);
416 }
417
418 int
419 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
420 {
421
422         KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
423                 ("%s: lock not held", __func__));
424         map->timestamp++;
425         return (0);
426 }
427
428 void
429 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
430 {
431
432         KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
433                 ("%s: lock not held", __func__));
434 }
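/*
 *      Editor's note: callers are expected to use the vm_map_lock(),
 *      vm_map_unlock(), vm_map_lock_read(), vm_map_trylock(), etc. wrapper
 *      macros (presumably defined in <vm/vm_map.h>), which supply the file
 *      and line arguments used here for debugging; the leading-underscore
 *      functions above are not meant to be called directly.
 */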
435
436 /*
437  *      vm_map_unlock_and_wait:
438  */
439 int
440 vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait)
441 {
442         int retval;
443
444         mtx_lock(&Giant);
445         vm_map_unlock(map);
446         retval = tsleep(&map->root, PVM, "vmmapw", 0);
447         mtx_unlock(&Giant);
448         return (retval);
449 }
450
451 /*
452  *      vm_map_wakeup:
453  */
454 void
455 vm_map_wakeup(vm_map_t map)
456 {
457
458         /*
459          * Acquire and release Giant to prevent a wakeup() from being
460          * performed (and lost) between the vm_map_unlock() and the
461          * tsleep() in vm_map_unlock_and_wait().
462          */
463         mtx_lock(&Giant);
464         mtx_unlock(&Giant);
465         wakeup(&map->root);
466 }
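/*
 *      Protocol sketch (editor's addition): a thread that finds an entry
 *      marked MAP_ENTRY_IN_TRANSITION requests a wakeup and sleeps via
 *      vm_map_unlock_and_wait(); the thread that completes the transition
 *      is expected to call vm_map_wakeup().  The waiting side, taken from
 *      vm_map_unwire() below, looks like:
 *
 *              entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 *              (void) vm_map_unlock_and_wait(map, user_unwire);
 *              vm_map_lock(map);
 */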
467
468 long
469 vmspace_resident_count(struct vmspace *vmspace)
470 {
471         return pmap_resident_count(vmspace_pmap(vmspace));
472 }
473
474 /*
475  *      vm_map_create:
476  *
477  *      Creates and returns a new empty VM map with
478  *      the given physical map structure, and having
479  *      the given lower and upper address bounds.
480  */
481 vm_map_t
482 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
483 {
484         vm_map_t result;
485
486         result = uma_zalloc(mapzone, M_WAITOK);
487         CTR1(KTR_VM, "vm_map_create: %p", result);
488         _vm_map_init(result, min, max);
489         result->pmap = pmap;
490         return (result);
491 }
492
493 /*
494  * Initialize an existing vm_map structure
495  * such as that in the vmspace structure.
496  * The pmap is set elsewhere.
497  */
498 static void
499 _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
500 {
501
502         map->header.next = map->header.prev = &map->header;
503         map->needs_wakeup = FALSE;
504         map->system_map = 0;
505         map->min_offset = min;
506         map->max_offset = max;
507         map->first_free = &map->header;
508         map->root = NULL;
509         map->timestamp = 0;
510 }
511
512 void
513 vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
514 {
515         _vm_map_init(map, min, max);
516         lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
517 }
518
519 /*
520  *      vm_map_entry_dispose:   [ internal use only ]
521  *
522  *      Inverse of vm_map_entry_create.
523  */
524 static void
525 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
526 {
527         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
528 }
529
530 /*
531  *      vm_map_entry_create:    [ internal use only ]
532  *
533  *      Allocates a VM map entry for insertion.
534  *      No entry fields are filled in.
535  */
536 static vm_map_entry_t
537 vm_map_entry_create(vm_map_t map)
538 {
539         vm_map_entry_t new_entry;
540
541         if (map->system_map)
542                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
543         else
544                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
545         if (new_entry == NULL)
546                 panic("vm_map_entry_create: kernel resources exhausted");
547         return (new_entry);
548 }
549
550 /*
551  *      vm_map_entry_set_behavior:
552  *
553  *      Set the expected access behavior, either normal, random, or
554  *      sequential.
555  */
556 static __inline void
557 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
558 {
559         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
560             (behavior & MAP_ENTRY_BEHAV_MASK);
561 }
562
563 /*
564  *      vm_map_entry_splay:
565  *
566  *      Implements Sleator and Tarjan's top-down splay algorithm.  Returns
567  *      the vm_map_entry containing the given address.  If, however, that
568  *      address is not found in the vm_map, returns a vm_map_entry that is
569  *      adjacent to the address, coming before or after it.
570  */
571 static vm_map_entry_t
572 vm_map_entry_splay(vm_offset_t address, vm_map_entry_t root)
573 {
574         struct vm_map_entry dummy;
575         vm_map_entry_t lefttreemax, righttreemin, y;
576
577         if (root == NULL)
578                 return (root);
579         lefttreemax = righttreemin = &dummy;
580         for (;; root = y) {
581                 if (address < root->start) {
582                         if ((y = root->left) == NULL)
583                                 break;
584                         if (address < y->start) {
585                                 /* Rotate right. */
586                                 root->left = y->right;
587                                 y->right = root;
588                                 root = y;
589                                 if ((y = root->left) == NULL)
590                                         break;
591                         }
592                         /* Link into the new root's right tree. */
593                         righttreemin->left = root;
594                         righttreemin = root;
595                 } else if (address >= root->end) {
596                         if ((y = root->right) == NULL)
597                                 break;
598                         if (address >= y->end) {
599                                 /* Rotate left. */
600                                 root->right = y->left;
601                                 y->left = root;
602                                 root = y;
603                                 if ((y = root->right) == NULL)
604                                         break;
605                         }
606                         /* Link into the new root's left tree. */
607                         lefttreemax->right = root;
608                         lefttreemax = root;
609                 } else
610                         break;
611         }
612         /* Assemble the new root. */
613         lefttreemax->right = root->left;
614         righttreemin->left = root->right;
615         root->left = dummy.right;
616         root->right = dummy.left;
617         return (root);
618 }
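/*
 *      Editor's illustration: after vm_map_entry_splay(addr, map->root) the
 *      entry containing "addr" (or its nearest neighbor) becomes the new
 *      root, so a subsequent lookup of a nearby address touches only nodes
 *      near the top of the tree.  vm_map_lookup_entry() below exploits this
 *      by storing the splayed entry back into map->root.
 */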
619
620 /*
621  *      vm_map_entry_{un,}link:
622  *
623  *      Insert/remove entries from maps.
624  */
625 static void
626 vm_map_entry_link(vm_map_t map,
627                   vm_map_entry_t after_where,
628                   vm_map_entry_t entry)
629 {
630
631         CTR4(KTR_VM,
632             "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
633             map->nentries, entry, after_where);
634         map->nentries++;
635         entry->prev = after_where;
636         entry->next = after_where->next;
637         entry->next->prev = entry;
638         after_where->next = entry;
639
640         if (after_where != &map->header) {
641                 if (after_where != map->root)
642                         vm_map_entry_splay(after_where->start, map->root);
643                 entry->right = after_where->right;
644                 entry->left = after_where;
645                 after_where->right = NULL;
646         } else {
647                 entry->right = map->root;
648                 entry->left = NULL;
649         }
650         map->root = entry;
651 }
652
653 static void
654 vm_map_entry_unlink(vm_map_t map,
655                     vm_map_entry_t entry)
656 {
657         vm_map_entry_t next, prev, root;
658
659         if (entry != map->root)
660                 vm_map_entry_splay(entry->start, map->root);
661         if (entry->left == NULL)
662                 root = entry->right;
663         else {
664                 root = vm_map_entry_splay(entry->start, entry->left);
665                 root->right = entry->right;
666         }
667         map->root = root;
668
669         prev = entry->prev;
670         next = entry->next;
671         next->prev = prev;
672         prev->next = next;
673         map->nentries--;
674         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
675             map->nentries, entry);
676 }
677
678 /*
679  *      vm_map_lookup_entry:    [ internal use only ]
680  *
681  *      Finds the map entry containing (or
682  *      immediately preceding) the specified address
683  *      in the given map; the entry is returned
684  *      in the "entry" parameter.  The boolean
685  *      result indicates whether the address is
686  *      actually contained in the map.
687  */
688 boolean_t
689 vm_map_lookup_entry(
690         vm_map_t map,
691         vm_offset_t address,
692         vm_map_entry_t *entry)  /* OUT */
693 {
694         vm_map_entry_t cur;
695
696         cur = vm_map_entry_splay(address, map->root);
697         if (cur == NULL)
698                 *entry = &map->header;
699         else {
700                 map->root = cur;
701
702                 if (address >= cur->start) {
703                         *entry = cur;
704                         if (cur->end > address)
705                                 return (TRUE);
706                 } else
707                         *entry = cur->prev;
708         }
709         return (FALSE);
710 }
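/*
 *      Typical caller pattern (editor's sketch; compare vm_map_submap() and
 *      vm_map_protect() below).  The boolean result says whether "start"
 *      falls inside an existing entry; if so, that entry is clipped so work
 *      can begin exactly at "start", otherwise the following entry is used:
 *
 *              if (vm_map_lookup_entry(map, start, &entry))
 *                      vm_map_clip_start(map, entry, start);
 *              else
 *                      entry = entry->next;
 */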
711
712 /*
713  *      vm_map_insert:
714  *
715  *      Inserts the given whole VM object into the target
716  *      map at the specified address range.  The object's
717  *      size should match that of the address range.
718  *
719  *      Requires that the map be locked, and leaves it so.
720  *
721  *      If object is non-NULL, ref count must be bumped by caller
722  *      prior to making call to account for the new entry.
723  */
724 int
725 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
726               vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
727               int cow)
728 {
729         vm_map_entry_t new_entry;
730         vm_map_entry_t prev_entry;
731         vm_map_entry_t temp_entry;
732         vm_eflags_t protoeflags;
733
734         /*
735          * Check that the start and end points are not bogus.
736          */
737         if ((start < map->min_offset) || (end > map->max_offset) ||
738             (start >= end))
739                 return (KERN_INVALID_ADDRESS);
740
741         /*
742          * Find the entry prior to the proposed starting address; if it's part
743          * of an existing entry, this range is bogus.
744          */
745         if (vm_map_lookup_entry(map, start, &temp_entry))
746                 return (KERN_NO_SPACE);
747
748         prev_entry = temp_entry;
749
750         /*
751          * Assert that the next entry doesn't overlap the end point.
752          */
753         if ((prev_entry->next != &map->header) &&
754             (prev_entry->next->start < end))
755                 return (KERN_NO_SPACE);
756
757         protoeflags = 0;
758
759         if (cow & MAP_COPY_ON_WRITE)
760                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
761
762         if (cow & MAP_NOFAULT) {
763                 protoeflags |= MAP_ENTRY_NOFAULT;
764
765                 KASSERT(object == NULL,
766                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
767         }
768         if (cow & MAP_DISABLE_SYNCER)
769                 protoeflags |= MAP_ENTRY_NOSYNC;
770         if (cow & MAP_DISABLE_COREDUMP)
771                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
772
773         if (object) {
774                 /*
775                  * When object is non-NULL, it could be shared with another
776                  * process.  We have to set or clear OBJ_ONEMAPPING 
777                  * appropriately.
778                  */
779                 vm_object_lock(object);
780                 if ((object->ref_count > 1) || (object->shadow_count != 0)) {
781                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
782                 }
783                 vm_object_unlock(object);
784         }
785         else if ((prev_entry != &map->header) &&
786                  (prev_entry->eflags == protoeflags) &&
787                  (prev_entry->end == start) &&
788                  (prev_entry->wired_count == 0) &&
789                  ((prev_entry->object.vm_object == NULL) ||
790                   vm_object_coalesce(prev_entry->object.vm_object,
791                                      OFF_TO_IDX(prev_entry->offset),
792                                      (vm_size_t)(prev_entry->end - prev_entry->start),
793                                      (vm_size_t)(end - prev_entry->end)))) {
794                 /*
795                  * We were able to extend the object.  Determine if we
796                  * can extend the previous map entry to include the 
797                  * new range as well.
798                  */
799                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
800                     (prev_entry->protection == prot) &&
801                     (prev_entry->max_protection == max)) {
802                         map->size += (end - prev_entry->end);
803                         prev_entry->end = end;
804                         vm_map_simplify_entry(map, prev_entry);
805                         return (KERN_SUCCESS);
806                 }
807
808                 /*
809                  * If we can extend the object but cannot extend the
810                  * map entry, we have to create a new map entry.  We
811                  * must bump the ref count on the extended object to
812                  * account for it.  object may be NULL.
813                  */
814                 object = prev_entry->object.vm_object;
815                 offset = prev_entry->offset +
816                         (prev_entry->end - prev_entry->start);
817                 vm_object_reference(object);
818         }
819
820         /*
821          * NOTE: if conditionals fail, object can be NULL here.  This occurs
822          * in things like the buffer map where we manage kva but do not manage
823          * backing objects.
824          */
825
826         /*
827          * Create a new entry
828          */
829         new_entry = vm_map_entry_create(map);
830         new_entry->start = start;
831         new_entry->end = end;
832
833         new_entry->eflags = protoeflags;
834         new_entry->object.vm_object = object;
835         new_entry->offset = offset;
836         new_entry->avail_ssize = 0;
837
838         new_entry->inheritance = VM_INHERIT_DEFAULT;
839         new_entry->protection = prot;
840         new_entry->max_protection = max;
841         new_entry->wired_count = 0;
842
843         /*
844          * Insert the new entry into the list
845          */
846         vm_map_entry_link(map, prev_entry, new_entry);
847         map->size += new_entry->end - new_entry->start;
848
849         /*
850          * Update the free space hint
851          */
852         if ((map->first_free == prev_entry) &&
853             (prev_entry->end >= new_entry->start)) {
854                 map->first_free = new_entry;
855         }
856
857 #if 0
858         /*
859          * Temporarily removed to avoid MAP_STACK panic, due to
860          * MAP_STACK being a huge hack.  Will be added back in
861          * when MAP_STACK (and the user stack mapping) is fixed.
862          */
863         /*
864          * It may be possible to simplify the entry
865          */
866         vm_map_simplify_entry(map, new_entry);
867 #endif
868
869         if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
870                 mtx_lock(&Giant);
871                 pmap_object_init_pt(map->pmap, start,
872                                     object, OFF_TO_IDX(offset), end - start,
873                                     cow & MAP_PREFAULT_PARTIAL);
874                 mtx_unlock(&Giant);
875         }
876
877         return (KERN_SUCCESS);
878 }
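/*
 *      Caller-side sketch (editor's addition): when mapping an existing
 *      object, the reference count must be bumped before the call, as the
 *      comment above vm_map_insert() requires; a caller would typically
 *      release that reference itself if the insertion fails:
 *
 *              vm_object_reference(object);
 *              rv = vm_map_insert(map, object, offset, start, end,
 *                  prot, max, cow);
 *              if (rv != KERN_SUCCESS)
 *                      vm_object_deallocate(object);
 */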
879
880 /*
881  * Find sufficient space for `length' bytes in the given map, starting at
882  * `start'.  The map must be locked.  Returns 0 on success, 1 on no space.
883  */
884 int
885 vm_map_findspace(
886         vm_map_t map,
887         vm_offset_t start,
888         vm_size_t length,
889         vm_offset_t *addr)
890 {
891         vm_map_entry_t entry, next;
892         vm_offset_t end;
893
894         if (start < map->min_offset)
895                 start = map->min_offset;
896         if (start > map->max_offset)
897                 return (1);
898
899         /*
900          * Look for the first possible address; if there's already something
901          * at this address, we have to start after it.
902          */
903         if (start == map->min_offset) {
904                 if ((entry = map->first_free) != &map->header)
905                         start = entry->end;
906         } else {
907                 vm_map_entry_t tmp;
908
909                 if (vm_map_lookup_entry(map, start, &tmp))
910                         start = tmp->end;
911                 entry = tmp;
912         }
913
914         /*
915          * Look through the rest of the map, trying to fit a new region in the
916          * gap between existing regions, or after the very last region.
917          */
918         for (;; start = (entry = next)->end) {
919                 /*
920                  * Find the end of the proposed new region.  Be sure we didn't
921                  * go beyond the end of the map, or wrap around the address;
922                  * if so, we lose.  Otherwise, if this is the last entry, or
923                  * if the proposed new region fits before the next entry, we
924                  * win.
925                  */
926                 end = start + length;
927                 if (end > map->max_offset || end < start)
928                         return (1);
929                 next = entry->next;
930                 if (next == &map->header || next->start >= end)
931                         break;
932         }
933         *addr = start;
934         if (map == kernel_map) {
935                 vm_offset_t ksize;
936                 if ((ksize = round_page(start + length)) > kernel_vm_end) {
937                         mtx_lock(&Giant);
938                         pmap_growkernel(ksize);
939                         mtx_unlock(&Giant);
940                 }
941         }
942         return (0);
943 }
944
945 /*
946  *      vm_map_find finds an unallocated region in the target address
947  *      map with the given length.  The search is defined to be
948  *      first-fit from the specified address; the region found is
949  *      returned in the same parameter.
950  *
951  *      If object is non-NULL, ref count must be bumped by caller
952  *      prior to making call to account for the new entry.
953  */
954 int
955 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
956             vm_offset_t *addr,  /* IN/OUT */
957             vm_size_t length, boolean_t find_space, vm_prot_t prot,
958             vm_prot_t max, int cow)
959 {
960         vm_offset_t start;
961         int result, s = 0;
962
963         start = *addr;
964
965         if (map == kmem_map)
966                 s = splvm();
967
968         vm_map_lock(map);
969         if (find_space) {
970                 if (vm_map_findspace(map, start, length, addr)) {
971                         vm_map_unlock(map);
972                         if (map == kmem_map)
973                                 splx(s);
974                         return (KERN_NO_SPACE);
975                 }
976                 start = *addr;
977         }
978         result = vm_map_insert(map, object, offset,
979                 start, start + length, prot, max, cow);
980         vm_map_unlock(map);
981
982         if (map == kmem_map)
983                 splx(s);
984
985         return (result);
986 }
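/*
 *      Usage sketch (editor's illustration, not part of the original file):
 *      a caller needing "size" bytes of anonymous, pageable address space
 *      anywhere in a map might do:
 *
 *              vm_offset_t addr = vm_map_min(map);
 *
 *              if (vm_map_find(map, NULL, 0, &addr, size, TRUE,
 *                  VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)
 *                      return (ENOMEM);
 *
 *      On success *addr holds the start of the new mapping; because the
 *      object argument is NULL, no reference count needs to be bumped
 *      beforehand.
 */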
987
988 /*
989  *      vm_map_simplify_entry:
990  *
991  *      Simplify the given map entry by merging with either neighbor.  This
992  *      routine also has the ability to merge with both neighbors.
993  *
994  *      The map must be locked.
995  *
996  *      This routine guarantees that the passed entry remains valid (though
997  *      possibly extended).  When merging, this routine may delete one or
998  *      both neighbors.
999  */
1000 void
1001 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1002 {
1003         vm_map_entry_t next, prev;
1004         vm_size_t prevsize, esize;
1005
1006         if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
1007                 return;
1008
1009         prev = entry->prev;
1010         if (prev != &map->header) {
1011                 prevsize = prev->end - prev->start;
1012                 if ( (prev->end == entry->start) &&
1013                      (prev->object.vm_object == entry->object.vm_object) &&
1014                      (!prev->object.vm_object ||
1015                         (prev->offset + prevsize == entry->offset)) &&
1016                      (prev->eflags == entry->eflags) &&
1017                      (prev->protection == entry->protection) &&
1018                      (prev->max_protection == entry->max_protection) &&
1019                      (prev->inheritance == entry->inheritance) &&
1020                      (prev->wired_count == entry->wired_count)) {
1021                         if (map->first_free == prev)
1022                                 map->first_free = entry;
1023                         vm_map_entry_unlink(map, prev);
1024                         entry->start = prev->start;
1025                         entry->offset = prev->offset;
1026                         if (prev->object.vm_object)
1027                                 vm_object_deallocate(prev->object.vm_object);
1028                         vm_map_entry_dispose(map, prev);
1029                 }
1030         }
1031
1032         next = entry->next;
1033         if (next != &map->header) {
1034                 esize = entry->end - entry->start;
1035                 if ((entry->end == next->start) &&
1036                     (next->object.vm_object == entry->object.vm_object) &&
1037                      (!entry->object.vm_object ||
1038                         (entry->offset + esize == next->offset)) &&
1039                     (next->eflags == entry->eflags) &&
1040                     (next->protection == entry->protection) &&
1041                     (next->max_protection == entry->max_protection) &&
1042                     (next->inheritance == entry->inheritance) &&
1043                     (next->wired_count == entry->wired_count)) {
1044                         if (map->first_free == next)
1045                                 map->first_free = entry;
1046                         vm_map_entry_unlink(map, next);
1047                         entry->end = next->end;
1048                         if (next->object.vm_object)
1049                                 vm_object_deallocate(next->object.vm_object);
1050                         vm_map_entry_dispose(map, next);
1051                 }
1052         }
1053 }
1054 /*
1055  *      vm_map_clip_start:      [ internal use only ]
1056  *
1057  *      Asserts that the given entry begins at or after
1058  *      the specified address; if necessary,
1059  *      it splits the entry into two.
1060  */
1061 #define vm_map_clip_start(map, entry, startaddr) \
1062 { \
1063         if (startaddr > entry->start) \
1064                 _vm_map_clip_start(map, entry, startaddr); \
1065 }
1066
1067 /*
1068  *      This routine is called only when it is known that
1069  *      the entry must be split.
1070  */
1071 static void
1072 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1073 {
1074         vm_map_entry_t new_entry;
1075
1076         /*
1077          * Split off the front portion -- note that we must insert the new
1078          * entry BEFORE this one, so that this entry has the specified
1079          * starting address.
1080          */
1081         vm_map_simplify_entry(map, entry);
1082
1083         /*
1084          * If there is no object backing this entry, we might as well create
1085          * one now.  If we defer it, an object can get created after the map
1086          * is clipped, and individual objects will be created for the split-up
1087          * map.  This is a bit of a hack, but is also about the best place to
1088          * put this improvement.
1089          */
1090         if (entry->object.vm_object == NULL && !map->system_map) {
1091                 vm_object_t object;
1092                 object = vm_object_allocate(OBJT_DEFAULT,
1093                                 atop(entry->end - entry->start));
1094                 entry->object.vm_object = object;
1095                 entry->offset = 0;
1096         }
1097
1098         new_entry = vm_map_entry_create(map);
1099         *new_entry = *entry;
1100
1101         new_entry->end = start;
1102         entry->offset += (start - entry->start);
1103         entry->start = start;
1104
1105         vm_map_entry_link(map, entry->prev, new_entry);
1106
1107         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1108                 vm_object_reference(new_entry->object.vm_object);
1109         }
1110 }
1111
1112 /*
1113  *      vm_map_clip_end:        [ internal use only ]
1114  *
1115  *      Asserts that the given entry ends at or before
1116  *      the specified address; if necessary,
1117  *      it splits the entry into two.
1118  */
1119 #define vm_map_clip_end(map, entry, endaddr) \
1120 { \
1121         if ((endaddr) < (entry->end)) \
1122                 _vm_map_clip_end((map), (entry), (endaddr)); \
1123 }
1124
1125 /*
1126  *      This routine is called only when it is known that
1127  *      the entry must be split.
1128  */
1129 static void
1130 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1131 {
1132         vm_map_entry_t new_entry;
1133
1134         /*
1135          * If there is no object backing this entry, we might as well create
1136          * one now.  If we defer it, an object can get created after the map
1137          * is clipped, and individual objects will be created for the split-up
1138          * map.  This is a bit of a hack, but is also about the best place to
1139          * put this improvement.
1140          */
1141         if (entry->object.vm_object == NULL && !map->system_map) {
1142                 vm_object_t object;
1143                 object = vm_object_allocate(OBJT_DEFAULT,
1144                                 atop(entry->end - entry->start));
1145                 entry->object.vm_object = object;
1146                 entry->offset = 0;
1147         }
1148
1149         /*
1150          * Create a new entry and insert it AFTER the specified entry
1151          */
1152         new_entry = vm_map_entry_create(map);
1153         *new_entry = *entry;
1154
1155         new_entry->start = entry->end = end;
1156         new_entry->offset += (end - entry->start);
1157
1158         vm_map_entry_link(map, entry, new_entry);
1159
1160         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1161                 vm_object_reference(new_entry->object.vm_object);
1162         }
1163 }
1164
1165 /*
1166  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
1167  *
1168  *      Asserts that the starting and ending region
1169  *      addresses fall within the valid range of the map.
1170  */
1171 #define VM_MAP_RANGE_CHECK(map, start, end)             \
1172                 {                                       \
1173                 if (start < vm_map_min(map))            \
1174                         start = vm_map_min(map);        \
1175                 if (end > vm_map_max(map))              \
1176                         end = vm_map_max(map);          \
1177                 if (start > end)                        \
1178                         start = end;                    \
1179                 }
1180
1181 /*
1182  *      vm_map_submap:          [ kernel use only ]
1183  *
1184  *      Mark the given range as handled by a subordinate map.
1185  *
1186  *      This range must have been created with vm_map_find,
1187  *      and no other operations may have been performed on this
1188  *      range prior to calling vm_map_submap.
1189  *
1190  *      Only a limited number of operations can be performed
1191  *      within this range after calling vm_map_submap:
1192  *              vm_fault
1193  *      [Don't try vm_map_copy!]
1194  *
1195  *      To remove a submapping, one must first remove the
1196  *      range from the superior map, and then destroy the
1197  *      submap (if desired).  [Better yet, don't try it.]
1198  */
1199 int
1200 vm_map_submap(
1201         vm_map_t map,
1202         vm_offset_t start,
1203         vm_offset_t end,
1204         vm_map_t submap)
1205 {
1206         vm_map_entry_t entry;
1207         int result = KERN_INVALID_ARGUMENT;
1208
1209         vm_map_lock(map);
1210
1211         VM_MAP_RANGE_CHECK(map, start, end);
1212
1213         if (vm_map_lookup_entry(map, start, &entry)) {
1214                 vm_map_clip_start(map, entry, start);
1215         } else
1216                 entry = entry->next;
1217
1218         vm_map_clip_end(map, entry, end);
1219
1220         if ((entry->start == start) && (entry->end == end) &&
1221             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1222             (entry->object.vm_object == NULL)) {
1223                 entry->object.sub_map = submap;
1224                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1225                 result = KERN_SUCCESS;
1226         }
1227         vm_map_unlock(map);
1228
1229         return (result);
1230 }
1231
1232 /*
1233  *      vm_map_protect:
1234  *
1235  *      Sets the protection of the specified address
1236  *      region in the target map.  If "set_max" is
1237  *      specified, the maximum protection is to be set;
1238  *      otherwise, only the current protection is affected.
1239  */
1240 int
1241 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1242                vm_prot_t new_prot, boolean_t set_max)
1243 {
1244         vm_map_entry_t current;
1245         vm_map_entry_t entry;
1246
1247         vm_map_lock(map);
1248
1249         VM_MAP_RANGE_CHECK(map, start, end);
1250
1251         if (vm_map_lookup_entry(map, start, &entry)) {
1252                 vm_map_clip_start(map, entry, start);
1253         } else {
1254                 entry = entry->next;
1255         }
1256
1257         /*
1258          * Make a first pass to check for protection violations.
1259          */
1260         current = entry;
1261         while ((current != &map->header) && (current->start < end)) {
1262                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1263                         vm_map_unlock(map);
1264                         return (KERN_INVALID_ARGUMENT);
1265                 }
1266                 if ((new_prot & current->max_protection) != new_prot) {
1267                         vm_map_unlock(map);
1268                         return (KERN_PROTECTION_FAILURE);
1269                 }
1270                 current = current->next;
1271         }
1272
1273         /*
1274          * Go back and fix up protections. [Note that clipping is not
1275          * necessary the second time.]
1276          */
1277         current = entry;
1278         while ((current != &map->header) && (current->start < end)) {
1279                 vm_prot_t old_prot;
1280
1281                 vm_map_clip_end(map, current, end);
1282
1283                 old_prot = current->protection;
1284                 if (set_max)
1285                         current->protection =
1286                             (current->max_protection = new_prot) &
1287                             old_prot;
1288                 else
1289                         current->protection = new_prot;
1290
1291                 /*
1292                  * Update physical map if necessary. Worry about copy-on-write
1293                  * here -- CHECK THIS XXX
1294                  */
1295                 if (current->protection != old_prot) {
1296                         mtx_lock(&Giant);
1297                         vm_page_lock_queues();
1298 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1299                                                         VM_PROT_ALL)
1300                         pmap_protect(map->pmap, current->start,
1301                             current->end,
1302                             current->protection & MASK(current));
1303 #undef  MASK
1304                         vm_page_unlock_queues();
1305                         mtx_unlock(&Giant);
1306                 }
1307                 vm_map_simplify_entry(map, current);
1308                 current = current->next;
1309         }
1310         vm_map_unlock(map);
1311         return (KERN_SUCCESS);
1312 }
1313
1314 /*
1315  *      vm_map_madvise:
1316  *
1317  *      This routine traverses a process's map handling the madvise
1318  *      system call.  Advisories are classified as either those affecting
1319  *      the vm_map_entry structure, or those affecting the underlying
1320  *      objects.
1321  */
1322 int
1323 vm_map_madvise(
1324         vm_map_t map,
1325         vm_offset_t start, 
1326         vm_offset_t end,
1327         int behav)
1328 {
1329         vm_map_entry_t current, entry;
1330         int modify_map = 0;
1331
1332         /*
1333          * Some madvise calls directly modify the vm_map_entry, in which case
1334          * we need to use an exclusive lock on the map and we need to perform 
1335          * various clipping operations.  Otherwise we only need a read-lock
1336          * on the map.
1337          */
1338         switch(behav) {
1339         case MADV_NORMAL:
1340         case MADV_SEQUENTIAL:
1341         case MADV_RANDOM:
1342         case MADV_NOSYNC:
1343         case MADV_AUTOSYNC:
1344         case MADV_NOCORE:
1345         case MADV_CORE:
1346                 modify_map = 1;
1347                 vm_map_lock(map);
1348                 break;
1349         case MADV_WILLNEED:
1350         case MADV_DONTNEED:
1351         case MADV_FREE:
1352                 vm_map_lock_read(map);
1353                 break;
1354         default:
1355                 return (KERN_INVALID_ARGUMENT);
1356         }
1357
1358         /*
1359          * Locate starting entry and clip if necessary.
1360          */
1361         VM_MAP_RANGE_CHECK(map, start, end);
1362
1363         if (vm_map_lookup_entry(map, start, &entry)) {
1364                 if (modify_map)
1365                         vm_map_clip_start(map, entry, start);
1366         } else {
1367                 entry = entry->next;
1368         }
1369
1370         if (modify_map) {
1371                 /*
1372                  * madvise behaviors that are implemented in the vm_map_entry.
1373                  *
1374                  * We clip the vm_map_entry so that behavioral changes are
1375                  * limited to the specified address range.
1376                  */
1377                 for (current = entry;
1378                      (current != &map->header) && (current->start < end);
1379                      current = current->next
1380                 ) {
1381                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1382                                 continue;
1383
1384                         vm_map_clip_end(map, current, end);
1385
1386                         switch (behav) {
1387                         case MADV_NORMAL:
1388                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1389                                 break;
1390                         case MADV_SEQUENTIAL:
1391                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1392                                 break;
1393                         case MADV_RANDOM:
1394                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1395                                 break;
1396                         case MADV_NOSYNC:
1397                                 current->eflags |= MAP_ENTRY_NOSYNC;
1398                                 break;
1399                         case MADV_AUTOSYNC:
1400                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
1401                                 break;
1402                         case MADV_NOCORE:
1403                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1404                                 break;
1405                         case MADV_CORE:
1406                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1407                                 break;
1408                         default:
1409                                 break;
1410                         }
1411                         vm_map_simplify_entry(map, current);
1412                 }
1413                 vm_map_unlock(map);
1414         } else {
1415                 vm_pindex_t pindex;
1416                 int count;
1417
1418                 /*
1419                  * madvise behaviors that are implemented in the underlying
1420                  * vm_object.
1421                  *
1422                  * Since we don't clip the vm_map_entry, we have to clip
1423                  * the vm_object pindex and count.
1424                  */
1425                 for (current = entry;
1426                      (current != &map->header) && (current->start < end);
1427                      current = current->next
1428                 ) {
1429                         vm_offset_t useStart;
1430
1431                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1432                                 continue;
1433
1434                         pindex = OFF_TO_IDX(current->offset);
1435                         count = atop(current->end - current->start);
1436                         useStart = current->start;
1437
1438                         if (current->start < start) {
1439                                 pindex += atop(start - current->start);
1440                                 count -= atop(start - current->start);
1441                                 useStart = start;
1442                         }
1443                         if (current->end > end)
1444                                 count -= atop(current->end - end);
1445
1446                         if (count <= 0)
1447                                 continue;
1448
1449                         vm_object_madvise(current->object.vm_object,
1450                                           pindex, count, behav);
1451                         if (behav == MADV_WILLNEED) {
1452                                 mtx_lock(&Giant);
1453                                 pmap_object_init_pt(
1454                                     map->pmap, 
1455                                     useStart,
1456                                     current->object.vm_object,
1457                                     pindex, 
1458                                     (count << PAGE_SHIFT),
1459                                     MAP_PREFAULT_MADVISE
1460                                 );
1461                                 mtx_unlock(&Giant);
1462                         }
1463                 }
1464                 vm_map_unlock_read(map);
1465         }
1466         return (0);
1467 }       
1468
1469
1470 /*
1471  *      vm_map_inherit:
1472  *
1473  *      Sets the inheritance of the specified address
1474  *      range in the target map.  Inheritance
1475  *      affects how the map will be shared with
1476  *      child maps at the time of vm_map_fork.
1477  */
1478 int
1479 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1480                vm_inherit_t new_inheritance)
1481 {
1482         vm_map_entry_t entry;
1483         vm_map_entry_t temp_entry;
1484
1485         switch (new_inheritance) {
1486         case VM_INHERIT_NONE:
1487         case VM_INHERIT_COPY:
1488         case VM_INHERIT_SHARE:
1489                 break;
1490         default:
1491                 return (KERN_INVALID_ARGUMENT);
1492         }
1493         vm_map_lock(map);
1494         VM_MAP_RANGE_CHECK(map, start, end);
1495         if (vm_map_lookup_entry(map, start, &temp_entry)) {
1496                 entry = temp_entry;
1497                 vm_map_clip_start(map, entry, start);
1498         } else
1499                 entry = temp_entry->next;
1500         while ((entry != &map->header) && (entry->start < end)) {
1501                 vm_map_clip_end(map, entry, end);
1502                 entry->inheritance = new_inheritance;
1503                 vm_map_simplify_entry(map, entry);
1504                 entry = entry->next;
1505         }
1506         vm_map_unlock(map);
1507         return (KERN_SUCCESS);
1508 }
1509
1510 /*
1511  *      vm_map_unwire:
1512  *
1513  *      Implements both kernel and user unwiring.
1514  */
1515 int
1516 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
1517         boolean_t user_unwire)
1518 {
1519         vm_map_entry_t entry, first_entry, tmp_entry;
1520         vm_offset_t saved_start;
1521         unsigned int last_timestamp;
1522         int rv;
1523         boolean_t need_wakeup, result;
1524
1525         vm_map_lock(map);
1526         VM_MAP_RANGE_CHECK(map, start, end);
1527         if (!vm_map_lookup_entry(map, start, &first_entry)) {
1528                 vm_map_unlock(map);
1529                 return (KERN_INVALID_ADDRESS);
1530         }
1531         last_timestamp = map->timestamp;
1532         entry = first_entry;
1533         while (entry != &map->header && entry->start < end) {
1534                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1535                         /*
1536                          * We have not yet clipped the entry.
1537                          */
1538                         saved_start = (start >= entry->start) ? start :
1539                             entry->start;
1540                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1541                         if (vm_map_unlock_and_wait(map, user_unwire)) {
1542                                 /*
1543                                  * Allow interruption of user unwiring?
1544                                  */
1545                         }
1546                         vm_map_lock(map);
1547                         if (last_timestamp+1 != map->timestamp) {
1548                                 /*
1549                                  * Look again for the entry because the map was
1550                                  * modified while it was unlocked.
1551                                  * Specifically, the entry may have been
1552                                  * clipped, merged, or deleted.
1553                                  */
1554                                 if (!vm_map_lookup_entry(map, saved_start,
1555                                     &tmp_entry)) {
1556                                         if (saved_start == start) {
1557                                                 /*
1558                                                  * First_entry has been deleted.
1559                                                  */
1560                                                 vm_map_unlock(map);
1561                                                 return (KERN_INVALID_ADDRESS);
1562                                         }
1563                                         end = saved_start;
1564                                         rv = KERN_INVALID_ADDRESS;
1565                                         goto done;
1566                                 }
1567                                 if (entry == first_entry)
1568                                         first_entry = tmp_entry;
1569                                 else
1570                                         first_entry = NULL;
1571                                 entry = tmp_entry;
1572                         }
1573                         last_timestamp = map->timestamp;
1574                         continue;
1575                 }
1576                 vm_map_clip_start(map, entry, start);
1577                 vm_map_clip_end(map, entry, end);
1578                 /*
1579                  * Mark the entry in case the map lock is released.  (See
1580                  * above.)
1581                  */
1582                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1583                 /*
1584                  * Check the map for holes in the specified region.
1585                  */
1586                 if (entry->end < end && (entry->next == &map->header ||
1587                     entry->next->start > entry->end)) {
1588                         end = entry->end;
1589                         rv = KERN_INVALID_ADDRESS;
1590                         goto done;
1591                 }
1592                 /*
1593                  * Require that the entry is wired.
1594                  */
1595                 if (entry->wired_count == 0 || (user_unwire &&
1596                     (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)) {
1597                         end = entry->end;
1598                         rv = KERN_INVALID_ARGUMENT;
1599                         goto done;
1600                 }
1601                 entry = entry->next;
1602         }
1603         rv = KERN_SUCCESS;
1604 done:
1605         need_wakeup = FALSE;
1606         if (first_entry == NULL) {
1607                 result = vm_map_lookup_entry(map, start, &first_entry);
1608                 KASSERT(result, ("vm_map_unwire: lookup failed"));
1609         }
1610         entry = first_entry;
1611         while (entry != &map->header && entry->start < end) {
1612                 if (rv == KERN_SUCCESS) {
1613                         if (user_unwire)
1614                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1615                         entry->wired_count--;
1616                         if (entry->wired_count == 0) {
1617                                 /*
1618                                  * Retain the map lock.
1619                                  */
1620                                 vm_fault_unwire(map, entry->start, entry->end);
1621                         }
1622                 }
1623                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1624                         ("vm_map_unwire: in-transition flag missing"));
1625                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1626                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1627                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1628                         need_wakeup = TRUE;
1629                 }
1630                 vm_map_simplify_entry(map, entry);
1631                 entry = entry->next;
1632         }
1633         vm_map_unlock(map);
1634         if (need_wakeup)
1635                 vm_map_wakeup(map);
1636         return (rv);
1637 }
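
/*
 * Illustrative sketch (not part of the original source): an munlock(2)-style
 * path would request user unwiring of a page-aligned range; "map", "addr",
 * and "len" are hypothetical and the error mapping is simplified:
 *
 *	rv = vm_map_unwire(map, trunc_page(addr), round_page(addr + len),
 *	    TRUE);
 *	if (rv != KERN_SUCCESS)
 *		return (ENOMEM);
 */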
1638
1639 /*
1640  *      vm_map_wire:
1641  *
1642  *      Implements both kernel and user wiring.
1643  */
1644 int
1645 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
1646         boolean_t user_wire)
1647 {
1648         vm_map_entry_t entry, first_entry, tmp_entry;
1649         vm_offset_t saved_end, saved_start;
1650         unsigned int last_timestamp;
1651         int rv;
1652         boolean_t need_wakeup, result;
1653
1654         vm_map_lock(map);
1655         VM_MAP_RANGE_CHECK(map, start, end);
1656         if (!vm_map_lookup_entry(map, start, &first_entry)) {
1657                 vm_map_unlock(map);
1658                 return (KERN_INVALID_ADDRESS);
1659         }
1660         last_timestamp = map->timestamp;
1661         entry = first_entry;
1662         while (entry != &map->header && entry->start < end) {
1663                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1664                         /*
1665                          * We have not yet clipped the entry.
1666                          */
1667                         saved_start = (start >= entry->start) ? start :
1668                             entry->start;
1669                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1670                         if (vm_map_unlock_and_wait(map, user_wire)) {
1671                                 /*
1672                                  * Allow interruption of user wiring?
1673                                  */
1674                         }
1675                         vm_map_lock(map);
1676                         if (last_timestamp + 1 != map->timestamp) {
1677                                 /*
1678                                  * Look again for the entry because the map was
1679                                  * modified while it was unlocked.
1680                                  * Specifically, the entry may have been
1681                                  * clipped, merged, or deleted.
1682                                  */
1683                                 if (!vm_map_lookup_entry(map, saved_start,
1684                                     &tmp_entry)) {
1685                                         if (saved_start == start) {
1686                                                 /*
1687                                                  * first_entry has been deleted.
1688                                                  */
1689                                                 vm_map_unlock(map);
1690                                                 return (KERN_INVALID_ADDRESS);
1691                                         }
1692                                         end = saved_start;
1693                                         rv = KERN_INVALID_ADDRESS;
1694                                         goto done;
1695                                 }
1696                                 if (entry == first_entry)
1697                                         first_entry = tmp_entry;
1698                                 else
1699                                         first_entry = NULL;
1700                                 entry = tmp_entry;
1701                         }
1702                         last_timestamp = map->timestamp;
1703                         continue;
1704                 }
1705                 vm_map_clip_start(map, entry, start);
1706                 vm_map_clip_end(map, entry, end);
1707                 /*
1708                  * Mark the entry in case the map lock is released.  (See
1709                  * above.)
1710                  */
1711                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1712                 /*
1713                  * Wire the entry if it is not already wired.
1714                  */
1715                 if (entry->wired_count == 0) {
1716                         entry->wired_count++;
1717                         saved_start = entry->start;
1718                         saved_end = entry->end;
1719                         /*
1720                          * Release the map lock, relying on the in-transition
1721                          * mark.
1722                          */
1723                         vm_map_unlock(map);
1724                         rv = vm_fault_wire(map, saved_start, saved_end,
1725                             user_wire);
1726                         vm_map_lock(map);
1727                         if (last_timestamp + 1 != map->timestamp) {
1728                                 /*
1729                                  * Look again for the entry because the map was
1730                                  * modified while it was unlocked.  The entry
1731                                  * may have been clipped, but NOT merged or
1732                                  * deleted.
1733                                  */
1734                                 result = vm_map_lookup_entry(map, saved_start,
1735                                     &tmp_entry);
1736                                 KASSERT(result, ("vm_map_wire: lookup failed"));
1737                                 if (entry == first_entry)
1738                                         first_entry = tmp_entry;
1739                                 else
1740                                         first_entry = NULL;
1741                                 entry = tmp_entry;
1742                                 while (entry->end < saved_end) {
1743                                         if (rv != KERN_SUCCESS) {
1744                                                 KASSERT(entry->wired_count == 1,
1745                                                     ("vm_map_wire: bad count"));
1746                                                 entry->wired_count = -1;
1747                                         }
1748                                         entry = entry->next;
1749                                 }
1750                         }
1751                         last_timestamp = map->timestamp;
1752                         if (rv != KERN_SUCCESS) {
1753                                 KASSERT(entry->wired_count == 1,
1754                                     ("vm_map_wire: bad count"));
1755                                 /*
1756                                  * Assign an out-of-range value to represent
1757                                  * the failure to wire this entry.
1758                                  */
1759                                 entry->wired_count = -1;
1760                                 end = entry->end;
1761                                 goto done;
1762                         }
1763                 } else if (!user_wire ||
1764                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
1765                         entry->wired_count++;
1766                 }
1767                 /*
1768                  * Check the map for holes in the specified region.
1769                  */
1770                 if (entry->end < end && (entry->next == &map->header ||
1771                     entry->next->start > entry->end)) {
1772                         end = entry->end;
1773                         rv = KERN_INVALID_ADDRESS;
1774                         goto done;
1775                 }
1776                 entry = entry->next;
1777         }
1778         rv = KERN_SUCCESS;
1779 done:
1780         need_wakeup = FALSE;
1781         if (first_entry == NULL) {
1782                 result = vm_map_lookup_entry(map, start, &first_entry);
1783                 KASSERT(result, ("vm_map_wire: lookup failed"));
1784         }
1785         entry = first_entry;
1786         while (entry != &map->header && entry->start < end) {
1787                 if (rv == KERN_SUCCESS) {
1788                         if (user_wire)
1789                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
1790                 } else if (entry->wired_count == -1) {
1791                         /*
1792                          * Wiring failed on this entry.  Thus, unwiring is
1793                          * unnecessary.
1794                          */
1795                         entry->wired_count = 0;
1796                 } else {
1797                         if (!user_wire ||
1798                             (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
1799                                 entry->wired_count--;
1800                         if (entry->wired_count == 0) {
1801                                 /*
1802                                  * Retain the map lock.
1803                                  */
1804                                 vm_fault_unwire(map, entry->start, entry->end);
1805                         }
1806                 }
1807                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1808                         ("vm_map_wire: in-transition flag missing"));
1809                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1810                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1811                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1812                         need_wakeup = TRUE;
1813                 }
1814                 vm_map_simplify_entry(map, entry);
1815                 entry = entry->next;
1816         }
1817         vm_map_unlock(map);
1818         if (need_wakeup)
1819                 vm_map_wakeup(map);
1820         return (rv);
1821 }
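
/*
 * Illustrative sketch (not part of the original source): an mlock(2)-style
 * caller passes TRUE for user_wire, while a kernel caller wiring its own
 * mapping passes FALSE; "map", "addr", and "len" are hypothetical:
 *
 *	rv = vm_map_wire(map, trunc_page(addr), round_page(addr + len),
 *	    TRUE);
 *	if (rv != KERN_SUCCESS)
 *		return (ENOMEM);
 */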
1822
1823 /*
1824  * vm_map_clean
1825  *
1826  * Push any dirty cached pages in the address range to their pager.
1827  * If syncio is TRUE, dirty pages are written synchronously.
1828  * If invalidate is TRUE, any cached pages are freed as well.
1829  *
1830  * Returns an error if any part of the specified range is not mapped.
1831  */
1832 int
1833 vm_map_clean(
1834         vm_map_t map,
1835         vm_offset_t start,
1836         vm_offset_t end,
1837         boolean_t syncio,
1838         boolean_t invalidate)
1839 {
1840         vm_map_entry_t current;
1841         vm_map_entry_t entry;
1842         vm_size_t size;
1843         vm_object_t object;
1844         vm_ooffset_t offset;
1845
1846         GIANT_REQUIRED;
1847
1848         vm_map_lock_read(map);
1849         VM_MAP_RANGE_CHECK(map, start, end);
1850         if (!vm_map_lookup_entry(map, start, &entry)) {
1851                 vm_map_unlock_read(map);
1852                 return (KERN_INVALID_ADDRESS);
1853         }
1854         /*
1855          * Make a first pass to check for holes.
1856          */
1857         for (current = entry; current->start < end; current = current->next) {
1858                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1859                         vm_map_unlock_read(map);
1860                         return (KERN_INVALID_ARGUMENT);
1861                 }
1862                 if (end > current->end &&
1863                     (current->next == &map->header ||
1864                         current->end != current->next->start)) {
1865                         vm_map_unlock_read(map);
1866                         return (KERN_INVALID_ADDRESS);
1867                 }
1868         }
1869
1870         if (invalidate) {
1871                 vm_page_lock_queues();
1872                 pmap_remove(map->pmap, start, end);
1873                 vm_page_unlock_queues();
1874         }
1875         /*
1876          * Make a second pass, cleaning/uncaching pages from the indicated
1877          * objects as we go.
1878          */
1879         for (current = entry; current->start < end; current = current->next) {
1880                 offset = current->offset + (start - current->start);
1881                 size = (end <= current->end ? end : current->end) - start;
1882                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1883                         vm_map_t smap;
1884                         vm_map_entry_t tentry;
1885                         vm_size_t tsize;
1886
1887                         smap = current->object.sub_map;
1888                         vm_map_lock_read(smap);
1889                         (void) vm_map_lookup_entry(smap, offset, &tentry);
1890                         tsize = tentry->end - offset;
1891                         if (tsize < size)
1892                                 size = tsize;
1893                         object = tentry->object.vm_object;
1894                         offset = tentry->offset + (offset - tentry->start);
1895                         vm_map_unlock_read(smap);
1896                 } else {
1897                         object = current->object.vm_object;
1898                 }
1899                 /*
1900                  * Note that there is absolutely no sense in writing out
1901                  * anonymous objects, so we track down the vnode object
1902                  * to write out.
1903                  * We invalidate (remove) all pages from the address space
1904                  * anyway, for semantic correctness.
1905                  *
1906                  * note: certain anonymous maps, such as MAP_NOSYNC maps,
1907                  * may start out with a NULL object.
1908                  */
1909                 while (object && object->backing_object) {
1910                         object = object->backing_object;
1911                         offset += object->backing_object_offset;
1912                         if (object->size < OFF_TO_IDX(offset + size))
1913                                 size = IDX_TO_OFF(object->size) - offset;
1914                 }
1915                 if (object && (object->type == OBJT_VNODE) && 
1916                     (current->protection & VM_PROT_WRITE)) {
1917                         /*
1918                          * Flush pages if writing is allowed, invalidate them
1919                          * if invalidation requested.  Pages undergoing I/O
1920                          * will be ignored by vm_object_page_remove().
1921                          *
1922                          * We cannot lock the vnode and then wait for paging
1923                          * to complete without deadlocking against vm_fault.
1924                          * Instead we simply call vm_object_page_remove() and
1925                          * allow it to block internally on a page-by-page 
1926                          * basis when it encounters pages undergoing async 
1927                          * I/O.
1928                          */
1929                         int flags;
1930
1931                         vm_object_reference(object);
1932                         vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
1933                         flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
1934                         flags |= invalidate ? OBJPC_INVAL : 0;
1935                         vm_object_page_clean(object,
1936                             OFF_TO_IDX(offset),
1937                             OFF_TO_IDX(offset + size + PAGE_MASK),
1938                             flags);
1939                         VOP_UNLOCK(object->handle, 0, curthread);
1940                         vm_object_deallocate(object);
1941                 }
1942                 if (object && invalidate &&
1943                     ((object->type == OBJT_VNODE) ||
1944                      (object->type == OBJT_DEVICE))) {
1945                         vm_object_reference(object);
1946                         vm_object_lock(object);
1947                         vm_object_page_remove(object,
1948                             OFF_TO_IDX(offset),
1949                             OFF_TO_IDX(offset + size + PAGE_MASK),
1950                             FALSE);
1951                         vm_object_unlock(object);
1952                         vm_object_deallocate(object);
1953                 }
1954                 start += size;
1955         }
1956
1957         vm_map_unlock_read(map);
1958         return (KERN_SUCCESS);
1959 }
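
/*
 * Illustrative sketch (not part of the original source): an msync(2)-style
 * caller flushing a range synchronously and discarding cached pages would
 * pass TRUE for both syncio and invalidate; "map", "addr", and "len" are
 * hypothetical:
 *
 *	rv = vm_map_clean(map, trunc_page(addr), round_page(addr + len),
 *	    TRUE, TRUE);
 *	if (rv != KERN_SUCCESS)
 *		return (EINVAL);
 */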
1960
1961 /*
1962  *      vm_map_entry_unwire:    [ internal use only ]
1963  *
1964  *      Make the region specified by this entry pageable.
1965  *
1966  *      The map in question should be locked.
1967  *      [This is the reason for this routine's existence.]
1968  */
1969 static void 
1970 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
1971 {
1972         vm_fault_unwire(map, entry->start, entry->end);
1973         entry->wired_count = 0;
1974 }
1975
1976 /*
1977  *      vm_map_entry_delete:    [ internal use only ]
1978  *
1979  *      Deallocate the given entry from the target map.
1980  */
1981 static void
1982 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
1983 {
1984         vm_map_entry_unlink(map, entry);
1985         map->size -= entry->end - entry->start;
1986
1987         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1988                 vm_object_deallocate(entry->object.vm_object);
1989         }
1990
1991         vm_map_entry_dispose(map, entry);
1992 }
1993
1994 /*
1995  *      vm_map_delete:  [ internal use only ]
1996  *
1997  *      Deallocates the given address range from the target
1998  *      map.
1999  */
2000 int
2001 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2002 {
2003         vm_object_t object;
2004         vm_map_entry_t entry;
2005         vm_map_entry_t first_entry;
2006
2007         /*
2008          * Find the start of the region, and clip it
2009          */
2010         if (!vm_map_lookup_entry(map, start, &first_entry))
2011                 entry = first_entry->next;
2012         else {
2013                 entry = first_entry;
2014                 vm_map_clip_start(map, entry, start);
2015         }
2016
2017         /*
2018          * Save the free space hint
2019          */
2020         if (entry == &map->header) {
2021                 map->first_free = &map->header;
2022         } else if (map->first_free->start >= start) {
2023                 map->first_free = entry->prev;
2024         }
2025
2026         /*
2027          * Step through all entries in this region
2028          */
2029         while ((entry != &map->header) && (entry->start < end)) {
2030                 vm_map_entry_t next;
2031                 vm_offset_t s, e;
2032                 vm_pindex_t offidxstart, offidxend, count;
2033
2034                 /*
2035                  * Wait for wiring or unwiring of an entry to complete.
2036                  */
2037                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) {
2038                         unsigned int last_timestamp;
2039                         vm_offset_t saved_start;
2040                         vm_map_entry_t tmp_entry;
2041
2042                         saved_start = entry->start;
2043                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2044                         last_timestamp = map->timestamp;
2045                         (void) vm_map_unlock_and_wait(map, FALSE);
2046                         vm_map_lock(map);
2047                         if (last_timestamp + 1 != map->timestamp) {
2048                                 /*
2049                                  * Look again for the entry because the map was
2050                                  * modified while it was unlocked.
2051                                  * Specifically, the entry may have been
2052                                  * clipped, merged, or deleted.
2053                                  */
2054                                 if (!vm_map_lookup_entry(map, saved_start,
2055                                                          &tmp_entry))
2056                                         entry = tmp_entry->next;
2057                                 else {
2058                                         entry = tmp_entry;
2059                                         vm_map_clip_start(map, entry,
2060                                                           saved_start);
2061                                 }
2062                         }
2063                         continue;
2064                 }
2065                 vm_map_clip_end(map, entry, end);
2066
2067                 s = entry->start;
2068                 e = entry->end;
2069                 next = entry->next;
2070
2071                 offidxstart = OFF_TO_IDX(entry->offset);
2072                 count = OFF_TO_IDX(e - s);
2073                 object = entry->object.vm_object;
2074
2075                 /*
2076                  * Unwire before removing addresses from the pmap; otherwise,
2077                  * unwiring will put the entries back in the pmap.
2078                  */
2079                 if (entry->wired_count != 0) {
2080                         vm_map_entry_unwire(map, entry);
2081                 }
2082
2083                 offidxend = offidxstart + count;
2084
2085                 if ((object == kernel_object) || (object == kmem_object)) {
2086                         vm_object_lock(object);
2087                         vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2088                         vm_object_unlock(object);
2089                 } else {
2090                         vm_object_lock(object);
2091                         vm_page_lock_queues();
2092                         pmap_remove(map->pmap, s, e);
2093                         vm_page_unlock_queues();
2094                         if (object != NULL &&
2095                             object->ref_count != 1 &&
2096                             (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
2097                             (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2098                                 vm_object_collapse(object);
2099                                 vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2100                                 if (object->type == OBJT_SWAP) {
2101                                         swap_pager_freespace(object, offidxstart, count);
2102                                 }
2103                                 if (offidxend >= object->size &&
2104                                     offidxstart < object->size) {
2105                                         object->size = offidxstart;
2106                                 }
2107                         }
2108                         vm_object_unlock(object);
2109                 }
2110
2111                 /*
2112                  * Delete the entry (which may delete the object) only after
2113                  * removing all pmap entries pointing to its pages.
2114                  * (Otherwise, its page frames may be reallocated, and any
2115                  * modify bits will be set in the wrong object!)
2116                  */
2117                 vm_map_entry_delete(map, entry);
2118                 entry = next;
2119         }
2120         return (KERN_SUCCESS);
2121 }
2122
2123 /*
2124  *      vm_map_remove:
2125  *
2126  *      Remove the given address range from the target map.
2127  *      This is the exported form of vm_map_delete.
2128  */
2129 int
2130 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2131 {
2132         int result, s = 0;
2133
2134         if (map == kmem_map)
2135                 s = splvm();
2136
2137         vm_map_lock(map);
2138         VM_MAP_RANGE_CHECK(map, start, end);
2139         result = vm_map_delete(map, start, end);
2140         vm_map_unlock(map);
2141
2142         if (map == kmem_map)
2143                 splx(s);
2144
2145         return (result);
2146 }
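
/*
 * Illustrative sketch (not part of the original source): an munmap(2)-style
 * caller tears down a page-aligned range with vm_map_remove(); "map",
 * "addr", and "len" are hypothetical:
 *
 *	(void) vm_map_remove(map, trunc_page(addr),
 *	    round_page(addr + len));
 */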
2147
2148 /*
2149  *      vm_map_check_protection:
2150  *
2151  *      Assert that the target map allows the specified
2152  *      privilege on the entire address region given.
2153  *      The entire region must be allocated.
2154  */
2155 boolean_t
2156 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2157                         vm_prot_t protection)
2158 {
2159         vm_map_entry_t entry;
2160         vm_map_entry_t tmp_entry;
2161
2162         vm_map_lock_read(map);
2163         if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
2164                 vm_map_unlock_read(map);
2165                 return (FALSE);
2166         }
2167         entry = tmp_entry;
2168
2169         while (start < end) {
2170                 if (entry == &map->header) {
2171                         vm_map_unlock_read(map);
2172                         return (FALSE);
2173                 }
2174                 /*
2175                  * No holes allowed!
2176                  */
2177                 if (start < entry->start) {
2178                         vm_map_unlock_read(map);
2179                         return (FALSE);
2180                 }
2181                 /*
2182                  * Check protection associated with entry.
2183                  */
2184                 if ((entry->protection & protection) != protection) {
2185                         vm_map_unlock_read(map);
2186                         return (FALSE);
2187                 }
2188                 /* go to next entry */
2189                 start = entry->end;
2190                 entry = entry->next;
2191         }
2192         vm_map_unlock_read(map);
2193         return (TRUE);
2194 }
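
/*
 * Illustrative sketch (not part of the original source): before operating on
 * a user-supplied buffer, a caller could verify that the entire range is
 * mapped with the needed access; "map", "addr", and "len" are hypothetical:
 *
 *	if (!vm_map_check_protection(map, trunc_page(addr),
 *	    round_page(addr + len), VM_PROT_READ | VM_PROT_WRITE))
 *		return (EFAULT);
 */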
2195
2196 /*
2197  *      vm_map_copy_entry:
2198  *
2199  *      Copies the contents of the source entry to the destination
2200  *      entry.  The entries *must* be aligned properly.
2201  */
2202 static void
2203 vm_map_copy_entry(
2204         vm_map_t src_map,
2205         vm_map_t dst_map,
2206         vm_map_entry_t src_entry, 
2207         vm_map_entry_t dst_entry)
2208 {
2209         vm_object_t src_object;
2210
2211         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
2212                 return;
2213
2214         if (src_entry->wired_count == 0) {
2215
2216                 /*
2217                  * If the source entry is marked needs_copy, it is already
2218                  * write-protected.
2219                  */
2220                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2221                         vm_page_lock_queues();
2222                         pmap_protect(src_map->pmap,
2223                             src_entry->start,
2224                             src_entry->end,
2225                             src_entry->protection & ~VM_PROT_WRITE);
2226                         vm_page_unlock_queues();
2227                 }
2228
2229                 /*
2230                  * Make a copy of the object.
2231                  */
2232                 if ((src_object = src_entry->object.vm_object) != NULL) {
2233
2234                         if ((src_object->handle == NULL) &&
2235                                 (src_object->type == OBJT_DEFAULT ||
2236                                  src_object->type == OBJT_SWAP)) {
2237                                 vm_object_collapse(src_object);
2238                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2239                                         vm_object_split(src_entry);
2240                                         src_object = src_entry->object.vm_object;
2241                                 }
2242                         }
2243
2244                         vm_object_reference(src_object);
2245                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2246                         dst_entry->object.vm_object = src_object;
2247                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2248                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2249                         dst_entry->offset = src_entry->offset;
2250                 } else {
2251                         dst_entry->object.vm_object = NULL;
2252                         dst_entry->offset = 0;
2253                 }
2254
2255                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2256                     dst_entry->end - dst_entry->start, src_entry->start);
2257         } else {
2258                 /*
2259                  * Of course, wired down pages can't be set copy-on-write.
2260                  * Cause wired pages to be copied into the new map by
2261                  * simulating faults (the new pages are pageable)
2262                  */
2263                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
2264         }
2265 }
2266
2267 /*
2268  * vmspace_fork:
2269  * Create a new process vmspace structure and vm_map
2270  * based on those of an existing process.  The new map
2271  * is based on the old map, according to the inheritance
2272  * values on the regions in that map.
2273  *
2274  * The source map must not be locked.
2275  */
2276 struct vmspace *
2277 vmspace_fork(struct vmspace *vm1)
2278 {
2279         struct vmspace *vm2;
2280         vm_map_t old_map = &vm1->vm_map;
2281         vm_map_t new_map;
2282         vm_map_entry_t old_entry;
2283         vm_map_entry_t new_entry;
2284         vm_object_t object;
2285
2286         GIANT_REQUIRED;
2287
2288         vm_map_lock(old_map);
2289         old_map->infork = 1;
2290
2291         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
2292         bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
2293             (caddr_t) &vm1->vm_endcopy - (caddr_t) &vm1->vm_startcopy);
2294         new_map = &vm2->vm_map; /* XXX */
2295         new_map->timestamp = 1;
2296
2297         old_entry = old_map->header.next;
2298
2299         while (old_entry != &old_map->header) {
2300                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
2301                         panic("vm_map_fork: encountered a submap");
2302
2303                 switch (old_entry->inheritance) {
2304                 case VM_INHERIT_NONE:
2305                         break;
2306
2307                 case VM_INHERIT_SHARE:
2308                         /*
2309                          * Clone the entry, creating the shared object if necessary.
2310                          */
2311                         object = old_entry->object.vm_object;
2312                         if (object == NULL) {
2313                                 object = vm_object_allocate(OBJT_DEFAULT,
2314                                         atop(old_entry->end - old_entry->start));
2315                                 old_entry->object.vm_object = object;
2316                                 old_entry->offset = (vm_offset_t) 0;
2317                         }
2318
2319                         /*
2320                          * Add the reference before calling vm_object_shadow
2321                          * to ensure that a shadow object is created.
2322                          */
2323                         vm_object_reference(object);
2324                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
2325                                 vm_object_shadow(&old_entry->object.vm_object,
2326                                         &old_entry->offset,
2327                                         atop(old_entry->end - old_entry->start));
2328                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
2329                                 /* Transfer the second reference too. */
2330                                 vm_object_reference(
2331                                     old_entry->object.vm_object);
2332                                 vm_object_deallocate(object);
2333                                 object = old_entry->object.vm_object;
2334                         }
2335                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
2336
2337                         /*
2338                          * Clone the entry, referencing the shared object.
2339                          */
2340                         new_entry = vm_map_entry_create(new_map);
2341                         *new_entry = *old_entry;
2342                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2343                         new_entry->wired_count = 0;
2344
2345                         /*
2346                          * Insert the entry into the new map -- we know we're
2347                          * inserting at the end of the new map.
2348                          */
2349                         vm_map_entry_link(new_map, new_map->header.prev,
2350                             new_entry);
2351
2352                         /*
2353                          * Update the physical map
2354                          */
2355                         pmap_copy(new_map->pmap, old_map->pmap,
2356                             new_entry->start,
2357                             (old_entry->end - old_entry->start),
2358                             old_entry->start);
2359                         break;
2360
2361                 case VM_INHERIT_COPY:
2362                         /*
2363                          * Clone the entry and link into the map.
2364                          */
2365                         new_entry = vm_map_entry_create(new_map);
2366                         *new_entry = *old_entry;
2367                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2368                         new_entry->wired_count = 0;
2369                         new_entry->object.vm_object = NULL;
2370                         vm_map_entry_link(new_map, new_map->header.prev,
2371                             new_entry);
2372                         vm_map_copy_entry(old_map, new_map, old_entry,
2373                             new_entry);
2374                         break;
2375                 }
2376                 old_entry = old_entry->next;
2377         }
2378
2379         new_map->size = old_map->size;
2380         old_map->infork = 0;
2381         vm_map_unlock(old_map);
2382
2383         return (vm2);
2384 }
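
/*
 * Illustrative sketch (not part of the original source): a fork path
 * conceptually duplicates the parent's vmspace and installs the copy in
 * the child; "p1" (parent) and "p2" (child) are hypothetical:
 *
 *	p2->p_vmspace = vmspace_fork(p1->p_vmspace);
 */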
2385
2386 int
2387 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
2388               vm_prot_t prot, vm_prot_t max, int cow)
2389 {
2390         vm_map_entry_t prev_entry;
2391         vm_map_entry_t new_stack_entry;
2392         vm_size_t      init_ssize;
2393         int            rv;
2394
2395         if (addrbos < vm_map_min(map))
2396                 return (KERN_NO_SPACE);
2397
2398         if (max_ssize < sgrowsiz)
2399                 init_ssize = max_ssize;
2400         else
2401                 init_ssize = sgrowsiz;
2402
2403         vm_map_lock(map);
2404
2405         /* If addr is already mapped, no go */
2406         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
2407                 vm_map_unlock(map);
2408                 return (KERN_NO_SPACE);
2409         }
2410
2411         /* If we would blow our VMEM resource limit, no go */
2412         if (map->size + init_ssize >
2413             curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
2414                 vm_map_unlock(map);
2415                 return (KERN_NO_SPACE);
2416         }
2417
2418         /* If we can't accommodate max_ssize in the current mapping,
2419          * no go.  However, we need to be aware that subsequent user
2420          * mappings might map into the space we have reserved for
2421          * stack, and currently this space is not protected.  
2422          * 
2423          * Hopefully we will at least detect this condition 
2424          * when we try to grow the stack.
2425          */
2426         if ((prev_entry->next != &map->header) &&
2427             (prev_entry->next->start < addrbos + max_ssize)) {
2428                 vm_map_unlock(map);
2429                 return (KERN_NO_SPACE);
2430         }
2431
2432         /* We initially map a stack of only init_ssize.  We will
2433          * grow as needed later.  Since this is to be a grow 
2434          * down stack, we map at the top of the range.
2435          *
2436          * Note: we would normally expect prot and max to be
2437          * VM_PROT_ALL, and cow to be 0.  Possibly we should
2438          * eliminate these as input parameters, and just
2439          * pass these values here in the insert call.
2440          */
2441         rv = vm_map_insert(map, NULL, 0, addrbos + max_ssize - init_ssize,
2442                            addrbos + max_ssize, prot, max, cow);
2443
2444         /* Now set the avail_ssize amount */
2445         if (rv == KERN_SUCCESS) {
2446                 if (prev_entry != &map->header)
2447                         vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize);
2448                 new_stack_entry = prev_entry->next;
2449                 if (new_stack_entry->end   != addrbos + max_ssize ||
2450                     new_stack_entry->start != addrbos + max_ssize - init_ssize)
2451                         panic ("Bad entry start/end for new stack entry");
2452                 else 
2453                         new_stack_entry->avail_ssize = max_ssize - init_ssize;
2454         }
2455
2456         vm_map_unlock(map);
2457         return (rv);
2458 }
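
/*
 * Illustrative sketch (not part of the original source): an exec-time caller
 * reserving a grow-down user stack of at most "maxssize" bytes ending at
 * "stacktop" might call; both names are hypothetical:
 *
 *	rv = vm_map_stack(map, stacktop - maxssize, maxssize,
 *	    VM_PROT_ALL, VM_PROT_ALL, 0);
 */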
2459
2460 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
2461  * desired address is already mapped, or if we successfully grow
2462  * the stack.  Also returns KERN_SUCCESS if addr is outside the
2463  * stack range (this is strange, but preserves compatibility with
2464  * the grow function in vm_machdep.c).
2465  */
2466 int
2467 vm_map_growstack (struct proc *p, vm_offset_t addr)
2468 {
2469         vm_map_entry_t prev_entry;
2470         vm_map_entry_t stack_entry;
2471         vm_map_entry_t new_stack_entry;
2472         struct vmspace *vm = p->p_vmspace;
2473         vm_map_t map = &vm->vm_map;
2474         vm_offset_t    end;
2475         int      grow_amount;
2476         int      rv;
2477         int      is_procstack;
2478
2479         GIANT_REQUIRED;
2480         
2481 Retry:
2482         vm_map_lock_read(map);
2483
2484         /* If addr is already in the entry range, no need to grow.*/
2485         if (vm_map_lookup_entry(map, addr, &prev_entry)) {
2486                 vm_map_unlock_read(map);
2487                 return (KERN_SUCCESS);
2488         }
2489
2490         if ((stack_entry = prev_entry->next) == &map->header) {
2491                 vm_map_unlock_read(map);
2492                 return (KERN_SUCCESS);
2493         } 
2494         if (prev_entry == &map->header) 
2495                 end = stack_entry->start - stack_entry->avail_ssize;
2496         else
2497                 end = prev_entry->end;
2498
2499         /* This next test mimics the old grow function in vm_machdep.c.
2500          * It really doesn't quite make sense, but we do it anyway
2501          * for compatibility.
2502          *
2503          * If the stack is not growable, return success.  This signals the
2504          * caller to proceed as it normally would with ordinary VM handling.
2505          */
2506         if (stack_entry->avail_ssize < 1 ||
2507             addr >= stack_entry->start ||
2508             addr <  stack_entry->start - stack_entry->avail_ssize) {
2509                 vm_map_unlock_read(map);
2510                 return (KERN_SUCCESS);
2511         } 
2512         
2513         /* Find the minimum grow amount */
2514         grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
2515         if (grow_amount > stack_entry->avail_ssize) {
2516                 vm_map_unlock_read(map);
2517                 return (KERN_NO_SPACE);
2518         }
2519
2520         /* If there is no longer enough space between the entries,
2521          * fail and adjust the available space.  Note: this
2522          * should only happen if the user has mapped into the
2523          * stack area after the stack was created, and is
2524          * probably an error.
2525          *
2526          * This also effectively destroys any guard page the user
2527          * might have intended by limiting the stack size.
2528          */
2529         if (grow_amount > stack_entry->start - end) {
2530                 if (vm_map_lock_upgrade(map))
2531                         goto Retry;
2532
2533                 stack_entry->avail_ssize = stack_entry->start - end;
2534
2535                 vm_map_unlock(map);
2536                 return (KERN_NO_SPACE);
2537         }
2538
2539         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
2540
2541         /* If this is the main process stack, see if we're over the 
2542          * stack limit.
2543          */
2544         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
2545                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
2546                 vm_map_unlock_read(map);
2547                 return (KERN_NO_SPACE);
2548         }
2549
2550         /* Round up the grow amount to a multiple of sgrowsiz */
2551         grow_amount = roundup (grow_amount, sgrowsiz);
2552         if (grow_amount > stack_entry->avail_ssize) {
2553                 grow_amount = stack_entry->avail_ssize;
2554         }
2555         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
2556                              p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
2557                 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
2558                               ctob(vm->vm_ssize);
2559         }
2560
2561         /* If we would blow our VMEM resource limit, no go */
2562         if (map->size + grow_amount >
2563             curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
2564                 vm_map_unlock_read(map);
2565                 return (KERN_NO_SPACE);
2566         }
2567
2568         if (vm_map_lock_upgrade(map))
2569                 goto Retry;
2570
2571         /* Get the preliminary new entry start value */
2572         addr = stack_entry->start - grow_amount;
2573
2574         /* If this puts us into the previous entry, cut back our growth
2575          * to the available space.  Also, see the note above.
2576          */
2577         if (addr < end) {
2578                 stack_entry->avail_ssize = stack_entry->start - end;
2579                 addr = end;
2580         }
2581
2582         rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
2583             p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
2584
2585         /* Adjust the available stack space by the amount we grew. */
2586         if (rv == KERN_SUCCESS) {
2587                 if (prev_entry != &map->header)
2588                         vm_map_clip_end(map, prev_entry, addr);
2589                 new_stack_entry = prev_entry->next;
2590                 if (new_stack_entry->end   != stack_entry->start  ||
2591                     new_stack_entry->start != addr)
2592                         panic ("Bad stack grow start/end in new stack entry");
2593                 else {
2594                         new_stack_entry->avail_ssize = stack_entry->avail_ssize -
2595                                                         (new_stack_entry->end -
2596                                                          new_stack_entry->start);
2597                         if (is_procstack)
2598                                 vm->vm_ssize += btoc(new_stack_entry->end -
2599                                                      new_stack_entry->start);
2600                 }
2601         }
2602
2603         vm_map_unlock(map);
2604         return (rv);
2605 }
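
/*
 * Illustrative sketch (not part of the original source): a page-fault
 * handler that faults below the current stack entry can attempt growth
 * before declaring the fault fatal; "p" and "va" are hypothetical:
 *
 *	if (vm_map_growstack(p, va) != KERN_SUCCESS)
 *		return (KERN_FAILURE);
 */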
2606
2607 /*
2608  * Unshare the specified VM space for exec.  If other processes are
2609  * mapped to it, then create a new one.  The new vmspace contains no mappings.
2610  */
2611 void
2612 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
2613 {
2614         struct vmspace *oldvmspace = p->p_vmspace;
2615         struct vmspace *newvmspace;
2616
2617         GIANT_REQUIRED;
2618         newvmspace = vmspace_alloc(minuser, maxuser);
2619         bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
2620             (caddr_t) (newvmspace + 1) - (caddr_t) &newvmspace->vm_startcopy);
2621         /*
2622          * This code is written like this for prototype purposes.  The
2623          * goal is to avoid running down the vmspace here, but let the
2624          * other processes that are still using the vmspace finally
2625          * run it down.  Even though there is little or no chance of blocking
2626          * here, it is a good idea to keep this form for future mods.
2627          */
2628         p->p_vmspace = newvmspace;
2629         pmap_pinit2(vmspace_pmap(newvmspace));
2630         vmspace_free(oldvmspace);
2631         if (p == curthread->td_proc)            /* XXXKSE ? */
2632                 pmap_activate(curthread);
2633 }
2634
2635 /*
2636  * Unshare the specified VM space for forcing COW.  This
2637  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
2638  */
2639 void
2640 vmspace_unshare(struct proc *p)
2641 {
2642         struct vmspace *oldvmspace = p->p_vmspace;
2643         struct vmspace *newvmspace;
2644
2645         GIANT_REQUIRED;
2646         if (oldvmspace->vm_refcnt == 1)
2647                 return;
2648         newvmspace = vmspace_fork(oldvmspace);
2649         p->p_vmspace = newvmspace;
2650         pmap_pinit2(vmspace_pmap(newvmspace));
2651         vmspace_free(oldvmspace);
2652         if (p == curthread->td_proc)            /* XXXKSE ? */
2653                 pmap_activate(curthread);
2654 }
2655
2656 /*
2657  *      vm_map_lookup:
2658  *
2659  *      Finds the VM object, offset, and
2660  *      protection for a given virtual address in the
2661  *      specified map, assuming a page fault of the
2662  *      type specified.
2663  *
2664  *      Leaves the map in question locked for read; return
2665  *      values are guaranteed until a vm_map_lookup_done
2666  *      call is performed.  Note that the map argument
2667  *      is in/out; the returned map must be used in
2668  *      the call to vm_map_lookup_done.
2669  *
2670  *      A handle (out_entry) is returned for use in
2671  *      vm_map_lookup_done, to make that fast.
2672  *
2673  *      If a lookup is requested with "write protection"
2674  *      specified, the map may be changed to perform virtual
2675  *      copying operations, although the data referenced will
2676  *      remain the same.
2677  */
2678 int
2679 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
2680               vm_offset_t vaddr,
2681               vm_prot_t fault_typea,
2682               vm_map_entry_t *out_entry,        /* OUT */
2683               vm_object_t *object,              /* OUT */
2684               vm_pindex_t *pindex,              /* OUT */
2685               vm_prot_t *out_prot,              /* OUT */
2686               boolean_t *wired)                 /* OUT */
2687 {
2688         vm_map_entry_t entry;
2689         vm_map_t map = *var_map;
2690         vm_prot_t prot;
2691         vm_prot_t fault_type = fault_typea;
2692
2693 RetryLookup:;
2694         /*
2695          * Lookup the faulting address.
2696          */
2697
2698         vm_map_lock_read(map);
2699 #define RETURN(why) \
2700                 { \
2701                 vm_map_unlock_read(map); \
2702                 return (why); \
2703                 }
2704
2705         /*
2706          * If the map has an interesting hint, try it before calling the
2707          * full-blown lookup routine.
2708          */
2709         entry = map->root;
2710         *out_entry = entry;
2711         if (entry == NULL ||
2712             (vaddr < entry->start) || (vaddr >= entry->end)) {
2713                 /*
2714                  * Entry was either not a valid hint, or the vaddr was not
2715                  * contained in the entry, so do a full lookup.
2716                  */
2717                 if (!vm_map_lookup_entry(map, vaddr, out_entry))
2718                         RETURN(KERN_INVALID_ADDRESS);
2719
2720                 entry = *out_entry;
2721         }
2722         
2723         /*
2724          * Handle submaps.
2725          */
2726         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
2727                 vm_map_t old_map = map;
2728
2729                 *var_map = map = entry->object.sub_map;
2730                 vm_map_unlock_read(old_map);
2731                 goto RetryLookup;
2732         }
2733
2734         /*
2735          * Check whether this task is allowed to have this page.
2736          * Note the special case for MAP_ENTRY_COW
2737          * pages with an override.  This is to implement a forced
2738          * COW for debuggers.
2739          */
2740         if (fault_type & VM_PROT_OVERRIDE_WRITE)
2741                 prot = entry->max_protection;
2742         else
2743                 prot = entry->protection;
2744         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
2745         if ((fault_type & prot) != fault_type) {
2746                         RETURN(KERN_PROTECTION_FAILURE);
2747         }
2748         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
2749             (entry->eflags & MAP_ENTRY_COW) &&
2750             (fault_type & VM_PROT_WRITE) &&
2751             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
2752                 RETURN(KERN_PROTECTION_FAILURE);
2753         }
2754
2755         /*
2756          * If this page is not pageable, we have to get it for all possible
2757          * accesses.
2758          */
2759         *wired = (entry->wired_count != 0);
2760         if (*wired)
2761                 prot = fault_type = entry->protection;
2762
2763         /*
2764          * If the entry was copy-on-write, we either copy now or demote the permissions.
2765          */
2766         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
2767                 /*
2768                  * If we want to write the page, we may as well handle that
2769                  * now since we've got the map locked.
2770                  *
2771                  * If we don't need to write the page, we just demote the
2772                  * permissions allowed.
2773                  */
2774                 if (fault_type & VM_PROT_WRITE) {
2775                         /*
2776                          * Make a new object, and place it in the object
2777                          * chain.  Note that no new references have appeared
2778                          * -- one just moved from the map to the new
2779                          * object.
2780                          */
2781                         if (vm_map_lock_upgrade(map))
2782                                 goto RetryLookup;
2783
2784                         vm_object_shadow(
2785                             &entry->object.vm_object,
2786                             &entry->offset,
2787                             atop(entry->end - entry->start));
2788                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
2789
2790                         vm_map_lock_downgrade(map);
2791                 } else {
2792                         /*
2793                          * We're attempting to read a copy-on-write page --
2794                          * don't allow writes.
2795                          */
2796                         prot &= ~VM_PROT_WRITE;
2797                 }
2798         }
2799
2800         /*
2801          * Create an object if necessary.
2802          */
2803         if (entry->object.vm_object == NULL &&
2804             !map->system_map) {
2805                 if (vm_map_lock_upgrade(map)) 
2806                         goto RetryLookup;
2807                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
2808                     atop(entry->end - entry->start));
2809                 entry->offset = 0;
2810                 vm_map_lock_downgrade(map);
2811         }
2812
2813         /*
2814          * Return the object/offset from this entry.  If the entry was
2815          * copy-on-write or empty, it has been fixed up.
2816          */
2817         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
2818         *object = entry->object.vm_object;
2819
2820         /*
2821          * Return the protection actually granted for this lookup.
2822          */
2823         *out_prot = prot;
2824         return (KERN_SUCCESS);
2825
2826 #undef  RETURN
2827 }
2828
2829 /*
2830  *      vm_map_lookup_done:
2831  *
2832  *      Releases locks acquired by a vm_map_lookup
2833  *      (according to the handle returned by that lookup).
2834  */
2835 void
2836 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
2837 {
2838         /*
2839          * Unlock the main-level map
2840          */
2841         vm_map_unlock_read(map);
2842 }
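/*
 * Usage sketch (illustrative only; the variable names are hypothetical and
 * the fault path is the real consumer of this pair):
 *
 *      vm_map_t lmap = map;
 *      vm_map_entry_t entry;
 *      vm_object_t object;
 *      vm_pindex_t pindex;
 *      vm_prot_t prot;
 *      boolean_t wired;
 *
 *      if (vm_map_lookup(&lmap, va, VM_PROT_READ, &entry, &object,
 *          &pindex, &prot, &wired) != KERN_SUCCESS)
 *              return (EFAULT);
 *      ... access (object, pindex) within the granted prot ...
 *      vm_map_lookup_done(lmap, entry);
 *
 * vm_map_lookup() may hand back a different (sub)map through its first
 * argument, which is why the map is passed by reference and the returned
 * handle must be given to the matching vm_map_lookup_done().
 */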
2843
2844 #ifdef ENABLE_VFS_IOOPT
2845 /*
2846  * Experimental support for zero-copy I/O
2847  *
2848  * Implement uiomove with VM operations.  Together with collateral changes,
2849  * this supports every combination of source object modification and
2850  * COW-type operation.
2851  */
2852 int
2853 vm_uiomove(
2854         vm_map_t mapa,
2855         vm_object_t srcobject,
2856         off_t cp,
2857         int cnta,
2858         vm_offset_t uaddra,
2859         int *npages)
2860 {
2861         vm_map_t map;
2862         vm_object_t first_object, oldobject, object;
2863         vm_map_entry_t entry;
2864         vm_prot_t prot;
2865         boolean_t wired;
2866         int tcnt, rv;
2867         vm_offset_t uaddr, start, end, tend;
2868         vm_pindex_t first_pindex, oindex;
2869         vm_size_t osize;
2870         off_t ooffset;
2871         int cnt;
2872
2873         GIANT_REQUIRED;
2874
2875         if (npages)
2876                 *npages = 0;
2877
2878         cnt = cnta;
2879         uaddr = uaddra;
2880
2881         while (cnt > 0) {
2882                 map = mapa;
2883
2884                 if ((vm_map_lookup(&map, uaddr,
2885                         VM_PROT_READ, &entry, &first_object,
2886                         &first_pindex, &prot, &wired)) != KERN_SUCCESS) {
2887                         return EFAULT;
2888                 }
2889
2890                 vm_map_clip_start(map, entry, uaddr);
2891
2892                 tcnt = cnt;
2893                 tend = uaddr + tcnt;
2894                 if (tend > entry->end) {
2895                         tcnt = entry->end - uaddr;
2896                         tend = entry->end;
2897                 }
2898
2899                 vm_map_clip_end(map, entry, tend);
2900
2901                 start = entry->start;
2902                 end = entry->end;
2903
2904                 osize = atop(tcnt);
2905
2906                 oindex = OFF_TO_IDX(cp);
2907                 if (npages) {
2908                         vm_size_t idx;
2909                         for (idx = 0; idx < osize; idx++) {
2910                                 vm_page_t m;
2911                                 if ((m = vm_page_lookup(srcobject, oindex + idx)) == NULL) {
2912                                         vm_map_lookup_done(map, entry);
2913                                         return 0;
2914                                 }
2915                                 /*
2916                                  * Disallow PG_BUSY or invalid pages, but allow
2917                                  * soft-busy (m->busy) pages if fully valid.
2918                                  */
2919                                 if ((m->flags & PG_BUSY) ||
2920                                         ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
2921                                         vm_map_lookup_done(map, entry);
2922                                         return 0;
2923                                 }
2924                         }
2925                 }
2926
2927 /*
2928  * If we are changing an existing map entry, just redirect
2929  * the object, and change mappings.
2930  */
2931                 if ((first_object->type == OBJT_VNODE) &&
2932                         ((oldobject = entry->object.vm_object) == first_object)) {
2933
2934                         if ((entry->offset != cp) || (oldobject != srcobject)) {
2935                                 /*
2936                                  * Remove old window into the file
2937                                  */
2938                                 vm_page_lock_queues();
2939                                 pmap_remove(map->pmap, uaddr, tend);
2940                                 vm_page_unlock_queues();
2941
2942                                 /*
2943                                  * Force copy-on-write for mmapped regions
2944                                  */
2945                                 vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
2946
2947                                 /*
2948                                  * Point the object appropriately
2949                                  */
2950                                 if (oldobject != srcobject) {
2951
2952                                         /*
2953                                          * Set the object optimization hint flag
2954                                          */
2955                                         vm_object_set_flag(srcobject, OBJ_OPT);
2956                                         vm_object_reference(srcobject);
2957                                         entry->object.vm_object = srcobject;
2958
2959                                         if (oldobject) {
2960                                                 vm_object_deallocate(oldobject);
2961                                         }
2962                                 }
2963
2964                                 entry->offset = cp;
2965                                 map->timestamp++;
2966                         } else {
2967                                 vm_page_lock_queues();
2968                                 pmap_remove(map->pmap, uaddr, tend);
2969                                 vm_page_unlock_queues();
2970                         }
2971
2972                 } else if ((first_object->ref_count == 1) &&
2973                         (first_object->size == osize) &&
2974                         ((first_object->type == OBJT_DEFAULT) ||
2975                                 (first_object->type == OBJT_SWAP)) ) {
2976
2977                         oldobject = first_object->backing_object;
2978
2979                         if ((first_object->backing_object_offset != cp) ||
2980                                 (oldobject != srcobject)) {
2981                                 /*
2982                                  * Remove old window into the file
2983                                  */
2984                                 vm_page_lock_queues();
2985                                 pmap_remove(map->pmap, uaddr, tend);
2986                                 vm_page_unlock_queues();
2987
2988                                 /*
2989                                  * Remove unneeded old pages
2990                                  */
2991                                 vm_object_lock(first_object);
2992                                 vm_object_page_remove(first_object, 0, 0, 0);
2993                                 vm_object_unlock(first_object);
2994
2995                                 /*
2996                                  * Invalidate swap space
2997                                  */
2998                                 if (first_object->type == OBJT_SWAP) {
2999                                         swap_pager_freespace(first_object,
3000                                                 0,
3001                                                 first_object->size);
3002                                 }
3003
3004                                 /*
3005                                  * Force copy-on-write for mmapped regions
3006                                  */
3007                                 vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
3008
3009                                 /*
3010                                  * Point the object appropriately
3011                                  */
3012                                 if (oldobject != srcobject) {
3013                                         /*
3014                                          * Set the object optimization hint flag
3015                                          */
3016                                         vm_object_set_flag(srcobject, OBJ_OPT);
3017                                         vm_object_reference(srcobject);
3018
3019                                         if (oldobject) {
3020                                                 TAILQ_REMOVE(&oldobject->shadow_head,
3021                                                         first_object, shadow_list);
3022                                                 oldobject->shadow_count--;
3023                                                 /* XXX bump generation? */
3024                                                 vm_object_deallocate(oldobject);
3025                                         }
3026
3027                                         TAILQ_INSERT_TAIL(&srcobject->shadow_head,
3028                                                 first_object, shadow_list);
3029                                         srcobject->shadow_count++;
3030                                         /* XXX bump generation? */
3031
3032                                         first_object->backing_object = srcobject;
3033                                 }
3034                                 first_object->backing_object_offset = cp;
3035                                 map->timestamp++;
3036                         } else {
3037                                 vm_page_lock_queues();
3038                                 pmap_remove(map->pmap, uaddr, tend);
3039                                 vm_page_unlock_queues();
3040                         }
3041 /*
3042  * Otherwise, we have to do a logical mmap.
3043  */
3044                 } else {
3045
3046                         vm_object_set_flag(srcobject, OBJ_OPT);
3047                         vm_object_reference(srcobject);
3048
3049                         vm_page_lock_queues();
3050                         pmap_remove(map->pmap, uaddr, tend);
3051                         vm_page_unlock_queues();
3052
3053                         vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
3054                         vm_map_lock_upgrade(map);
3055
3056                         if (entry == &map->header) {
3057                                 map->first_free = &map->header;
3058                         } else if (map->first_free->start >= start) {
3059                                 map->first_free = entry->prev;
3060                         }
3061
3062                         vm_map_entry_delete(map, entry);
3063
3064                         object = srcobject;
3065                         ooffset = cp;
3066
3067                         rv = vm_map_insert(map, object, ooffset, start, tend,
3068                                 VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE);
3069
3070                         if (rv != KERN_SUCCESS)
3071                                 panic("vm_uiomove: could not insert new entry: %d", rv);
3072                 }
3073
3074 /*
3075  * Map the window directly, if it is already in memory
3076  */
3077                 pmap_object_init_pt(map->pmap, uaddr,
3078                         srcobject, oindex, tcnt, 0);
3079
3080                 map->timestamp++;
3081                 vm_map_unlock(map);
3082
3083                 cnt -= tcnt;
3084                 uaddr += tcnt;
3085                 cp += tcnt;
3086                 if (npages)
3087                         *npages += osize;
3088         }
3089         return 0;
3090 }
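/*
 * Hypothetical caller sketch (the surrounding names are assumptions; only
 * the argument order matches the definition above):
 *
 *      int npages, error;
 *
 *      error = vm_uiomove(&curproc->p_vmspace->vm_map, object, foff,
 *          xfsize, (vm_offset_t)uio->uio_iov->iov_base, &npages);
 *
 * This maps "xfsize" bytes of "object" starting at offset "foff" into the
 * caller's address space at the uio's base address instead of copying the
 * data, and returns 0 or EFAULT.
 */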
3091 #endif
3092
3093 #include "opt_ddb.h"
3094 #ifdef DDB
3095 #include <sys/kernel.h>
3096
3097 #include <ddb/ddb.h>
3098
3099 /*
3100  *      vm_map_print:   [ debug ]
3101  */
3102 DB_SHOW_COMMAND(map, vm_map_print)
3103 {
3104         static int nlines;
3105         /* XXX convert args. */
3106         vm_map_t map = (vm_map_t)addr;
3107         boolean_t full = have_addr;
3108
3109         vm_map_entry_t entry;
3110
3111         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3112             (void *)map,
3113             (void *)map->pmap, map->nentries, map->timestamp);
3114         nlines++;
3115
3116         if (!full && db_indent)
3117                 return;
3118
3119         db_indent += 2;
3120         for (entry = map->header.next; entry != &map->header;
3121             entry = entry->next) {
3122                 db_iprintf("map entry %p: start=%p, end=%p\n",
3123                     (void *)entry, (void *)entry->start, (void *)entry->end);
3124                 nlines++;
3125                 {
3126                         static char *inheritance_name[4] =
3127                         {"share", "copy", "none", "donate_copy"};
3128
3129                         db_iprintf(" prot=%x/%x/%s",
3130                             entry->protection,
3131                             entry->max_protection,
3132                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3133                         if (entry->wired_count != 0)
3134                                 db_printf(", wired");
3135                 }
3136                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3137                         db_printf(", share=%p, offset=0x%jx\n",
3138                             (void *)entry->object.sub_map,
3139                             (uintmax_t)entry->offset);
3140                         nlines++;
3141                         if ((entry->prev == &map->header) ||
3142                             (entry->prev->object.sub_map !=
3143                                 entry->object.sub_map)) {
3144                                 db_indent += 2;
3145                                 vm_map_print((db_expr_t)(intptr_t)
3146                                              entry->object.sub_map,
3147                                              full, 0, (char *)0);
3148                                 db_indent -= 2;
3149                         }
3150                 } else {
3151                         db_printf(", object=%p, offset=0x%jx",
3152                             (void *)entry->object.vm_object,
3153                             (uintmax_t)entry->offset);
3154                         if (entry->eflags & MAP_ENTRY_COW)
3155                                 db_printf(", copy (%s)",
3156                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3157                         db_printf("\n");
3158                         nlines++;
3159
3160                         if ((entry->prev == &map->header) ||
3161                             (entry->prev->object.vm_object !=
3162                                 entry->object.vm_object)) {
3163                                 db_indent += 2;
3164                                 vm_object_print((db_expr_t)(intptr_t)
3165                                                 entry->object.vm_object,
3166                                                 full, 0, (char *)0);
3167                                 nlines += 4;
3168                                 db_indent -= 2;
3169                         }
3170                 }
3171         }
3172         db_indent -= 2;
3173         if (db_indent == 0)
3174                 nlines = 0;
3175 }
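/*
 * Example (the address is illustrative): DB_SHOW_COMMAND(map, ...) above
 * makes this routine available at the kernel debugger prompt as
 *
 *      db> show map 0xc1234567
 *
 * Supplying an address selects that vm_map and enables full output,
 * including recursion into submaps and backing objects.
 */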
3176
3177
3178 DB_SHOW_COMMAND(procvm, procvm)
3179 {
3180         struct proc *p;
3181
3182         if (have_addr) {
3183                 p = (struct proc *) addr;
3184         } else {
3185                 p = curproc;
3186         }
3187
3188         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3189             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3190             (void *)vmspace_pmap(p->p_vmspace));
3191
3192         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3193 }
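/*
 * Example: "db> show procvm" dumps the vmspace of curproc; supplying a
 * struct proc address (illustrative) examines that process instead and
 * then prints its vm_map via vm_map_print() with full output.
 */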
3194
3195 #endif /* DDB */