1 /*-
2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3  *
4  * Copyright (c) 1991, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
35  *
36  *
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62
63 /*
64  *      Virtual memory mapping module.
65  */
66
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/elf.h>
73 #include <sys/kernel.h>
74 #include <sys/ktr.h>
75 #include <sys/lock.h>
76 #include <sys/mutex.h>
77 #include <sys/proc.h>
78 #include <sys/vmmeter.h>
79 #include <sys/mman.h>
80 #include <sys/vnode.h>
81 #include <sys/racct.h>
82 #include <sys/resourcevar.h>
83 #include <sys/rwlock.h>
84 #include <sys/file.h>
85 #include <sys/sysctl.h>
86 #include <sys/sysent.h>
87 #include <sys/shm.h>
88
89 #include <vm/vm.h>
90 #include <vm/vm_param.h>
91 #include <vm/pmap.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_pager.h>
97 #include <vm/vm_kern.h>
98 #include <vm/vm_extern.h>
99 #include <vm/vnode_pager.h>
100 #include <vm/swap_pager.h>
101 #include <vm/uma.h>
102
103 /*
104  *      Virtual memory maps provide for the mapping, protection,
105  *      and sharing of virtual memory objects.  In addition,
106  *      this module provides for an efficient virtual copy of
107  *      memory from one map to another.
108  *
109  *      Synchronization is required prior to most operations.
110  *
111  *      Maps consist of an ordered doubly-linked list of simple
112  *      entries; a self-adjusting binary search tree of these
113  *      entries is used to speed up lookups.
114  *
115  *      Since portions of maps are specified by start/end addresses,
116  *      which may not align with existing map entries, all
117  *      routines merely "clip" entries to these start/end values.
118  *      [That is, an entry is split into two, bordering at a
119  *      start or end value.]  Note that clipping is not always
120  *      strictly necessary (the two resulting entries may end up
121  *      being treated identically); it is done for convenience.
122  *
123  *      As mentioned above, virtual copy operations are performed
124  *      by copying VM object references from one map to
125  *      another, and then marking both regions as copy-on-write.
126  */
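
/*
 * As an informal illustration of the clipping described above: an
 * operation on the range [0x2000, 0x3000) applied to a single entry
 * spanning [0x1000, 0x4000) clips that entry twice, leaving three
 * entries,
 *
 *	[0x1000, 0x2000)  [0x2000, 0x3000)  [0x3000, 0x4000)
 *
 * all referring to the same backing object at suitably adjusted
 * offsets; only the middle entry is then affected by the operation.
 */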
127
128 static struct mtx map_sleep_mtx;
129 static uma_zone_t mapentzone;
130 static uma_zone_t kmapentzone;
131 static uma_zone_t mapzone;
132 static uma_zone_t vmspace_zone;
133 static int vmspace_zinit(void *mem, int size, int flags);
134 static int vm_map_zinit(void *mem, int size, int flags);
135 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
136     vm_offset_t max);
137 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
138 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
139 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
140 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
141     vm_map_entry_t gap_entry);
142 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
143     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
144 #ifdef INVARIANTS
145 static void vm_map_zdtor(void *mem, int size, void *arg);
146 static void vmspace_zdtor(void *mem, int size, void *arg);
147 #endif
148 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
149     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
150     int cow);
151 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
152     vm_offset_t failed_addr);
153
154 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
155     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
156      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
157
158 /* 
159  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
160  * stable.
161  */
162 #define PROC_VMSPACE_LOCK(p) do { } while (0)
163 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
164
165 /*
166  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
167  *
168  *      Asserts that the starting and ending region
169  *      addresses fall within the valid range of the map.
170  */
171 #define VM_MAP_RANGE_CHECK(map, start, end)             \
172                 {                                       \
173                 if (start < vm_map_min(map))            \
174                         start = vm_map_min(map);        \
175                 if (end > vm_map_max(map))              \
176                         end = vm_map_max(map);          \
177                 if (start > end)                        \
178                         start = end;                    \
179                 }
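
/*
 * A hypothetical caller operating on a user-supplied range would
 * typically clamp it with the macro above before proceeding, e.g.:
 *
 *	vm_map_lock(map);
 *	VM_MAP_RANGE_CHECK(map, start, end);
 *	... operate on [start, end) ...
 *	vm_map_unlock(map);
 */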
180
181 /*
182  *      vm_map_startup:
183  *
184  *      Initialize the vm_map module.  Must be called before
185  *      any other vm_map routines.
186  *
187  *      Map and entry structures are allocated from the general
188  *      purpose memory pool with some exceptions:
189  *
190  *      - The kernel map and kmem submap are allocated statically.
191  *      - Kernel map entries are allocated out of a static pool.
192  *
193  *      These restrictions are necessary since malloc() uses the
194  *      maps and requires map entries.
195  */
196
197 void
198 vm_map_startup(void)
199 {
200         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
201         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
202 #ifdef INVARIANTS
203             vm_map_zdtor,
204 #else
205             NULL,
206 #endif
207             vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
208         uma_prealloc(mapzone, MAX_KMAP);
209         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
210             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
211             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
212         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
213             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
214         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
215 #ifdef INVARIANTS
216             vmspace_zdtor,
217 #else
218             NULL,
219 #endif
220             vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
221 }
222
223 static int
224 vmspace_zinit(void *mem, int size, int flags)
225 {
226         struct vmspace *vm;
227
228         vm = (struct vmspace *)mem;
229
230         vm->vm_map.pmap = NULL;
231         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
232         PMAP_LOCK_INIT(vmspace_pmap(vm));
233         return (0);
234 }
235
236 static int
237 vm_map_zinit(void *mem, int size, int flags)
238 {
239         vm_map_t map;
240
241         map = (vm_map_t)mem;
242         memset(map, 0, sizeof(*map));
243         mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
244         sx_init(&map->lock, "vm map (user)");
245         return (0);
246 }
247
248 #ifdef INVARIANTS
249 static void
250 vmspace_zdtor(void *mem, int size, void *arg)
251 {
252         struct vmspace *vm;
253
254         vm = (struct vmspace *)mem;
255
256         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
257 }
258 static void
259 vm_map_zdtor(void *mem, int size, void *arg)
260 {
261         vm_map_t map;
262
263         map = (vm_map_t)mem;
264         KASSERT(map->nentries == 0,
265             ("map %p nentries == %d on free.",
266             map, map->nentries));
267         KASSERT(map->size == 0,
268             ("map %p size == %lu on free.",
269             map, (unsigned long)map->size));
270 }
271 #endif  /* INVARIANTS */
272
273 /*
274  * Allocate a vmspace structure, including a vm_map and pmap,
275  * and initialize those structures.  The refcnt is set to 1.
276  *
277  * The embedded pmap is initialized via the caller-supplied 'pinit' function.
278  */
279 struct vmspace *
280 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
281 {
282         struct vmspace *vm;
283
284         vm = uma_zalloc(vmspace_zone, M_WAITOK);
285         KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
286         if (!pinit(vmspace_pmap(vm))) {
287                 uma_zfree(vmspace_zone, vm);
288                 return (NULL);
289         }
290         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
291         _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
292         vm->vm_refcnt = 1;
293         vm->vm_shm = NULL;
294         vm->vm_swrss = 0;
295         vm->vm_tsize = 0;
296         vm->vm_dsize = 0;
297         vm->vm_ssize = 0;
298         vm->vm_taddr = 0;
299         vm->vm_daddr = 0;
300         vm->vm_maxsaddr = 0;
301         return (vm);
302 }
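
/*
 * A minimal usage sketch, assuming the caller passes the ordinary
 * pmap_pinit initializer and suitable address bounds ('minaddr' and
 * 'maxaddr' are placeholders here):
 *
 *	struct vmspace *vm;
 *
 *	vm = vmspace_alloc(minaddr, maxaddr, pmap_pinit);
 *	if (vm == NULL)
 *		return (ENOMEM);
 *	... populate vm->vm_map ...
 *	vmspace_free(vm);
 *
 * where the final vmspace_free() drops the single reference taken at
 * allocation time.
 */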
303
304 #ifdef RACCT
305 static void
306 vmspace_container_reset(struct proc *p)
307 {
308
309         PROC_LOCK(p);
310         racct_set(p, RACCT_DATA, 0);
311         racct_set(p, RACCT_STACK, 0);
312         racct_set(p, RACCT_RSS, 0);
313         racct_set(p, RACCT_MEMLOCK, 0);
314         racct_set(p, RACCT_VMEM, 0);
315         PROC_UNLOCK(p);
316 }
317 #endif
318
319 static inline void
320 vmspace_dofree(struct vmspace *vm)
321 {
322
323         CTR1(KTR_VM, "vmspace_free: %p", vm);
324
325         /*
326          * Make sure any SysV shm is freed, it might not have been in
327          * exit1().
328          */
329         shmexit(vm);
330
331         /*
332          * Lock the map, to wait out all other references to it.
333          * Delete all of the mappings and pages they hold, then call
334          * the pmap module to reclaim anything left.
335          */
336         (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
337             vm_map_max(&vm->vm_map));
338
339         pmap_release(vmspace_pmap(vm));
340         vm->vm_map.pmap = NULL;
341         uma_zfree(vmspace_zone, vm);
342 }
343
344 void
345 vmspace_free(struct vmspace *vm)
346 {
347
348         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
349             "vmspace_free() called");
350
351         if (vm->vm_refcnt == 0)
352                 panic("vmspace_free: attempt to free already freed vmspace");
353
354         if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
355                 vmspace_dofree(vm);
356 }
357
358 void
359 vmspace_exitfree(struct proc *p)
360 {
361         struct vmspace *vm;
362
363         PROC_VMSPACE_LOCK(p);
364         vm = p->p_vmspace;
365         p->p_vmspace = NULL;
366         PROC_VMSPACE_UNLOCK(p);
367         KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
368         vmspace_free(vm);
369 }
370
371 void
372 vmspace_exit(struct thread *td)
373 {
374         int refcnt;
375         struct vmspace *vm;
376         struct proc *p;
377
378         /*
379          * Release user portion of address space.
380          * This releases references to vnodes,
381          * which could cause I/O if the file has been unlinked.
382          * Need to do this early enough that we can still sleep.
383          *
384          * The last exiting process to reach this point releases as
385          * much of the environment as it can. vmspace_dofree() is the
386          * slower fallback in case another process had a temporary
387          * reference to the vmspace.
388          */
389
390         p = td->td_proc;
391         vm = p->p_vmspace;
392         atomic_add_int(&vmspace0.vm_refcnt, 1);
393         refcnt = vm->vm_refcnt;
394         do {
395                 if (refcnt > 1 && p->p_vmspace != &vmspace0) {
396                         /* Switch now since other proc might free vmspace */
397                         PROC_VMSPACE_LOCK(p);
398                         p->p_vmspace = &vmspace0;
399                         PROC_VMSPACE_UNLOCK(p);
400                         pmap_activate(td);
401                 }
402         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
403         if (refcnt == 1) {
404                 if (p->p_vmspace != vm) {
405                         /* vmspace not yet freed, switch back */
406                         PROC_VMSPACE_LOCK(p);
407                         p->p_vmspace = vm;
408                         PROC_VMSPACE_UNLOCK(p);
409                         pmap_activate(td);
410                 }
411                 pmap_remove_pages(vmspace_pmap(vm));
412                 /* Switch now since this proc will free vmspace */
413                 PROC_VMSPACE_LOCK(p);
414                 p->p_vmspace = &vmspace0;
415                 PROC_VMSPACE_UNLOCK(p);
416                 pmap_activate(td);
417                 vmspace_dofree(vm);
418         }
419 #ifdef RACCT
420         if (racct_enable)
421                 vmspace_container_reset(p);
422 #endif
423 }
424
425 /* Acquire reference to vmspace owned by another process. */
426
427 struct vmspace *
428 vmspace_acquire_ref(struct proc *p)
429 {
430         struct vmspace *vm;
431         int refcnt;
432
433         PROC_VMSPACE_LOCK(p);
434         vm = p->p_vmspace;
435         if (vm == NULL) {
436                 PROC_VMSPACE_UNLOCK(p);
437                 return (NULL);
438         }
439         refcnt = vm->vm_refcnt;
440         do {
441                 if (refcnt <= 0) {      /* Avoid 0->1 transition */
442                         PROC_VMSPACE_UNLOCK(p);
443                         return (NULL);
444                 }
445         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
446         if (vm != p->p_vmspace) {
447                 PROC_VMSPACE_UNLOCK(p);
448                 vmspace_free(vm);
449                 return (NULL);
450         }
451         PROC_VMSPACE_UNLOCK(p);
452         return (vm);
453 }
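
/*
 * A sketch of the typical consumer pattern for a hypothetical caller
 * that wants to inspect another process's address space:
 *
 *	vm = vmspace_acquire_ref(p);
 *	if (vm == NULL)
 *		return (ESRCH);
 *	map = &vm->vm_map;
 *	vm_map_lock_read(map);
 *	... examine the map entries ...
 *	vm_map_unlock_read(map);
 *	vmspace_free(vm);
 *
 * The acquired reference keeps the vmspace, and hence the map, from
 * being torn down while it is examined.
 */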
454
455 /*
456  * Switch between vmspaces in an AIO kernel process.
457  *
458  * The new vmspace is either the vmspace of a user process obtained
459  * from an active AIO request or the initial vmspace of the AIO kernel
460  * process (when it is idling).  Because user processes will block to
461  * drain any active AIO requests before proceeding in exit() or
462  * execve(), the reference count for vmspaces from AIO requests can
463  * never be 0.  Similarly, AIO kernel processes hold an extra
464  * reference on their initial vmspace for the life of the process.  As
465  * a result, the 'newvm' vmspace always has a non-zero reference
466  * count.  This permits an additional reference on 'newvm' to be
467  * acquired via a simple atomic increment rather than the loop in
468  * vmspace_acquire_ref() above.
469  */
470 void
471 vmspace_switch_aio(struct vmspace *newvm)
472 {
473         struct vmspace *oldvm;
474
475         /* XXX: Need some way to assert that this is an aio daemon. */
476
477         KASSERT(newvm->vm_refcnt > 0,
478             ("vmspace_switch_aio: newvm unreferenced"));
479
480         oldvm = curproc->p_vmspace;
481         if (oldvm == newvm)
482                 return;
483
484         /*
485          * Point to the new address space and refer to it.
486          */
487         curproc->p_vmspace = newvm;
488         atomic_add_int(&newvm->vm_refcnt, 1);
489
490         /* Activate the new mapping. */
491         pmap_activate(curthread);
492
493         vmspace_free(oldvm);
494 }
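
/*
 * Sketch of the intended calling pattern for an AIO daemon servicing
 * a request on behalf of user process 'userproc' ('userproc' and
 * 'myvm' are hypothetical names):
 *
 *	vmspace_switch_aio(userproc->p_vmspace);
 *	... perform the I/O against the user address space ...
 *	vmspace_switch_aio(myvm);
 *
 * where 'myvm' is the daemon's own initial vmspace, saved at startup.
 * Each call gains a reference on the new vmspace and drops one on the
 * previously active vmspace.
 */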
495
496 void
497 _vm_map_lock(vm_map_t map, const char *file, int line)
498 {
499
500         if (map->system_map)
501                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
502         else
503                 sx_xlock_(&map->lock, file, line);
504         map->timestamp++;
505 }
506
507 void
508 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
509 {
510         vm_object_t object, object1;
511         struct vnode *vp;
512
513         if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
514                 return;
515         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
516             ("Submap with execs"));
517         object = entry->object.vm_object;
518         KASSERT(object != NULL, ("No object for text, entry %p", entry));
519         VM_OBJECT_RLOCK(object);
520         while ((object1 = object->backing_object) != NULL) {
521                 VM_OBJECT_RLOCK(object1);
522                 VM_OBJECT_RUNLOCK(object);
523                 object = object1;
524         }
525
526         vp = NULL;
527         if (object->type == OBJT_DEAD) {
528                 /*
529                  * For OBJT_DEAD objects, v_writecount was handled in
530                  * vnode_pager_dealloc().
531                  */
532         } else if (object->type == OBJT_VNODE) {
533                 vp = object->handle;
534         } else if (object->type == OBJT_SWAP) {
535                 KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
536                     ("vm_map_entry_set_vnode_text: swap and !TMPFS "
537                     "entry %p, object %p, add %d", entry, object, add));
538                 /*
539          * A reclaimed tmpfs VREG node has the OBJ_TMPFS_NODE flag
540          * set, but not OBJ_TMPFS.  In this case there is no
541          * v_writecount to adjust.
542                  */
543                 if ((object->flags & OBJ_TMPFS) != 0)
544                         vp = object->un_pager.swp.swp_tmpfs;
545         } else {
546                 KASSERT(0,
547                     ("vm_map_entry_set_vnode_text: wrong object type, "
548                     "entry %p, object %p, add %d", entry, object, add));
549         }
550         if (vp != NULL) {
551                 if (add) {
552                         VOP_SET_TEXT_CHECKED(vp);
553                         VM_OBJECT_RUNLOCK(object);
554                 } else {
555                         vhold(vp);
556                         VM_OBJECT_RUNLOCK(object);
557                         vn_lock(vp, LK_SHARED | LK_RETRY);
558                         VOP_UNSET_TEXT_CHECKED(vp);
559                         VOP_UNLOCK(vp, 0);
560                         vdrop(vp);
561                 }
562         } else {
563                 VM_OBJECT_RUNLOCK(object);
564         }
565 }
566
567 /*
568  * Use a different name for this vm_map_entry field when its use
569  * is not consistent with its use as part of an ordered search tree.
570  */
571 #define defer_next right
572
573 static void
574 vm_map_process_deferred(void)
575 {
576         struct thread *td;
577         vm_map_entry_t entry, next;
578         vm_object_t object;
579
580         td = curthread;
581         entry = td->td_map_def_user;
582         td->td_map_def_user = NULL;
583         while (entry != NULL) {
584                 next = entry->defer_next;
585                 MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
586                     MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
587                     MAP_ENTRY_VN_EXEC));
588                 if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
589                         /*
590                          * Decrement the object's writemappings and
591                          * possibly the vnode's v_writecount.
592                          */
593                         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
594                             ("Submap with writecount"));
595                         object = entry->object.vm_object;
596                         KASSERT(object != NULL, ("No object for writecount"));
597                         vm_pager_release_writecount(object, entry->start,
598                             entry->end);
599                 }
600                 vm_map_entry_set_vnode_text(entry, false);
601                 vm_map_entry_deallocate(entry, FALSE);
602                 entry = next;
603         }
604 }
605
606 #ifdef INVARIANTS
607 static void
608 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
609 {
610
611         if (map->system_map)
612                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
613         else
614                 sx_assert_(&map->lock, SA_XLOCKED, file, line);
615 }
616
617 #define VM_MAP_ASSERT_LOCKED(map) \
618     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
619
620 enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
621 #ifdef DIAGNOSTIC
622 static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
623 #else
624 static int enable_vmmap_check = VMMAP_CHECK_NONE;
625 #endif
626 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
627     &enable_vmmap_check, 0, "Enable vm map consistency checking");
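
/*
 * For example, full consistency checking can be requested at runtime
 * with "sysctl debug.vmmap_check=2" (VMMAP_CHECK_ALL) or preset as a
 * loader tunable of the same name; the default above depends on
 * whether the kernel was built with DIAGNOSTIC.
 */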
628
629 static void _vm_map_assert_consistent(vm_map_t map, int check);
630
631 #define VM_MAP_ASSERT_CONSISTENT(map) \
632     _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
633 #ifdef DIAGNOSTIC
634 #define VM_MAP_UNLOCK_CONSISTENT(map) do {                              \
635         if (map->nupdates > map->nentries) {                            \
636                 _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);     \
637                 map->nupdates = 0;                                      \
638         }                                                               \
639 } while (0)
640 #else
641 #define VM_MAP_UNLOCK_CONSISTENT(map)
642 #endif
643 #else
644 #define VM_MAP_ASSERT_LOCKED(map)
645 #define VM_MAP_ASSERT_CONSISTENT(map)
646 #define VM_MAP_UNLOCK_CONSISTENT(map)
647 #endif /* INVARIANTS */
648
649 void
650 _vm_map_unlock(vm_map_t map, const char *file, int line)
651 {
652
653         VM_MAP_UNLOCK_CONSISTENT(map);
654         if (map->system_map)
655                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
656         else {
657                 sx_xunlock_(&map->lock, file, line);
658                 vm_map_process_deferred();
659         }
660 }
661
662 void
663 _vm_map_lock_read(vm_map_t map, const char *file, int line)
664 {
665
666         if (map->system_map)
667                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
668         else
669                 sx_slock_(&map->lock, file, line);
670 }
671
672 void
673 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
674 {
675
676         if (map->system_map)
677                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
678         else {
679                 sx_sunlock_(&map->lock, file, line);
680                 vm_map_process_deferred();
681         }
682 }
683
684 int
685 _vm_map_trylock(vm_map_t map, const char *file, int line)
686 {
687         int error;
688
689         error = map->system_map ?
690             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
691             !sx_try_xlock_(&map->lock, file, line);
692         if (error == 0)
693                 map->timestamp++;
694         return (error == 0);
695 }
696
697 int
698 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
699 {
700         int error;
701
702         error = map->system_map ?
703             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
704             !sx_try_slock_(&map->lock, file, line);
705         return (error == 0);
706 }
707
708 /*
709  *      _vm_map_lock_upgrade:   [ internal use only ]
710  *
711  *      Tries to upgrade a read (shared) lock on the specified map to a write
712  *      (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
713  *      non-zero value if the upgrade fails.  If the upgrade fails, the map is
714  *      returned without a read or write lock held.
715  *
716  *      Requires that the map be read locked.
717  */
718 int
719 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
720 {
721         unsigned int last_timestamp;
722
723         if (map->system_map) {
724                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
725         } else {
726                 if (!sx_try_upgrade_(&map->lock, file, line)) {
727                         last_timestamp = map->timestamp;
728                         sx_sunlock_(&map->lock, file, line);
729                         vm_map_process_deferred();
730                         /*
731                          * If the map's timestamp does not change while the
732                          * map is unlocked, then the upgrade succeeds.
733                          */
734                         sx_xlock_(&map->lock, file, line);
735                         if (last_timestamp != map->timestamp) {
736                                 sx_xunlock_(&map->lock, file, line);
737                                 return (1);
738                         }
739                 }
740         }
741         map->timestamp++;
742         return (0);
743 }
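
/*
 * Callers must be prepared for the upgrade to fail, since the map may
 * be changed by other threads while it is briefly unlocked.  A sketch
 * of the retry shape for a hypothetical caller:
 *
 *	vm_map_lock_read(map);
 *	... look something up ...
 *	if (vm_map_lock_upgrade(map) != 0) {
 *		vm_map_lock(map);
 *		... redo the lookup, since the map may have changed ...
 *	}
 *	... modify the map ...
 *	vm_map_unlock(map);
 */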
744
745 void
746 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
747 {
748
749         if (map->system_map) {
750                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
751         } else {
752                 VM_MAP_UNLOCK_CONSISTENT(map);
753                 sx_downgrade_(&map->lock, file, line);
754         }
755 }
756
757 /*
758  *      vm_map_locked:
759  *
760  *      Returns a non-zero value if the caller holds a write (exclusive) lock
761  *      on the specified map and the value "0" otherwise.
762  */
763 int
764 vm_map_locked(vm_map_t map)
765 {
766
767         if (map->system_map)
768                 return (mtx_owned(&map->system_mtx));
769         else
770                 return (sx_xlocked(&map->lock));
771 }
772
773 /*
774  *      _vm_map_unlock_and_wait:
775  *
776  *      Atomically releases the lock on the specified map and puts the calling
777  *      thread to sleep.  The calling thread will remain asleep until either
778  *      vm_map_wakeup() is performed on the map or the specified timeout is
779  *      exceeded.
780  *
781  *      WARNING!  This function does not perform deferred deallocations of
782  *      objects and map entries.  Therefore, the calling thread is expected to
783  *      reacquire the map lock after reawakening and later perform an ordinary
784  *      unlock operation, such as vm_map_unlock(), before completing its
785  *      operation on the map.
786  */
787 int
788 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
789 {
790
791         VM_MAP_UNLOCK_CONSISTENT(map);
792         mtx_lock(&map_sleep_mtx);
793         if (map->system_map)
794                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
795         else
796                 sx_xunlock_(&map->lock, file, line);
797         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
798             timo));
799 }
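
/*
 * The usual shape of a consumer, given the warning above (a sketch
 * modeled on how in-transition entries are waited for elsewhere in
 * this file):
 *
 *	vm_map_lock(map);
 *	while ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) {
 *		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 *		(void)vm_map_unlock_and_wait(map, 0);
 *		vm_map_lock(map);
 *		... re-lookup 'entry' ...
 *	}
 *	... proceed, then unlock with vm_map_unlock() so that any
 *	    deferred deallocations are processed ...
 */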
800
801 /*
802  *      vm_map_wakeup:
803  *
804  *      Awaken any threads that have slept on the map using
805  *      vm_map_unlock_and_wait().
806  */
807 void
808 vm_map_wakeup(vm_map_t map)
809 {
810
811         /*
812          * Acquire and release map_sleep_mtx to prevent a wakeup()
813          * from being performed (and lost) between the map unlock
814          * and the msleep() in _vm_map_unlock_and_wait().
815          */
816         mtx_lock(&map_sleep_mtx);
817         mtx_unlock(&map_sleep_mtx);
818         wakeup(&map->root);
819 }
820
821 void
822 vm_map_busy(vm_map_t map)
823 {
824
825         VM_MAP_ASSERT_LOCKED(map);
826         map->busy++;
827 }
828
829 void
830 vm_map_unbusy(vm_map_t map)
831 {
832
833         VM_MAP_ASSERT_LOCKED(map);
834         KASSERT(map->busy, ("vm_map_unbusy: not busy"));
835         if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
836                 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
837                 wakeup(&map->busy);
838         }
839 }
840
841 void 
842 vm_map_wait_busy(vm_map_t map)
843 {
844
845         VM_MAP_ASSERT_LOCKED(map);
846         while (map->busy) {
847                 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
848                 if (map->system_map)
849                         msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
850                 else
851                         sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
852         }
853         map->timestamp++;
854 }
855
856 long
857 vmspace_resident_count(struct vmspace *vmspace)
858 {
859         return pmap_resident_count(vmspace_pmap(vmspace));
860 }
861
862 /*
863  *      vm_map_create:
864  *
865  *      Creates and returns a new empty VM map with
866  *      the given physical map structure, and having
867  *      the given lower and upper address bounds.
868  */
869 vm_map_t
870 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
871 {
872         vm_map_t result;
873
874         result = uma_zalloc(mapzone, M_WAITOK);
875         CTR1(KTR_VM, "vm_map_create: %p", result);
876         _vm_map_init(result, pmap, min, max);
877         return (result);
878 }
879
880 /*
881  * Initialize an existing vm_map structure
882  * such as that in the vmspace structure.
883  */
884 static void
885 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
886 {
887
888         map->header.next = map->header.prev = &map->header;
889         map->header.eflags = MAP_ENTRY_HEADER;
890         map->needs_wakeup = FALSE;
891         map->system_map = 0;
892         map->pmap = pmap;
893         map->header.end = min;
894         map->header.start = max;
895         map->flags = 0;
896         map->root = NULL;
897         map->timestamp = 0;
898         map->busy = 0;
899         map->anon_loc = 0;
900 #ifdef DIAGNOSTIC
901         map->nupdates = 0;
902 #endif
903 }
904
905 void
906 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
907 {
908
909         _vm_map_init(map, pmap, min, max);
910         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
911         sx_init(&map->lock, "user map");
912 }
913
914 /*
915  *      vm_map_entry_dispose:   [ internal use only ]
916  *
917  *      Inverse of vm_map_entry_create.
918  */
919 static void
920 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
921 {
922         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
923 }
924
925 /*
926  *      vm_map_entry_create:    [ internal use only ]
927  *
928  *      Allocates a VM map entry for insertion.
929  *      No entry fields are filled in.
930  */
931 static vm_map_entry_t
932 vm_map_entry_create(vm_map_t map)
933 {
934         vm_map_entry_t new_entry;
935
936         if (map->system_map)
937                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
938         else
939                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
940         if (new_entry == NULL)
941                 panic("vm_map_entry_create: kernel resources exhausted");
942         return (new_entry);
943 }
944
945 /*
946  *      vm_map_entry_set_behavior:
947  *
948  *      Set the expected access behavior, either normal, random, or
949  *      sequential.
950  */
951 static inline void
952 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
953 {
954         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
955             (behavior & MAP_ENTRY_BEHAV_MASK);
956 }
957
958 /*
959  *      vm_map_entry_max_free_{left,right}:
960  *
961  *      Compute the size of the largest free gap between two entries,
962  *      one the root of a tree and the other the ancestor of that root
963  *      that is the least or greatest ancestor found on the search path.
964  */
965 static inline vm_size_t
966 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
967 {
968
969         return (root->left != NULL ?
970             root->left->max_free : root->start - left_ancestor->end);
971 }
972
973 static inline vm_size_t
974 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
975 {
976
977         return (root->right != NULL ?
978             root->right->max_free : right_ancestor->start - root->end);
979 }
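
/*
 * An informal example: if the root entry ends at 0x5000, has no right
 * child, and the nearest right-side ancestor on the search path starts
 * at 0x9000, then vm_map_entry_max_free_right() reports a 0x4000-byte
 * gap; when a right child exists, that child's precomputed max_free
 * summary is returned instead.
 */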
980
981 /*
982  *      vm_map_entry_{pred,succ}:
983  *
984  *      Find the {predecessor, successor} of the entry by taking one step
985  *      in the appropriate direction and backtracking as much as necessary.
986  */
987 static inline vm_map_entry_t
988 vm_map_entry_pred(vm_map_entry_t entry)
989 {
990
991         return (entry->prev);
992 }
993
994 /* vm_map_entry_succ is defined in vm_map.h. */
995
996 #define SPLAY_LEFT_STEP(root, y, rlist, test) do {                      \
997         vm_size_t max_free;                                             \
998                                                                         \
999         /*                                                              \
1000          * Infer root->right->max_free == root->max_free when           \
1001          * y->max_free < root->max_free || root->max_free == 0.         \
1002          * Otherwise, look right to find it.                            \
1003          */                                                             \
1004         y = root->left;                                                 \
1005         max_free = root->max_free;                                      \
1006         KASSERT(max_free >= vm_map_entry_max_free_right(root, rlist),   \
1007             ("%s: max_free invariant fails", __func__));                \
1008         if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free)      \
1009                 max_free = vm_map_entry_max_free_right(root, rlist);    \
1010         if (y != NULL && (test)) {                                      \
1011                 /* Rotate right and make y root. */                     \
1012                 root->left = y->right;                                  \
1013                 y->right = root;                                        \
1014                 if (max_free < y->max_free)                             \
1015                         root->max_free = max_free = MAX(max_free,       \
1016                             vm_map_entry_max_free_left(root, y));       \
1017                 root = y;                                               \
1018                 y = root->left;                                         \
1019         }                                                               \
1020         /* Copy right->max_free.  Put root on rlist. */                 \
1021         root->max_free = max_free;                                      \
1022         KASSERT(max_free == vm_map_entry_max_free_right(root, rlist),   \
1023             ("%s: max_free not copied from right", __func__));          \
1024         root->left = rlist;                                             \
1025         rlist = root;                                                   \
1026         root = y;                                                       \
1027 } while (0)
1028
1029 #define SPLAY_RIGHT_STEP(root, y, llist, test) do {                     \
1030         vm_size_t max_free;                                             \
1031                                                                         \
1032         /*                                                              \
1033          * Infer root->left->max_free == root->max_free when            \
1034          * y->max_free < root->max_free || root->max_free == 0.         \
1035          * Otherwise, look left to find it.                             \
1036          */                                                             \
1037         y = root->right;                                                \
1038         max_free = root->max_free;                                      \
1039         KASSERT(max_free >= vm_map_entry_max_free_left(root, llist),    \
1040             ("%s: max_free invariant fails", __func__));                \
1041         if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free)      \
1042                 max_free = vm_map_entry_max_free_left(root, llist);     \
1043         if (y != NULL && (test)) {                                      \
1044                 /* Rotate left and make y root. */                      \
1045                 root->right = y->left;                                  \
1046                 y->left = root;                                         \
1047                 if (max_free < y->max_free)                             \
1048                         root->max_free = max_free = MAX(max_free,       \
1049                             vm_map_entry_max_free_right(root, y));      \
1050                 root = y;                                               \
1051                 y = root->right;                                        \
1052         }                                                               \
1053         /* Copy left->max_free.  Put root on llist. */                  \
1054         root->max_free = max_free;                                      \
1055         KASSERT(max_free == vm_map_entry_max_free_left(root, llist),    \
1056             ("%s: max_free not copied from left", __func__));           \
1057         root->right = llist;                                            \
1058         llist = root;                                                   \
1059         root = y;                                                       \
1060 } while (0)
1061
1062 /*
1063  * Walk down the tree until we find addr or a NULL pointer where addr would go,
1064  * breaking off left and right subtrees of nodes less than, or greater than
1065  * addr.  Treat pointers to nodes with max_free < length as NULL pointers.
1066  * llist and rlist are the two sides in reverse order (bottom-up), with llist
1067  * linked by the right pointer and rlist linked by the left pointer in the
1068  * vm_map_entry, and both lists terminated by &map->header.  This function, and
1069  * the subsequent call to vm_map_splay_merge, rely on the start and end address
1070  * values in &map->header.
1071  */
1072 static __always_inline vm_map_entry_t
1073 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1074     vm_map_entry_t *llist, vm_map_entry_t *rlist)
1075 {
1076         vm_map_entry_t root, y;
1077
1078         *llist = *rlist = &map->header;
1079         root = map->root;
1080         while (root != NULL && root->max_free >= length) {
1081                 KASSERT((*llist)->end <= root->start &&
1082                     root->end <= (*rlist)->start,
1083                     ("%s: root not within tree bounds", __func__));
1084                 if (addr < root->start) {
1085                         SPLAY_LEFT_STEP(root, y, *rlist,
1086                             y->max_free >= length && addr < y->start);
1087                 } else if (addr >= root->end) {
1088                         SPLAY_RIGHT_STEP(root, y, *llist,
1089                             y->max_free >= length && addr >= y->end);
1090                 } else
1091                         break;
1092         }
1093         return (root);
1094 }
1095
1096 static __always_inline void
1097 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1098 {
1099         vm_map_entry_t hi, y;
1100
1101         hi = root->right;
1102         while (hi != NULL)
1103                 SPLAY_LEFT_STEP(hi, y, *rlist, true);
1104 }
1105
1106 static __always_inline void
1107 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1108 {
1109         vm_map_entry_t lo, y;
1110
1111         lo = root->left;
1112         while (lo != NULL)
1113                 SPLAY_RIGHT_STEP(lo, y, *llist, true);
1114 }
1115
1116 static inline void
1117 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1118 {
1119         vm_map_entry_t tmp;
1120
1121         tmp = *b;
1122         *b = *a;
1123         *a = tmp;
1124 }
1125
1126 /*
1127  * Walk back up the two spines, flip the pointers and set max_free.  The
1128  * subtrees of the root go at the bottom of llist and rlist.
1129  */
1130 static void
1131 vm_map_splay_merge(vm_map_t map, vm_map_entry_t root,
1132     vm_map_entry_t llist, vm_map_entry_t rlist)
1133 {
1134         vm_map_entry_t prev;
1135         vm_size_t max_free_left, max_free_right;
1136
1137         max_free_left = vm_map_entry_max_free_left(root, llist);
1138         if (llist != &map->header) {
1139                 prev = root->left;
1140                 do {
1141                         /*
1142                          * The max_free values of the children of llist are in
1143                          * llist->max_free and max_free_left.  Update with the
1144                          * max value.
1145                          */
1146                         llist->max_free = max_free_left =
1147                             MAX(llist->max_free, max_free_left);
1148                         vm_map_entry_swap(&llist->right, &prev);
1149                         vm_map_entry_swap(&prev, &llist);
1150                 } while (llist != &map->header);
1151                 root->left = prev;
1152         }
1153         max_free_right = vm_map_entry_max_free_right(root, rlist);
1154         if (rlist != &map->header) {
1155                 prev = root->right;
1156                 do {
1157                         /*
1158                          * The max_free values of the children of rlist are in
1159                          * rlist->max_free and max_free_right.  Update with the
1160                          * max value.
1161                          */
1162                         rlist->max_free = max_free_right =
1163                             MAX(rlist->max_free, max_free_right);
1164                         vm_map_entry_swap(&rlist->left, &prev);
1165                         vm_map_entry_swap(&prev, &rlist);
1166                 } while (rlist != &map->header);
1167                 root->right = prev;
1168         }               
1169         root->max_free = MAX(max_free_left, max_free_right);
1170         map->root = root;
1171 #ifdef DIAGNOSTIC
1172         ++map->nupdates;
1173 #endif
1174 }
1175
1176 /*
1177  *      vm_map_splay:
1178  *
1179  *      The Sleator and Tarjan top-down splay algorithm with the
1180  *      following variation.  Max_free must be computed bottom-up, so
1181  *      on the downward pass, maintain the left and right spines in
1182  *      reverse order.  Then, make a second pass up each side to fix
1183  *      the pointers and compute max_free.  The time bound is O(log n)
1184  *      amortized.
1185  *
1186  *      The new root is the vm_map_entry containing "addr", or else an
1187  *      adjacent entry (lower if possible) if addr is not in the tree.
1188  *
1189  *      The map must be locked, and is left locked on return.
1190  *
1191  *      Returns: the new root.
1192  */
1193 static vm_map_entry_t
1194 vm_map_splay(vm_map_t map, vm_offset_t addr)
1195 {
1196         vm_map_entry_t llist, rlist, root;
1197
1198         root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1199         if (root != NULL) {
1200                 /* do nothing */
1201         } else if (llist != &map->header) {
1202                 /*
1203                  * Recover the greatest node in the left
1204                  * subtree and make it the root.
1205                  */
1206                 root = llist;
1207                 llist = root->right;
1208                 root->right = NULL;
1209         } else if (rlist != &map->header) {
1210                 /*
1211                  * Recover the least node in the right
1212                  * subtree and make it the root.
1213                  */
1214                 root = rlist;
1215                 rlist = root->left;
1216                 root->left = NULL;
1217         } else {
1218                 /* There is no root. */
1219                 return (NULL);
1220         }
1221         vm_map_splay_merge(map, root, llist, rlist);
1222         VM_MAP_ASSERT_CONSISTENT(map);
1223         return (root);
1224 }
1225
1226 /*
1227  *      vm_map_entry_{un,}link:
1228  *
1229  *      Insert/remove entries from maps.
1230  */
1231 static void
1232 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1233 {
1234         vm_map_entry_t llist, rlist, root;
1235
1236         CTR3(KTR_VM,
1237             "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1238             map->nentries, entry);
1239         VM_MAP_ASSERT_LOCKED(map);
1240         map->nentries++;
1241         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1242         KASSERT(root == NULL,
1243             ("vm_map_entry_link: link object already mapped"));
1244         entry->prev = llist;
1245         entry->next = rlist;
1246         llist->next = rlist->prev = entry;
1247         entry->left = entry->right = NULL;
1248         vm_map_splay_merge(map, entry, llist, rlist);
1249         VM_MAP_ASSERT_CONSISTENT(map);
1250 }
1251
1252 enum unlink_merge_type {
1253         UNLINK_MERGE_NONE,
1254         UNLINK_MERGE_NEXT
1255 };
1256
1257 static void
1258 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1259     enum unlink_merge_type op)
1260 {
1261         vm_map_entry_t llist, rlist, root, y;
1262
1263         VM_MAP_ASSERT_LOCKED(map);
1264         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1265         KASSERT(root != NULL,
1266             ("vm_map_entry_unlink: unlink object not mapped"));
1267
1268         vm_map_splay_findprev(root, &llist);
1269         vm_map_splay_findnext(root, &rlist);
1270         if (op == UNLINK_MERGE_NEXT) {
1271                 rlist->start = root->start;
1272                 rlist->offset = root->offset;
1273         }
1274         if (llist != &map->header) {
1275                 root = llist;
1276                 llist = root->right;
1277                 root->right = NULL;
1278         } else if (rlist != &map->header) {
1279                 root = rlist;
1280                 rlist = root->left;
1281                 root->left = NULL;
1282         } else
1283                 root = NULL;
1284         y = entry->next;
1285         y->prev = entry->prev;
1286         y->prev->next = y;
1287         if (root != NULL)
1288                 vm_map_splay_merge(map, root, llist, rlist);
1289         else
1290                 map->root = NULL;
1291         VM_MAP_ASSERT_CONSISTENT(map);
1292         map->nentries--;
1293         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1294             map->nentries, entry);
1295 }
1296
1297 /*
1298  *      vm_map_entry_resize:
1299  *
1300  *      Resize a vm_map_entry, recompute the amount of free space that
1301  *      follows it and propagate that value up the tree.
1302  *
1303  *      The map must be locked, and is left locked on return.
1304  */
1305 static void
1306 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1307 {
1308         vm_map_entry_t llist, rlist, root;
1309
1310         VM_MAP_ASSERT_LOCKED(map);
1311         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1312         KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1313         vm_map_splay_findnext(root, &rlist);
1314         root->right = NULL;
1315         entry->end += grow_amount;
1316         vm_map_splay_merge(map, root, llist, rlist);
1317         VM_MAP_ASSERT_CONSISTENT(map);
1318         CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1319             __func__, map, map->nentries, entry);
1320 }
1321
1322 /*
1323  *      vm_map_lookup_entry:    [ internal use only ]
1324  *
1325  *      Finds the map entry containing (or
1326  *      immediately preceding) the specified address
1327  *      in the given map; the entry is returned
1328  *      in the "entry" parameter.  The boolean
1329  *      result indicates whether the address is
1330  *      actually contained in the map.
1331  */
1332 boolean_t
1333 vm_map_lookup_entry(
1334         vm_map_t map,
1335         vm_offset_t address,
1336         vm_map_entry_t *entry)  /* OUT */
1337 {
1338         vm_map_entry_t cur, lbound;
1339         boolean_t locked;
1340
1341         /*
1342          * If the map is empty, then the map entry immediately preceding
1343          * "address" is the map's header.
1344          */
1345         cur = map->root;
1346         if (cur == NULL) {
1347                 *entry = &map->header;
1348                 return (FALSE);
1349         }
1350         if (address >= cur->start && cur->end > address) {
1351                 *entry = cur;
1352                 return (TRUE);
1353         }
1354         if ((locked = vm_map_locked(map)) ||
1355             sx_try_upgrade(&map->lock)) {
1356                 /*
1357                  * Splay requires a write lock on the map.  However, it only
1358                  * restructures the binary search tree; it does not otherwise
1359                  * change the map.  Thus, the map's timestamp need not change
1360                  * on a temporary upgrade.
1361                  */
1362                 cur = vm_map_splay(map, address);
1363                 if (!locked) {
1364                         VM_MAP_UNLOCK_CONSISTENT(map);
1365                         sx_downgrade(&map->lock);
1366                 }
1367
1368                 /*
1369                  * If "address" is contained within a map entry, the new root
1370                  * is that map entry.  Otherwise, the new root is a map entry
1371                  * immediately before or after "address".
1372                  */
1373                 if (address < cur->start) {
1374                         *entry = &map->header;
1375                         return (FALSE);
1376                 }
1377                 *entry = cur;
1378                 return (address < cur->end);
1379         }
1380         /*
1381          * Since the map is only locked for read access, perform a
1382          * standard binary search tree lookup for "address".
1383          */
1384         lbound = &map->header;
1385         do {
1386                 if (address < cur->start) {
1387                         cur = cur->left;
1388                 } else if (cur->end <= address) {
1389                         lbound = cur;
1390                         cur = cur->right;
1391                 } else {
1392                         *entry = cur;
1393                         return (TRUE);
1394                 }
1395         } while (cur != NULL);
1396         *entry = lbound;
1397         return (FALSE);
1398 }
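
/*
 * Sketch of the common calling pattern for a hypothetical caller that
 * already holds the map lock:
 *
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		... addr lies within *entry ...
 *	} else {
 *		... entry is the closest preceding entry, possibly
 *		    &map->header; the first entry at or above addr is
 *		    vm_map_entry_succ(entry) ...
 *	}
 */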
1399
1400 /*
1401  *      vm_map_insert:
1402  *
1403  *      Inserts the given whole VM object into the target
1404  *      map at the specified address range.  The object's
1405  *      size should match that of the address range.
1406  *
1407  *      Requires that the map be locked, and leaves it so.
1408  *
1409  *      If object is non-NULL, ref count must be bumped by caller
1410  *      prior to making call to account for the new entry.
1411  */
1412 int
1413 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1414     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1415 {
1416         vm_map_entry_t new_entry, next_entry, prev_entry;
1417         struct ucred *cred;
1418         vm_eflags_t protoeflags;
1419         vm_inherit_t inheritance;
1420
1421         VM_MAP_ASSERT_LOCKED(map);
1422         KASSERT(object != kernel_object ||
1423             (cow & MAP_COPY_ON_WRITE) == 0,
1424             ("vm_map_insert: kernel object and COW"));
1425         KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1426             ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1427         KASSERT((prot & ~max) == 0,
1428             ("prot %#x is not subset of max_prot %#x", prot, max));
1429
1430         /*
1431          * Check that the start and end points are not bogus.
1432          */
1433         if (start < vm_map_min(map) || end > vm_map_max(map) ||
1434             start >= end)
1435                 return (KERN_INVALID_ADDRESS);
1436
1437         /*
1438          * Find the entry prior to the proposed starting address; if it's part
1439          * of an existing entry, this range is bogus.
1440          */
1441         if (vm_map_lookup_entry(map, start, &prev_entry))
1442                 return (KERN_NO_SPACE);
1443
1444         /*
1445          * Assert that the next entry doesn't overlap the end point.
1446          */
1447         next_entry = vm_map_entry_succ(prev_entry);
1448         if (next_entry->start < end)
1449                 return (KERN_NO_SPACE);
1450
1451         if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1452             max != VM_PROT_NONE))
1453                 return (KERN_INVALID_ARGUMENT);
1454
1455         protoeflags = 0;
1456         if (cow & MAP_COPY_ON_WRITE)
1457                 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1458         if (cow & MAP_NOFAULT)
1459                 protoeflags |= MAP_ENTRY_NOFAULT;
1460         if (cow & MAP_DISABLE_SYNCER)
1461                 protoeflags |= MAP_ENTRY_NOSYNC;
1462         if (cow & MAP_DISABLE_COREDUMP)
1463                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1464         if (cow & MAP_STACK_GROWS_DOWN)
1465                 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1466         if (cow & MAP_STACK_GROWS_UP)
1467                 protoeflags |= MAP_ENTRY_GROWS_UP;
1468         if (cow & MAP_WRITECOUNT)
1469                 protoeflags |= MAP_ENTRY_WRITECNT;
1470         if (cow & MAP_VN_EXEC)
1471                 protoeflags |= MAP_ENTRY_VN_EXEC;
1472         if ((cow & MAP_CREATE_GUARD) != 0)
1473                 protoeflags |= MAP_ENTRY_GUARD;
1474         if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1475                 protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1476         if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1477                 protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1478         if (cow & MAP_INHERIT_SHARE)
1479                 inheritance = VM_INHERIT_SHARE;
1480         else
1481                 inheritance = VM_INHERIT_DEFAULT;
1482
1483         cred = NULL;
1484         if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1485                 goto charged;
1486         if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1487             ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1488                 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1489                         return (KERN_RESOURCE_SHORTAGE);
1490                 KASSERT(object == NULL ||
1491                     (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1492                     object->cred == NULL,
1493                     ("overcommit: vm_map_insert o %p", object));
1494                 cred = curthread->td_ucred;
1495         }
1496
1497 charged:
1498         /* Expand the kernel pmap, if necessary. */
1499         if (map == kernel_map && end > kernel_vm_end)
1500                 pmap_growkernel(end);
1501         if (object != NULL) {
1502                 /*
1503                  * OBJ_ONEMAPPING must be cleared unless this mapping
1504                  * is trivially proven to be the only mapping for any
1505                  * of the object's pages.  (Object granularity
1506                  * reference counting is insufficient to recognize
1507                  * aliases with precision.)
1508                  */
1509                 if ((object->flags & OBJ_ANON) != 0) {
1510                         VM_OBJECT_WLOCK(object);
1511                         if (object->ref_count > 1 || object->shadow_count != 0)
1512                                 vm_object_clear_flag(object, OBJ_ONEMAPPING);
1513                         VM_OBJECT_WUNLOCK(object);
1514                 }
1515         } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1516             protoeflags &&
1517             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
1518             MAP_VN_EXEC)) == 0 &&
1519             prev_entry->end == start && (prev_entry->cred == cred ||
1520             (prev_entry->object.vm_object != NULL &&
1521             prev_entry->object.vm_object->cred == cred)) &&
1522             vm_object_coalesce(prev_entry->object.vm_object,
1523             prev_entry->offset,
1524             (vm_size_t)(prev_entry->end - prev_entry->start),
1525             (vm_size_t)(end - prev_entry->end), cred != NULL &&
1526             (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1527                 /*
1528                  * We were able to extend the object.  Determine if we
1529                  * can extend the previous map entry to include the
1530                  * new range as well.
1531                  */
1532                 if (prev_entry->inheritance == inheritance &&
1533                     prev_entry->protection == prot &&
1534                     prev_entry->max_protection == max &&
1535                     prev_entry->wired_count == 0) {
1536                         KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1537                             0, ("prev_entry %p has incoherent wiring",
1538                             prev_entry));
1539                         if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1540                                 map->size += end - prev_entry->end;
1541                         vm_map_entry_resize(map, prev_entry,
1542                             end - prev_entry->end);
1543                         vm_map_try_merge_entries(map, prev_entry, next_entry);
1544                         return (KERN_SUCCESS);
1545                 }
1546
1547                 /*
1548                  * If we can extend the object but cannot extend the
1549                  * map entry, we have to create a new map entry.  We
1550                  * must bump the ref count on the extended object to
1551                  * account for it.  object may be NULL.
1552                  */
1553                 object = prev_entry->object.vm_object;
1554                 offset = prev_entry->offset +
1555                     (prev_entry->end - prev_entry->start);
1556                 vm_object_reference(object);
1557                 if (cred != NULL && object != NULL && object->cred != NULL &&
1558                     !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1559                         /* Object already accounts for this uid. */
1560                         cred = NULL;
1561                 }
1562         }
1563         if (cred != NULL)
1564                 crhold(cred);
1565
1566         /*
1567          * Create a new entry
1568          */
1569         new_entry = vm_map_entry_create(map);
1570         new_entry->start = start;
1571         new_entry->end = end;
1572         new_entry->cred = NULL;
1573
1574         new_entry->eflags = protoeflags;
1575         new_entry->object.vm_object = object;
1576         new_entry->offset = offset;
1577
1578         new_entry->inheritance = inheritance;
1579         new_entry->protection = prot;
1580         new_entry->max_protection = max;
1581         new_entry->wired_count = 0;
1582         new_entry->wiring_thread = NULL;
1583         new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1584         new_entry->next_read = start;
1585
1586         KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1587             ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1588         new_entry->cred = cred;
1589
1590         /*
1591          * Insert the new entry into the list
1592          */
1593         vm_map_entry_link(map, new_entry);
1594         if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1595                 map->size += new_entry->end - new_entry->start;
1596
1597         /*
1598          * Try to coalesce the new entry with both the previous and next
1599          * entries in the list.  Previously, we only attempted to coalesce
1600          * with the previous entry when object is NULL.  Here, we handle the
1601          * other cases, which are less common.
1602          */
1603         vm_map_try_merge_entries(map, prev_entry, new_entry);
1604         vm_map_try_merge_entries(map, new_entry, next_entry);
1605
1606         if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1607                 vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1608                     end - start, cow & MAP_PREFAULT_PARTIAL);
1609         }
1610
1611         return (KERN_SUCCESS);
1612 }
1613
1614 /*
1615  *      vm_map_findspace:
1616  *
1617  *      Find the first fit (lowest VM address) for "length" free bytes
1618  *      beginning at address >= start in the given map.
1619  *
1620  *      In a vm_map_entry, "max_free" is the maximum amount of
1621  *      contiguous free space between an entry in its subtree and a
1622  *      neighbor of that entry.  This allows finding a free region in
1623  *      one path down the tree, so O(log n) amortized with splay
1624  *      trees.
1625  *
1626  *      Requires that the map be locked, and leaves it so.
1627  *
1628  *      Returns: starting address if sufficient space,
1629  *               vm_map_max(map)-length+1 if insufficient space.
1630  */
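/*
 * A brief illustration (hypothetical caller): since failure is reported as
 * an out-of-range address rather than an error code, callers test the
 * result against the map bounds, e.g.
 *
 *	addr = vm_map_findspace(map, start, length);
 *	if (addr + length > vm_map_max(map))
 *		return (KERN_NO_SPACE);
 *
 * mirroring the checks done in vm_map_find() below.
 */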
1631 vm_offset_t
1632 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1633 {
1634         vm_map_entry_t llist, rlist, root, y;
1635         vm_size_t left_length;
1636         vm_offset_t gap_end;
1637
1638         /*
1639          * Request must fit within min/max VM address and must avoid
1640          * address wrap.
1641          */
1642         start = MAX(start, vm_map_min(map));
1643         if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1644                 return (vm_map_max(map) - length + 1);
1645
1646         /* Empty tree means wide open address space. */
1647         if (map->root == NULL)
1648                 return (start);
1649
1650         /*
1651          * After splay_split, if start is within an entry, push it to the start
1652          * of the following gap.  If rlist is at the end of the gap containing
1653          * start, save the end of that gap in gap_end to see if the gap is big
1654  *      enough; otherwise set gap_end to start, to skip gap-checking and move
1655          * directly to a search of the right subtree.
1656          */
1657         root = vm_map_splay_split(map, start, length, &llist, &rlist);
1658         gap_end = rlist->start;
1659         if (root != NULL) {
1660                 start = root->end;
1661                 if (root->right != NULL)
1662                         gap_end = start;
1663         } else if (rlist != &map->header) {
1664                 root = rlist;
1665                 rlist = root->left;
1666                 root->left = NULL;
1667         } else {
1668                 root = llist;
1669                 llist = root->right;
1670                 root->right = NULL;
1671         }
1672         vm_map_splay_merge(map, root, llist, rlist);
1673         VM_MAP_ASSERT_CONSISTENT(map);
1674         if (length <= gap_end - start)
1675                 return (start);
1676
1677         /* With max_free, can immediately tell if no solution. */
1678         if (root->right == NULL || length > root->right->max_free)
1679                 return (vm_map_max(map) - length + 1);
1680
1681         /*
1682          * Splay for the least large-enough gap in the right subtree.
1683          */
1684         llist = rlist = &map->header;
1685         for (left_length = 0;;
1686             left_length = vm_map_entry_max_free_left(root, llist)) {
1687                 if (length <= left_length)
1688                         SPLAY_LEFT_STEP(root, y, rlist,
1689                             length <= vm_map_entry_max_free_left(y, llist));
1690                 else
1691                         SPLAY_RIGHT_STEP(root, y, llist,
1692                             length > vm_map_entry_max_free_left(y, root));
1693                 if (root == NULL)
1694                         break;
1695         }
1696         root = llist;
1697         llist = root->right;
1698         root->right = NULL;
1699         if (rlist != &map->header) {
1700                 y = rlist;
1701                 rlist = y->left;
1702                 y->left = NULL;
1703                 vm_map_splay_merge(map, y, &map->header, rlist);
1704                 y->max_free = MAX(
1705                     vm_map_entry_max_free_left(y, root),
1706                     vm_map_entry_max_free_right(y, &map->header));
1707                 root->right = y;
1708         }
1709         vm_map_splay_merge(map, root, llist, &map->header);
1710         VM_MAP_ASSERT_CONSISTENT(map);
1711         return (root->end);
1712 }
1713
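/*
 *	vm_map_fixed:
 *
 *	Map "object" at the fixed address range [start, start + length).
 *	Unless MAP_CHECK_EXCL is specified in "cow", any existing entries
 *	within the range are deleted first.  If a stack growth flag is
 *	specified, a growable stack is created there instead of an ordinary
 *	mapping.
 */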
1714 int
1715 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1716     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1717     vm_prot_t max, int cow)
1718 {
1719         vm_offset_t end;
1720         int result;
1721
1722         end = start + length;
1723         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1724             object == NULL,
1725             ("vm_map_fixed: non-NULL backing object for stack"));
1726         vm_map_lock(map);
1727         VM_MAP_RANGE_CHECK(map, start, end);
1728         if ((cow & MAP_CHECK_EXCL) == 0)
1729                 vm_map_delete(map, start, end);
1730         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1731                 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1732                     prot, max, cow);
1733         } else {
1734                 result = vm_map_insert(map, object, offset, start, end,
1735                     prot, max, cow);
1736         }
1737         vm_map_unlock(map);
1738         return (result);
1739 }
1740
1741 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
1742 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
1743
1744 static int cluster_anon = 1;
1745 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
1746     &cluster_anon, 0,
1747     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
1748
1749 static bool
1750 clustering_anon_allowed(vm_offset_t addr)
1751 {
1752
1753         switch (cluster_anon) {
1754         case 0:
1755                 return (false);
1756         case 1:
1757                 return (addr == 0);
1758         case 2:
1759         default:
1760                 return (true);
1761         }
1762 }
1763
1764 static long aslr_restarts;
1765 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
1766     &aslr_restarts, 0,
1767     "Number of aslr failures");
1768
1769 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
1770
1771 /*
1772  * Searches for the specified amount of free space in the given map with the
1773  * specified alignment.  Performs an address-ordered, first-fit search from
1774  * the given address "*addr", with an optional upper bound "max_addr".  If the
1775  * parameter "alignment" is zero, then the alignment is computed from the
1776  * given (object, offset) pair so as to enable the greatest possible use of
1777  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
1778  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
1779  *
1780  * The map must be locked.  Initially, there must be at least "length" bytes
1781  * of free space at the given address.
1782  */
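/*
 * Worked example for the alignment step below (values are hypothetical):
 * with alignment 0x200000 (a 2MB superpage) and *addr == 0x7fff12345000,
 *
 *	*addr &= ~(alignment - 1);	and *addr is now 0x7fff12200000
 *	*addr += alignment;		and *addr is now 0x7fff12400000
 *
 * i.e. the address is rounded up to the next alignment boundary, and the
 * loop then verifies that the rounded address still has "length" bytes free.
 */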
1783 static int
1784 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1785     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
1786     vm_offset_t alignment)
1787 {
1788         vm_offset_t aligned_addr, free_addr;
1789
1790         VM_MAP_ASSERT_LOCKED(map);
1791         free_addr = *addr;
1792         KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
1793             ("caller failed to provide space %#jx at address %p",
1794              (uintmax_t)length, (void *)free_addr));
1795         for (;;) {
1796                 /*
1797                  * At the start of every iteration, the free space at address
1798                  * "*addr" is at least "length" bytes.
1799                  */
1800                 if (alignment == 0)
1801                         pmap_align_superpage(object, offset, addr, length);
1802                 else if ((*addr & (alignment - 1)) != 0) {
1803                         *addr &= ~(alignment - 1);
1804                         *addr += alignment;
1805                 }
1806                 aligned_addr = *addr;
1807                 if (aligned_addr == free_addr) {
1808                         /*
1809                          * Alignment did not change "*addr", so "*addr" must
1810                          * still provide sufficient free space.
1811                          */
1812                         return (KERN_SUCCESS);
1813                 }
1814
1815                 /*
1816                  * Test for address wrap on "*addr".  A wrapped "*addr" could
1817                  * be a valid address, in which case vm_map_findspace() cannot
1818                  * be relied upon to fail.
1819                  */
1820                 if (aligned_addr < free_addr)
1821                         return (KERN_NO_SPACE);
1822                 *addr = vm_map_findspace(map, aligned_addr, length);
1823                 if (*addr + length > vm_map_max(map) ||
1824                     (max_addr != 0 && *addr + length > max_addr))
1825                         return (KERN_NO_SPACE);
1826                 free_addr = *addr;
1827                 if (free_addr == aligned_addr) {
1828                         /*
1829                          * If a successful call to vm_map_findspace() did not
1830                          * change "*addr", then "*addr" must still be aligned
1831                          * and provide sufficient free space.
1832                          */
1833                         return (KERN_SUCCESS);
1834                 }
1835         }
1836 }
1837
1838 /*
1839  *      vm_map_find finds an unallocated region in the target address
1840  *      map with the given length.  The search is defined to be
1841  *      first-fit from the specified address; the region found is
1842  *      returned in the same parameter.
1843  *
1844  *      If object is non-NULL, ref count must be bumped by caller
1845  *      prior to making call to account for the new entry.
1846  */
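/*
 *	An illustrative sketch (hypothetical caller; "hint" and "size" are
 *	placeholders): an anonymous mapping placed at or above a hint could
 *	be requested as
 *
 *		addr = hint;
 *		rv = vm_map_find(map, NULL, 0, &addr, size, 0,
 *		    VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 *		    VM_PROT_ALL, 0);
 *
 *	On KERN_SUCCESS the chosen address, possibly different from the
 *	hint, is returned in addr.
 */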
1847 int
1848 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1849             vm_offset_t *addr,  /* IN/OUT */
1850             vm_size_t length, vm_offset_t max_addr, int find_space,
1851             vm_prot_t prot, vm_prot_t max, int cow)
1852 {
1853         vm_offset_t alignment, curr_min_addr, min_addr;
1854         int gap, pidx, rv, try;
1855         bool cluster, en_aslr, update_anon;
1856
1857         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1858             object == NULL,
1859             ("vm_map_find: non-NULL backing object for stack"));
1860         MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
1861             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
1862         if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1863             (object->flags & OBJ_COLORED) == 0))
1864                 find_space = VMFS_ANY_SPACE;
1865         if (find_space >> 8 != 0) {
1866                 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1867                 alignment = (vm_offset_t)1 << (find_space >> 8);
1868         } else
1869                 alignment = 0;
1870         en_aslr = (map->flags & MAP_ASLR) != 0;
1871         update_anon = cluster = clustering_anon_allowed(*addr) &&
1872             (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
1873             find_space != VMFS_NO_SPACE && object == NULL &&
1874             (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
1875             MAP_STACK_GROWS_DOWN)) == 0 && prot != VM_PROT_NONE;
1876         curr_min_addr = min_addr = *addr;
1877         if (en_aslr && min_addr == 0 && !cluster &&
1878             find_space != VMFS_NO_SPACE &&
1879             (map->flags & MAP_ASLR_IGNSTART) != 0)
1880                 curr_min_addr = min_addr = vm_map_min(map);
1881         try = 0;
1882         vm_map_lock(map);
1883         if (cluster) {
1884                 curr_min_addr = map->anon_loc;
1885                 if (curr_min_addr == 0)
1886                         cluster = false;
1887         }
1888         if (find_space != VMFS_NO_SPACE) {
1889                 KASSERT(find_space == VMFS_ANY_SPACE ||
1890                     find_space == VMFS_OPTIMAL_SPACE ||
1891                     find_space == VMFS_SUPER_SPACE ||
1892                     alignment != 0, ("unexpected VMFS flag"));
1893 again:
1894                 /*
1895                  * When creating an anonymous mapping, try clustering
1896                  * with an existing anonymous mapping first.
1897                  *
1898                  * We make up to two attempts to find address space
1899                  * for a given find_space value. The first attempt may
1900                  * apply randomization or may cluster with an existing
1901                  * anonymous mapping. If this first attempt fails,
1902                  * perform a first-fit search of the available address
1903                  * space.
1904                  *
1905  *              If both tries fail and find_space is
1906  *              VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE
1907  *              and again enable clustering and randomization.
1908                  */
1909                 try++;
1910                 MPASS(try <= 2);
1911
1912                 if (try == 2) {
1913                         /*
1914                          * Second try: we failed either to find a
1915                          * suitable region for randomizing the
1916                          * allocation, or to cluster with an existing
1917                          * mapping.  Retry with free run.
1918                          */
1919                         curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
1920                             vm_map_min(map) : min_addr;
1921                         atomic_add_long(&aslr_restarts, 1);
1922                 }
1923
1924                 if (try == 1 && en_aslr && !cluster) {
1925                         /*
1926                          * Find space for allocation, including
1927                          * gap needed for later randomization.
1928                          */
1929                         pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
1930                             (find_space == VMFS_SUPER_SPACE || find_space ==
1931                             VMFS_OPTIMAL_SPACE) ? 1 : 0;
1932                         gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
1933                             (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
1934                             aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
1935                         *addr = vm_map_findspace(map, curr_min_addr,
1936                             length + gap * pagesizes[pidx]);
1937                         if (*addr + length + gap * pagesizes[pidx] >
1938                             vm_map_max(map))
1939                                 goto again;
1940                         /* And randomize the start address. */
1941                         *addr += (arc4random() % gap) * pagesizes[pidx];
1942                         if (max_addr != 0 && *addr + length > max_addr)
1943                                 goto again;
1944                 } else {
1945                         *addr = vm_map_findspace(map, curr_min_addr, length);
1946                         if (*addr + length > vm_map_max(map) ||
1947                             (max_addr != 0 && *addr + length > max_addr)) {
1948                                 if (cluster) {
1949                                         cluster = false;
1950                                         MPASS(try == 1);
1951                                         goto again;
1952                                 }
1953                                 rv = KERN_NO_SPACE;
1954                                 goto done;
1955                         }
1956                 }
1957
1958                 if (find_space != VMFS_ANY_SPACE &&
1959                     (rv = vm_map_alignspace(map, object, offset, addr, length,
1960                     max_addr, alignment)) != KERN_SUCCESS) {
1961                         if (find_space == VMFS_OPTIMAL_SPACE) {
1962                                 find_space = VMFS_ANY_SPACE;
1963                                 curr_min_addr = min_addr;
1964                                 cluster = update_anon;
1965                                 try = 0;
1966                                 goto again;
1967                         }
1968                         goto done;
1969                 }
1970         } else if ((cow & MAP_REMAP) != 0) {
1971                 if (*addr < vm_map_min(map) ||
1972                     *addr + length > vm_map_max(map) ||
1973                     *addr + length <= length) {
1974                         rv = KERN_INVALID_ADDRESS;
1975                         goto done;
1976                 }
1977                 vm_map_delete(map, *addr, *addr + length);
1978         }
1979         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1980                 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
1981                     max, cow);
1982         } else {
1983                 rv = vm_map_insert(map, object, offset, *addr, *addr + length,
1984                     prot, max, cow);
1985         }
1986         if (rv == KERN_SUCCESS && update_anon)
1987                 map->anon_loc = *addr + length;
1988 done:
1989         vm_map_unlock(map);
1990         return (rv);
1991 }
1992
1993 /*
1994  *      vm_map_find_min() is a variant of vm_map_find() that takes an
1995  *      additional parameter (min_addr) and treats the given address
1996  *      (*addr) differently.  Specifically, it treats *addr as a hint
1997  *      and not as the minimum address where the mapping is created.
1998  *
1999  *      This function works in two phases.  First, it tries to
2000  *      allocate above the hint.  If that fails and the hint is
2001  *      greater than min_addr, it performs a second pass, replacing
2002  *      the hint with min_addr as the minimum address for the
2003  *      allocation.
2004  */
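/*
 *	For example (hypothetical values): with *addr == 0x30000000 as the
 *	hint and min_addr == 0x10000000, the first call to vm_map_find()
 *	searches upward from 0x30000000; only if that fails is the search
 *	retried starting from 0x10000000.
 */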
2005 int
2006 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2007     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
2008     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2009     int cow)
2010 {
2011         vm_offset_t hint;
2012         int rv;
2013
2014         hint = *addr;
2015         for (;;) {
2016                 rv = vm_map_find(map, object, offset, addr, length, max_addr,
2017                     find_space, prot, max, cow);
2018                 if (rv == KERN_SUCCESS || min_addr >= hint)
2019                         return (rv);
2020                 *addr = hint = min_addr;
2021         }
2022 }
2023
2024 /*
2025  * A map entry with any of the following flags set must not be merged with
2026  * another entry.
2027  */
2028 #define MAP_ENTRY_NOMERGE_MASK  (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2029             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
2030
2031 static bool
2032 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2033 {
2034
2035         KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2036             (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2037             ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2038             prev, entry));
2039         return (prev->end == entry->start &&
2040             prev->object.vm_object == entry->object.vm_object &&
2041             (prev->object.vm_object == NULL ||
2042             prev->offset + (prev->end - prev->start) == entry->offset) &&
2043             prev->eflags == entry->eflags &&
2044             prev->protection == entry->protection &&
2045             prev->max_protection == entry->max_protection &&
2046             prev->inheritance == entry->inheritance &&
2047             prev->wired_count == entry->wired_count &&
2048             prev->cred == entry->cred);
2049 }
2050
2051 static void
2052 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2053 {
2054
2055         /*
2056          * If the backing object is a vnode object, vm_object_deallocate()
2057          * calls vrele().  However, vrele() does not lock the vnode because
2058          * the vnode has additional references.  Thus, the map lock can be
2059          * kept without causing a lock-order reversal with the vnode lock.
2060          *
2061          * Since we count the number of virtual page mappings in
2062          * object->un_pager.vnp.writemappings, the writemappings value
2063          * should not be adjusted when the entry is disposed of.
2064          */
2065         if (entry->object.vm_object != NULL)
2066                 vm_object_deallocate(entry->object.vm_object);
2067         if (entry->cred != NULL)
2068                 crfree(entry->cred);
2069         vm_map_entry_dispose(map, entry);
2070 }
2071
2072 /*
2073  *      vm_map_try_merge_entries:
2074  *
2075  *      Compare the given map entry to its predecessor, and merge its predecessor
2076  *      into it if possible.  The entry remains valid, and may be extended.
2077  *      The predecessor may be deleted.
2078  *
2079  *      The map must be locked.
2080  */
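/*
 *	As vm_map_insert() above illustrates, callers that create or resize
 *	an entry typically attempt a merge on both sides:
 *
 *		vm_map_try_merge_entries(map, prev_entry, new_entry);
 *		vm_map_try_merge_entries(map, new_entry, next_entry);
 */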
2081 void
2082 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2083     vm_map_entry_t entry)
2084 {
2085
2086         VM_MAP_ASSERT_LOCKED(map);
2087         if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2088             vm_map_mergeable_neighbors(prev_entry, entry)) {
2089                 vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2090                 vm_map_merged_neighbor_dispose(map, prev_entry);
2091         }
2092 }
2093
2094 /*
2095  *      vm_map_entry_back:
2096  *
2097  *      Allocate an object to back a map entry.
2098  */
2099 static inline void
2100 vm_map_entry_back(vm_map_entry_t entry)
2101 {
2102         vm_object_t object;
2103
2104         KASSERT(entry->object.vm_object == NULL,
2105             ("map entry %p has backing object", entry));
2106         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2107             ("map entry %p is a submap", entry));
2108         object = vm_object_allocate_anon(atop(entry->end - entry->start));
2109         entry->object.vm_object = object;
2110         entry->offset = 0;
2111         if (entry->cred != NULL) {
2112                 object->cred = entry->cred;
2113                 object->charge = entry->end - entry->start;
2114                 entry->cred = NULL;
2115         }
2116 }
2117
2118 /*
2119  *      vm_map_entry_charge_object
2120  *
2121  *      If there is no object backing this entry, create one.  Otherwise, if
2122  *      the entry has cred, give it to the backing object.
2123  */
2124 static inline void
2125 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2126 {
2127
2128         VM_MAP_ASSERT_LOCKED(map);
2129         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2130             ("map entry %p is a submap", entry));
2131         if (entry->object.vm_object == NULL && !map->system_map &&
2132             (entry->eflags & MAP_ENTRY_GUARD) == 0)
2133                 vm_map_entry_back(entry);
2134         else if (entry->object.vm_object != NULL &&
2135             ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2136             entry->cred != NULL) {
2137                 VM_OBJECT_WLOCK(entry->object.vm_object);
2138                 KASSERT(entry->object.vm_object->cred == NULL,
2139                     ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2140                 entry->object.vm_object->cred = entry->cred;
2141                 entry->object.vm_object->charge = entry->end - entry->start;
2142                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2143                 entry->cred = NULL;
2144         }
2145 }
2146
2147 /*
2148  *      vm_map_clip_start:      [ internal use only ]
2149  *
2150  *      Asserts that the given entry begins at or after
2151  *      the specified address; if necessary,
2152  *      it splits the entry into two.
2153  */
2154 #define vm_map_clip_start(map, entry, startaddr) \
2155 { \
2156         if (startaddr > entry->start) \
2157                 _vm_map_clip_start(map, entry, startaddr); \
2158 }
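/*
 * Worked example (hypothetical addresses): clipping an entry spanning
 * [0x1000, 0x5000) at startaddr 0x3000 inserts a new entry for the front
 * portion [0x1000, 0x3000) and shrinks the original to [0x3000, 0x5000),
 * advancing its offset by 0x2000 so it still maps the same object pages.
 */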
2159
2160 /*
2161  *      This routine is called only when it is known that
2162  *      the entry must be split.
2163  */
2164 static void
2165 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
2166 {
2167         vm_map_entry_t new_entry;
2168
2169         VM_MAP_ASSERT_LOCKED(map);
2170         KASSERT(entry->end > start && entry->start < start,
2171             ("_vm_map_clip_start: invalid clip of entry %p", entry));
2172
2173         /*
2174          * Create a backing object now, if none exists, so that more individual
2175          * objects won't be created after the map entry is split.
2176          */
2177         vm_map_entry_charge_object(map, entry);
2178
2179         /* Clone the entry. */
2180         new_entry = vm_map_entry_create(map);
2181         *new_entry = *entry;
2182
2183         /*
2184          * Split off the front portion.  Insert the new entry BEFORE this one,
2185          * so that this entry has the specified starting address.
2186          */
2187         new_entry->end = start;
2188         entry->offset += (start - entry->start);
2189         entry->start = start;
2190         if (new_entry->cred != NULL)
2191                 crhold(entry->cred);
2192
2193         vm_map_entry_link(map, new_entry);
2194
2195         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2196                 vm_object_reference(new_entry->object.vm_object);
2197                 vm_map_entry_set_vnode_text(new_entry, true);
2198                 /*
2199                  * The object->un_pager.vnp.writemappings for the
2200                  * object of MAP_ENTRY_WRITECNT type entry shall be
2201                  * kept as is here.  The virtual pages are
2202                  * re-distributed among the clipped entries, so the sum is
2203                  * left the same.
2204                  */
2205         }
2206 }
2207
2208 /*
2209  *      vm_map_clip_end:        [ internal use only ]
2210  *
2211  *      Asserts that the given entry ends at or before
2212  *      the specified address; if necessary,
2213  *      it splits the entry into two.
2214  */
2215 #define vm_map_clip_end(map, entry, endaddr) \
2216 { \
2217         if ((endaddr) < (entry->end)) \
2218                 _vm_map_clip_end((map), (entry), (endaddr)); \
2219 }
2220
2221 /*
2222  *      This routine is called only when it is known that
2223  *      the entry must be split.
2224  */
2225 static void
2226 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
2227 {
2228         vm_map_entry_t new_entry;
2229
2230         VM_MAP_ASSERT_LOCKED(map);
2231         KASSERT(entry->start < end && entry->end > end,
2232             ("_vm_map_clip_end: invalid clip of entry %p", entry));
2233
2234         /*
2235          * Create a backing object now, if none exists, so that more individual
2236          * objects won't be created after the map entry is split.
2237          */
2238         vm_map_entry_charge_object(map, entry);
2239
2240         /* Clone the entry. */
2241         new_entry = vm_map_entry_create(map);
2242         *new_entry = *entry;
2243
2244         /*
2245          * Split off the back portion.  Insert the new entry AFTER this one,
2246          * so that this entry has the specified ending address.
2247          */
2248         new_entry->start = entry->end = end;
2249         new_entry->offset += (end - entry->start);
2250         if (new_entry->cred != NULL)
2251                 crhold(entry->cred);
2252
2253         vm_map_entry_link(map, new_entry);
2254
2255         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2256                 vm_object_reference(new_entry->object.vm_object);
2257                 vm_map_entry_set_vnode_text(new_entry, true);
2258         }
2259 }
2260
2261 /*
2262  *      vm_map_submap:          [ kernel use only ]
2263  *
2264  *      Mark the given range as handled by a subordinate map.
2265  *
2266  *      This range must have been created with vm_map_find,
2267  *      and no other operations may have been performed on this
2268  *      range prior to calling vm_map_submap.
2269  *
2270  *      Only a limited number of operations can be performed
2271  *      within this range after calling vm_map_submap:
2272  *              vm_fault
2273  *      [Don't try vm_map_copy!]
2274  *
2275  *      To remove a submapping, one must first remove the
2276  *      range from the superior map, and then destroy the
2277  *      submap (if desired).  [Better yet, don't try it.]
2278  */
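/*
 *	An illustrative sketch (hypothetical caller; "parent_map" and
 *	"submap" are placeholders): after reserving [start, end) in the
 *	parent map with vm_map_find(), the range is handed over with
 *
 *		rv = vm_map_submap(parent_map, start, end, submap);
 *
 *	A return other than KERN_SUCCESS means the range did not form a
 *	single clean entry, and the submap's MAP_IS_SUB_MAP flag is cleared
 *	again.
 */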
2279 int
2280 vm_map_submap(
2281         vm_map_t map,
2282         vm_offset_t start,
2283         vm_offset_t end,
2284         vm_map_t submap)
2285 {
2286         vm_map_entry_t entry;
2287         int result;
2288
2289         result = KERN_INVALID_ARGUMENT;
2290
2291         vm_map_lock(submap);
2292         submap->flags |= MAP_IS_SUB_MAP;
2293         vm_map_unlock(submap);
2294
2295         vm_map_lock(map);
2296
2297         VM_MAP_RANGE_CHECK(map, start, end);
2298
2299         if (vm_map_lookup_entry(map, start, &entry)) {
2300                 vm_map_clip_start(map, entry, start);
2301         } else
2302                 entry = vm_map_entry_succ(entry);
2303
2304         vm_map_clip_end(map, entry, end);
2305
2306         if ((entry->start == start) && (entry->end == end) &&
2307             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2308             (entry->object.vm_object == NULL)) {
2309                 entry->object.sub_map = submap;
2310                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2311                 result = KERN_SUCCESS;
2312         }
2313         vm_map_unlock(map);
2314
2315         if (result != KERN_SUCCESS) {
2316                 vm_map_lock(submap);
2317                 submap->flags &= ~MAP_IS_SUB_MAP;
2318                 vm_map_unlock(submap);
2319         }
2320         return (result);
2321 }
2322
2323 /*
2324  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2325  */
2326 #define MAX_INIT_PT     96
2327
2328 /*
2329  *      vm_map_pmap_enter:
2330  *
2331  *      Preload the specified map's pmap with mappings to the specified
2332  *      object's memory-resident pages.  No further physical pages are
2333  *      allocated, and no further virtual pages are retrieved from secondary
2334  *      storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
2335  *      limited number of page mappings are created at the low-end of the
2336  *      specified address range.  (For this purpose, a superpage mapping
2337  *      counts as one page mapping.)  Otherwise, all resident pages within
2338  *      the specified address range are mapped.
2339  */
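/*
 *	Callers in this file include vm_map_insert() and vm_map_madvise(),
 *	e.g. the prefault done when a new entry is inserted:
 *
 *		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
 *		    end - start, cow & MAP_PREFAULT_PARTIAL);
 */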
2340 static void
2341 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2342     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2343 {
2344         vm_offset_t start;
2345         vm_page_t p, p_start;
2346         vm_pindex_t mask, psize, threshold, tmpidx;
2347
2348         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2349                 return;
2350         VM_OBJECT_RLOCK(object);
2351         if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2352                 VM_OBJECT_RUNLOCK(object);
2353                 VM_OBJECT_WLOCK(object);
2354                 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2355                         pmap_object_init_pt(map->pmap, addr, object, pindex,
2356                             size);
2357                         VM_OBJECT_WUNLOCK(object);
2358                         return;
2359                 }
2360                 VM_OBJECT_LOCK_DOWNGRADE(object);
2361         }
2362
2363         psize = atop(size);
2364         if (psize + pindex > object->size) {
2365                 if (object->size < pindex) {
2366                         VM_OBJECT_RUNLOCK(object);
2367                         return;
2368                 }
2369                 psize = object->size - pindex;
2370         }
2371
2372         start = 0;
2373         p_start = NULL;
2374         threshold = MAX_INIT_PT;
2375
2376         p = vm_page_find_least(object, pindex);
2377         /*
2378          * Assert: the variable p is either (1) the page with the
2379          * least pindex greater than or equal to the parameter pindex
2380          * or (2) NULL.
2381          */
2382         for (;
2383              p != NULL && (tmpidx = p->pindex - pindex) < psize;
2384              p = TAILQ_NEXT(p, listq)) {
2385                 /*
2386                  * Don't allow madvise to blow away our really free
2387                  * pages by allocating pv entries.
2388                  */
2389                 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2390                     vm_page_count_severe()) ||
2391                     ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2392                     tmpidx >= threshold)) {
2393                         psize = tmpidx;
2394                         break;
2395                 }
2396                 if (vm_page_all_valid(p)) {
2397                         if (p_start == NULL) {
2398                                 start = addr + ptoa(tmpidx);
2399                                 p_start = p;
2400                         }
2401                         /* Jump ahead if a superpage mapping is possible. */
2402                         if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2403                             (pagesizes[p->psind] - 1)) == 0) {
2404                                 mask = atop(pagesizes[p->psind]) - 1;
2405                                 if (tmpidx + mask < psize &&
2406                                     vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2407                                         p += mask;
2408                                         threshold += mask;
2409                                 }
2410                         }
2411                 } else if (p_start != NULL) {
2412                         pmap_enter_object(map->pmap, start, addr +
2413                             ptoa(tmpidx), p_start, prot);
2414                         p_start = NULL;
2415                 }
2416         }
2417         if (p_start != NULL)
2418                 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2419                     p_start, prot);
2420         VM_OBJECT_RUNLOCK(object);
2421 }
2422
2423 /*
2424  *      vm_map_protect:
2425  *
2426  *      Sets the protection of the specified address
2427  *      region in the target map.  If "set_max" is
2428  *      specified, the maximum protection is to be set;
2429  *      otherwise, only the current protection is affected.
2430  */
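/*
 *	An illustrative sketch (hypothetical caller): making a range
 *	read-only without touching the maximum protection could be done with
 *
 *		rv = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *
 *	Passing TRUE for set_max instead replaces max_protection and, as the
 *	code below shows, clamps the current protection to the new maximum.
 */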
2431 int
2432 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2433                vm_prot_t new_prot, boolean_t set_max)
2434 {
2435         vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2436         vm_object_t obj;
2437         struct ucred *cred;
2438         vm_prot_t old_prot;
2439         int rv;
2440
2441         if (start == end)
2442                 return (KERN_SUCCESS);
2443
2444 again:
2445         in_tran = NULL;
2446         vm_map_lock(map);
2447
2448         /*
2449          * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
2450          * need to fault pages into the map and will drop the map lock while
2451          * doing so, and the VM object may end up in an inconsistent state if we
2452          * update the protection on the map entry in between faults.
2453          */
2454         vm_map_wait_busy(map);
2455
2456         VM_MAP_RANGE_CHECK(map, start, end);
2457
2458         if (!vm_map_lookup_entry(map, start, &first_entry))
2459                 first_entry = vm_map_entry_succ(first_entry);
2460
2461         /*
2462          * Make a first pass to check for protection violations.
2463          */
2464         for (entry = first_entry; entry->start < end;
2465             entry = vm_map_entry_succ(entry)) {
2466                 if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
2467                         continue;
2468                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2469                         vm_map_unlock(map);
2470                         return (KERN_INVALID_ARGUMENT);
2471                 }
2472                 if ((new_prot & entry->max_protection) != new_prot) {
2473                         vm_map_unlock(map);
2474                         return (KERN_PROTECTION_FAILURE);
2475                 }
2476                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2477                         in_tran = entry;
2478         }
2479
2480         /*
2481          * Postpone the operation until all in-transition map entries have
2482          * stabilized.  An in-transition entry might already have its pages
2483          * wired and wired_count incremented, but not yet have its
2484  *      MAP_ENTRY_USER_WIRED flag set, in which case we would fail to call
2485          * vm_fault_copy_entry() in the final loop below.
2486          */
2487         if (in_tran != NULL) {
2488                 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2489                 vm_map_unlock_and_wait(map, 0);
2490                 goto again;
2491         }
2492
2493         /*
2494          * Before changing the protections, try to reserve swap space for any
2495          * private (i.e., copy-on-write) mappings that are transitioning from
2496          * read-only to read/write access.  If a reservation fails, break out
2497          * of this loop early and let the next loop simplify the entries, since
2498          * some may now be mergeable.
2499          */
2500         rv = KERN_SUCCESS;
2501         vm_map_clip_start(map, first_entry, start);
2502         for (entry = first_entry; entry->start < end;
2503             entry = vm_map_entry_succ(entry)) {
2504                 vm_map_clip_end(map, entry, end);
2505
2506                 if (set_max ||
2507                     ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2508                     ENTRY_CHARGED(entry) ||
2509                     (entry->eflags & MAP_ENTRY_GUARD) != 0) {
2510                         continue;
2511                 }
2512
2513                 cred = curthread->td_ucred;
2514                 obj = entry->object.vm_object;
2515
2516                 if (obj == NULL ||
2517                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2518                         if (!swap_reserve(entry->end - entry->start)) {
2519                                 rv = KERN_RESOURCE_SHORTAGE;
2520                                 end = entry->end;
2521                                 break;
2522                         }
2523                         crhold(cred);
2524                         entry->cred = cred;
2525                         continue;
2526                 }
2527
2528                 VM_OBJECT_WLOCK(obj);
2529                 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2530                         VM_OBJECT_WUNLOCK(obj);
2531                         continue;
2532                 }
2533
2534                 /*
2535                  * Charge for the whole object allocation now, since
2536                  * we cannot distinguish between non-charged and
2537                  * charged clipped mapping of the same object later.
2538                  */
2539                 KASSERT(obj->charge == 0,
2540                     ("vm_map_protect: object %p overcharged (entry %p)",
2541                     obj, entry));
2542                 if (!swap_reserve(ptoa(obj->size))) {
2543                         VM_OBJECT_WUNLOCK(obj);
2544                         rv = KERN_RESOURCE_SHORTAGE;
2545                         end = entry->end;
2546                         break;
2547                 }
2548
2549                 crhold(cred);
2550                 obj->cred = cred;
2551                 obj->charge = ptoa(obj->size);
2552                 VM_OBJECT_WUNLOCK(obj);
2553         }
2554
2555         /*
2556          * If enough swap space was available, go back and fix up protections.
2557          * Otherwise, just simplify entries, since some may have been modified.
2558          * [Note that clipping is not necessary the second time.]
2559          */
2560         for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2561             entry->start < end;
2562             vm_map_try_merge_entries(map, prev_entry, entry),
2563             prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2564                 if (rv != KERN_SUCCESS ||
2565                     (entry->eflags & MAP_ENTRY_GUARD) != 0)
2566                         continue;
2567
2568                 old_prot = entry->protection;
2569
2570                 if (set_max)
2571                         entry->protection =
2572                             (entry->max_protection = new_prot) &
2573                             old_prot;
2574                 else
2575                         entry->protection = new_prot;
2576
2577                 /*
2578                  * For user wired map entries, the normal lazy evaluation of
2579                  * write access upgrades through soft page faults is
2580                  * undesirable.  Instead, immediately copy any pages that are
2581                  * copy-on-write and enable write access in the physical map.
2582                  */
2583                 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2584                     (entry->protection & VM_PROT_WRITE) != 0 &&
2585                     (old_prot & VM_PROT_WRITE) == 0)
2586                         vm_fault_copy_entry(map, map, entry, entry, NULL);
2587
2588                 /*
2589                  * When restricting access, update the physical map.  Worry
2590                  * about copy-on-write here.
2591                  */
2592                 if ((old_prot & ~entry->protection) != 0) {
2593 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2594                                                         VM_PROT_ALL)
2595                         pmap_protect(map->pmap, entry->start,
2596                             entry->end,
2597                             entry->protection & MASK(entry));
2598 #undef  MASK
2599                 }
2600         }
2601         vm_map_try_merge_entries(map, prev_entry, entry);
2602         vm_map_unlock(map);
2603         return (rv);
2604 }
2605
2606 /*
2607  *      vm_map_madvise:
2608  *
2609  *      This routine traverses a process's map, handling the madvise
2610  *      system call.  Advisories are classified as either those affecting
2611  *      the vm_map_entry structure, or those affecting the underlying
2612  *      objects.
2613  */
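/*
 *	For example, MADV_NOSYNC only flips a flag in the map entries and so
 *	takes the exclusive map lock below, while MADV_WILLNEED operates on
 *	the pages of the backing object and needs only a read lock:
 *
 *		(void)vm_map_madvise(map, start, end, MADV_WILLNEED);
 */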
2614 int
2615 vm_map_madvise(
2616         vm_map_t map,
2617         vm_offset_t start,
2618         vm_offset_t end,
2619         int behav)
2620 {
2621         vm_map_entry_t entry, prev_entry;
2622         bool modify_map;
2623
2624         /*
2625          * Some madvise calls directly modify the vm_map_entry, in which case
2626          * we need to use an exclusive lock on the map and we need to perform
2627          * various clipping operations.  Otherwise we only need a read-lock
2628          * on the map.
2629          */
2630         switch(behav) {
2631         case MADV_NORMAL:
2632         case MADV_SEQUENTIAL:
2633         case MADV_RANDOM:
2634         case MADV_NOSYNC:
2635         case MADV_AUTOSYNC:
2636         case MADV_NOCORE:
2637         case MADV_CORE:
2638                 if (start == end)
2639                         return (0);
2640                 modify_map = true;
2641                 vm_map_lock(map);
2642                 break;
2643         case MADV_WILLNEED:
2644         case MADV_DONTNEED:
2645         case MADV_FREE:
2646                 if (start == end)
2647                         return (0);
2648                 modify_map = false;
2649                 vm_map_lock_read(map);
2650                 break;
2651         default:
2652                 return (EINVAL);
2653         }
2654
2655         /*
2656          * Locate starting entry and clip if necessary.
2657          */
2658         VM_MAP_RANGE_CHECK(map, start, end);
2659
2660         if (vm_map_lookup_entry(map, start, &entry)) {
2661                 if (modify_map)
2662                         vm_map_clip_start(map, entry, start);
2663                 prev_entry = vm_map_entry_pred(entry);
2664         } else {
2665                 prev_entry = entry;
2666                 entry = vm_map_entry_succ(entry);
2667         }
2668
2669         if (modify_map) {
2670                 /*
2671                  * madvise behaviors that are implemented in the vm_map_entry.
2672                  *
2673                  * We clip the vm_map_entry so that behavioral changes are
2674                  * limited to the specified address range.
2675                  */
2676                 for (; entry->start < end;
2677                      prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2678                         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
2679                                 continue;
2680
2681                         vm_map_clip_end(map, entry, end);
2682
2683                         switch (behav) {
2684                         case MADV_NORMAL:
2685                                 vm_map_entry_set_behavior(entry,
2686                                     MAP_ENTRY_BEHAV_NORMAL);
2687                                 break;
2688                         case MADV_SEQUENTIAL:
2689                                 vm_map_entry_set_behavior(entry,
2690                                     MAP_ENTRY_BEHAV_SEQUENTIAL);
2691                                 break;
2692                         case MADV_RANDOM:
2693                                 vm_map_entry_set_behavior(entry,
2694                                     MAP_ENTRY_BEHAV_RANDOM);
2695                                 break;
2696                         case MADV_NOSYNC:
2697                                 entry->eflags |= MAP_ENTRY_NOSYNC;
2698                                 break;
2699                         case MADV_AUTOSYNC:
2700                                 entry->eflags &= ~MAP_ENTRY_NOSYNC;
2701                                 break;
2702                         case MADV_NOCORE:
2703                                 entry->eflags |= MAP_ENTRY_NOCOREDUMP;
2704                                 break;
2705                         case MADV_CORE:
2706                                 entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2707                                 break;
2708                         default:
2709                                 break;
2710                         }
2711                         vm_map_try_merge_entries(map, prev_entry, entry);
2712                 }
2713                 vm_map_try_merge_entries(map, prev_entry, entry);
2714                 vm_map_unlock(map);
2715         } else {
2716                 vm_pindex_t pstart, pend;
2717
2718                 /*
2719                  * madvise behaviors that are implemented in the underlying
2720                  * vm_object.
2721                  *
2722                  * Since we don't clip the vm_map_entry, we have to clip
2723                  * the vm_object pindex and count.
2724                  */
2725                 for (; entry->start < end;
2726                     entry = vm_map_entry_succ(entry)) {
2727                         vm_offset_t useEnd, useStart;
2728
2729                         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
2730                                 continue;
2731
2732                         /*
2733                          * MADV_FREE would otherwise rewind time to
2734                          * the creation of the shadow object.  Because
2735                          * we hold the VM map read-locked, neither the
2736                          * entry's object nor the presence of a
2737                          * backing object can change.
2738                          */
2739                         if (behav == MADV_FREE &&
2740                             entry->object.vm_object != NULL &&
2741                             entry->object.vm_object->backing_object != NULL)
2742                                 continue;
2743
2744                         pstart = OFF_TO_IDX(entry->offset);
2745                         pend = pstart + atop(entry->end - entry->start);
2746                         useStart = entry->start;
2747                         useEnd = entry->end;
2748
2749                         if (entry->start < start) {
2750                                 pstart += atop(start - entry->start);
2751                                 useStart = start;
2752                         }
2753                         if (entry->end > end) {
2754                                 pend -= atop(entry->end - end);
2755                                 useEnd = end;
2756                         }
2757
2758                         if (pstart >= pend)
2759                                 continue;
2760
2761                         /*
2762                          * Perform the pmap_advise() before clearing
2763                          * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2764                          * concurrent pmap operation, such as pmap_remove(),
2765                          * could clear a reference in the pmap and set
2766                          * PGA_REFERENCED on the page before the pmap_advise()
2767                          * had completed.  Consequently, the page would appear
2768                          * referenced based upon an old reference that
2769                          * occurred before this pmap_advise() ran.
2770                          */
2771                         if (behav == MADV_DONTNEED || behav == MADV_FREE)
2772                                 pmap_advise(map->pmap, useStart, useEnd,
2773                                     behav);
2774
2775                         vm_object_madvise(entry->object.vm_object, pstart,
2776                             pend, behav);
2777
2778                         /*
2779                          * Pre-populate paging structures in the
2780                          * WILLNEED case.  For wired entries, the
2781                          * paging structures are already populated.
2782                          */
2783                         if (behav == MADV_WILLNEED &&
2784                             entry->wired_count == 0) {
2785                                 vm_map_pmap_enter(map,
2786                                     useStart,
2787                                     entry->protection,
2788                                     entry->object.vm_object,
2789                                     pstart,
2790                                     ptoa(pend - pstart),
2791                                     MAP_PREFAULT_MADVISE
2792                                 );
2793                         }
2794                 }
2795                 vm_map_unlock_read(map);
2796         }
2797         return (0);
2798 }
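
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * the pindex clipping performed by the loop above, written out as a
 * stand-alone helper.  The function name and parameters are hypothetical;
 * the arithmetic mirrors the code above.
 */
static void
example_clip_pindex(vm_map_entry_t entry, vm_offset_t start, vm_offset_t end,
    vm_pindex_t *pstart, vm_pindex_t *pend)
{

        /* Page indices spanned by the whole entry within its object. */
        *pstart = OFF_TO_IDX(entry->offset);
        *pend = *pstart + atop(entry->end - entry->start);

        /* Trim any leading pages that fall before the requested range. */
        if (entry->start < start)
                *pstart += atop(start - entry->start);
        /* Trim any trailing pages that fall after the requested range. */
        if (entry->end > end)
                *pend -= atop(entry->end - end);
}
#endif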
2799
2800
2801 /*
2802  *      vm_map_inherit:
2803  *
2804  *      Sets the inheritance of the specified address
2805  *      range in the target map.  Inheritance
2806  *      affects how the map will be shared with
2807  *      child maps at the time of vmspace_fork.
2808  */
2809 int
2810 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2811                vm_inherit_t new_inheritance)
2812 {
2813         vm_map_entry_t entry, prev_entry;
2814
2815         switch (new_inheritance) {
2816         case VM_INHERIT_NONE:
2817         case VM_INHERIT_COPY:
2818         case VM_INHERIT_SHARE:
2819         case VM_INHERIT_ZERO:
2820                 break;
2821         default:
2822                 return (KERN_INVALID_ARGUMENT);
2823         }
2824         if (start == end)
2825                 return (KERN_SUCCESS);
2826         vm_map_lock(map);
2827         VM_MAP_RANGE_CHECK(map, start, end);
2828         if (vm_map_lookup_entry(map, start, &prev_entry)) {
2829                 entry = prev_entry;
2830                 vm_map_clip_start(map, entry, start);
2831                 prev_entry = vm_map_entry_pred(entry);
2832         } else
2833                 entry = vm_map_entry_succ(prev_entry);
2834         for (; entry->start < end;
2835             prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2836                 vm_map_clip_end(map, entry, end);
2837                 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2838                     new_inheritance != VM_INHERIT_ZERO)
2839                         entry->inheritance = new_inheritance;
2840                 vm_map_try_merge_entries(map, prev_entry, entry);
2841         }
2842         vm_map_try_merge_entries(map, prev_entry, entry);
2843         vm_map_unlock(map);
2844         return (KERN_SUCCESS);
2845 }
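
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * a caller marking a range so that children created by vmspace_fork()
 * receive a shared mapping of it.  The function name, map, and range are
 * hypothetical.
 */
static int
example_share_with_children(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        /* An unknown inheritance value would yield KERN_INVALID_ARGUMENT. */
        return (vm_map_inherit(map, start, end, VM_INHERIT_SHARE));
}
#endif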
2846
2847 /*
2848  *      vm_map_entry_in_transition:
2849  *
2850  *      Release the map lock, and sleep until the entry is no longer in
2851  *      transition.  Wake up and reacquire the map lock.  If the map changed while
2852  *      another thread held the lock, look up a possibly-changed entry at or after the
2853  *      'start' position of the old entry.
2854  */
2855 static vm_map_entry_t
2856 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
2857     vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
2858 {
2859         vm_map_entry_t entry;
2860         vm_offset_t start;
2861         u_int last_timestamp;
2862
2863         VM_MAP_ASSERT_LOCKED(map);
2864         KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2865             ("not in-transition map entry %p", in_entry));
2866         /*
2867          * We have not yet clipped the entry.
2868          */
2869         start = MAX(in_start, in_entry->start);
2870         in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2871         last_timestamp = map->timestamp;
2872         if (vm_map_unlock_and_wait(map, 0)) {
2873                 /*
2874                  * Allow interruption of user wiring/unwiring?
2875                  */
2876         }
2877         vm_map_lock(map);
2878         if (last_timestamp + 1 == map->timestamp)
2879                 return (in_entry);
2880
2881         /*
2882          * Look again for the entry because the map was modified while it was
2883          * unlocked.  Specifically, the entry may have been clipped, merged, or
2884          * deleted.
2885          */
2886         if (!vm_map_lookup_entry(map, start, &entry)) {
2887                 if (!holes_ok) {
2888                         *io_end = start;
2889                         return (NULL);
2890                 }
2891                 entry = vm_map_entry_succ(entry);
2892         }
2893         return (entry);
2894 }
2895
2896 /*
2897  *      vm_map_unwire:
2898  *
2899  *      Implements both kernel and user unwiring.
2900  */
2901 int
2902 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2903     int flags)
2904 {
2905         vm_map_entry_t entry, first_entry, next_entry, prev_entry;
2906         int rv;
2907         bool holes_ok, need_wakeup, user_unwire;
2908
2909         if (start == end)
2910                 return (KERN_SUCCESS);
2911         holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
2912         user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
2913         vm_map_lock(map);
2914         VM_MAP_RANGE_CHECK(map, start, end);
2915         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2916                 if (holes_ok)
2917                         first_entry = vm_map_entry_succ(first_entry);
2918                 else {
2919                         vm_map_unlock(map);
2920                         return (KERN_INVALID_ADDRESS);
2921                 }
2922         }
2923         rv = KERN_SUCCESS;
2924         for (entry = first_entry; entry->start < end; entry = next_entry) {
2925                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2926                         /*
2927                          * We have not yet clipped the entry.
2928                          */
2929                         next_entry = vm_map_entry_in_transition(map, start,
2930                             &end, holes_ok, entry);
2931                         if (next_entry == NULL) {
2932                                 if (entry == first_entry) {
2933                                         vm_map_unlock(map);
2934                                         return (KERN_INVALID_ADDRESS);
2935                                 }
2936                                 rv = KERN_INVALID_ADDRESS;
2937                                 break;
2938                         }
2939                         first_entry = (entry == first_entry) ?
2940                             next_entry : NULL;
2941                         continue;
2942                 }
2943                 vm_map_clip_start(map, entry, start);
2944                 vm_map_clip_end(map, entry, end);
2945                 /*
2946                  * Mark the entry in case the map lock is released.  (See
2947                  * above.)
2948                  */
2949                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2950                     entry->wiring_thread == NULL,
2951                     ("owned map entry %p", entry));
2952                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2953                 entry->wiring_thread = curthread;
2954                 next_entry = vm_map_entry_succ(entry);
2955                 /*
2956                  * Check the map for holes in the specified region.
2957                  * If holes_ok, skip this check.
2958                  */
2959                 if (!holes_ok &&
2960                     entry->end < end && next_entry->start > entry->end) {
2961                         end = entry->end;
2962                         rv = KERN_INVALID_ADDRESS;
2963                         break;
2964                 }
2965                 /*
2966                  * If system unwiring, require that the entry is system wired.
2967                  */
2968                 if (!user_unwire &&
2969                     vm_map_entry_system_wired_count(entry) == 0) {
2970                         end = entry->end;
2971                         rv = KERN_INVALID_ARGUMENT;
2972                         break;
2973                 }
2974         }
2975         need_wakeup = false;
2976         if (first_entry == NULL &&
2977             !vm_map_lookup_entry(map, start, &first_entry)) {
2978                 KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
2979                 prev_entry = first_entry;
2980                 entry = vm_map_entry_succ(first_entry);
2981         } else {
2982                 prev_entry = vm_map_entry_pred(first_entry);
2983                 entry = first_entry;
2984         }
2985         for (; entry->start < end;
2986             prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2987                 /*
2988                  * If holes_ok was specified, an empty
2989                  * space in the unwired region could have been mapped
2990                  * while the map lock was dropped for draining
2991                  * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2992                  * could be simultaneously wiring this new mapping
2993                  * entry.  Detect these cases and skip any entries
2994                  * entry.  Detect these cases and skip any entries
2995                  * not marked as in transition by us.
2996                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2997                     entry->wiring_thread != curthread) {
2998                         KASSERT(holes_ok,
2999                             ("vm_map_unwire: !HOLESOK and new/changed entry"));
3000                         continue;
3001                 }
3002
3003                 if (rv == KERN_SUCCESS && (!user_unwire ||
3004                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
3005                         if (entry->wired_count == 1)
3006                                 vm_map_entry_unwire(map, entry);
3007                         else
3008                                 entry->wired_count--;
3009                         if (user_unwire)
3010                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3011                 }
3012                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3013                     ("vm_map_unwire: in-transition flag missing %p", entry));
3014                 KASSERT(entry->wiring_thread == curthread,
3015                     ("vm_map_unwire: alien wire %p", entry));
3016                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3017                 entry->wiring_thread = NULL;
3018                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3019                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3020                         need_wakeup = true;
3021                 }
3022                 vm_map_try_merge_entries(map, prev_entry, entry);
3023         }
3024         vm_map_try_merge_entries(map, prev_entry, entry);
3025         vm_map_unlock(map);
3026         if (need_wakeup)
3027                 vm_map_wakeup(map);
3028         return (rv);
3029 }
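
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * a caller that user-wires a range, tolerating unmapped holes, and later
 * unwires it with matching flags.  The function name, map, and range are
 * hypothetical.
 */
static int
example_user_wire_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        int rv;

        rv = vm_map_wire(map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK);
        if (rv != KERN_SUCCESS)
                return (rv);
        /* ... use the wired range ... */
        return (vm_map_unwire(map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_HOLESOK));
}
#endif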
3030
3031 static void
3032 vm_map_wire_user_count_sub(u_long npages)
3033 {
3034
3035         atomic_subtract_long(&vm_user_wire_count, npages);
3036 }
3037
3038 static bool
3039 vm_map_wire_user_count_add(u_long npages)
3040 {
3041         u_long wired;
3042
3043         wired = vm_user_wire_count;
3044         do {
3045                 if (npages + wired > vm_page_max_user_wired)
3046                         return (false);
3047         } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3048             npages + wired));
3049
3050         return (true);
3051 }
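
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * how a caller is expected to pair the lock-free reservation above with
 * its release if the subsequent wiring fails.  The function name and
 * parameters are hypothetical.
 */
static int
example_reserve_user_pages(u_long npages, bool wiring_failed)
{

        if (!vm_map_wire_user_count_add(npages))
                return (KERN_RESOURCE_SHORTAGE);
        /* ... attempt to wire npages pages ... */
        if (wiring_failed) {
                /* Return the reservation so the global count stays balanced. */
                vm_map_wire_user_count_sub(npages);
                return (KERN_RESOURCE_SHORTAGE);
        }
        return (KERN_SUCCESS);
}
#endif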
3052
3053 /*
3054  *      vm_map_wire_entry_failure:
3055  *
3056  *      Handle a wiring failure on the given entry.
3057  *
3058  *      The map should be locked.
3059  */
3060 static void
3061 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3062     vm_offset_t failed_addr)
3063 {
3064
3065         VM_MAP_ASSERT_LOCKED(map);
3066         KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3067             entry->wired_count == 1,
3068             ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3069         KASSERT(failed_addr < entry->end,
3070             ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3071
3072         /*
3073          * If any pages at the start of this entry were successfully wired,
3074          * then unwire them.
3075          */
3076         if (failed_addr > entry->start) {
3077                 pmap_unwire(map->pmap, entry->start, failed_addr);
3078                 vm_object_unwire(entry->object.vm_object, entry->offset,
3079                     failed_addr - entry->start, PQ_ACTIVE);
3080         }
3081
3082         /*
3083          * Assign an out-of-range value to represent the failure to wire this
3084          * entry.
3085          */
3086         entry->wired_count = -1;
3087 }
3088
3089 int
3090 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3091 {
3092         int rv;
3093
3094         vm_map_lock(map);
3095         rv = vm_map_wire_locked(map, start, end, flags);
3096         vm_map_unlock(map);
3097         return (rv);
3098 }
3099
3100
3101 /*
3102  *      vm_map_wire_locked:
3103  *
3104  *      Implements both kernel and user wiring.  Returns with the map locked;
3105  *      the map lock may be dropped and reacquired during the call.
3106  */
3107 int
3108 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3109 {
3110         vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3111         vm_offset_t faddr, saved_end, saved_start;
3112         u_long npages;
3113         u_int last_timestamp;
3114         int rv;
3115         bool holes_ok, need_wakeup, user_wire;
3116         vm_prot_t prot;
3117
3118         VM_MAP_ASSERT_LOCKED(map);
3119
3120         if (start == end)
3121                 return (KERN_SUCCESS);
3122         prot = 0;
3123         if (flags & VM_MAP_WIRE_WRITE)
3124                 prot |= VM_PROT_WRITE;
3125         holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3126         user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3127         VM_MAP_RANGE_CHECK(map, start, end);
3128         if (!vm_map_lookup_entry(map, start, &first_entry)) {
3129                 if (holes_ok)
3130                         first_entry = vm_map_entry_succ(first_entry);
3131                 else
3132                         return (KERN_INVALID_ADDRESS);
3133         }
3134         for (entry = first_entry; entry->start < end; entry = next_entry) {
3135                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3136                         /*
3137                          * We have not yet clipped the entry.
3138                          */
3139                         next_entry = vm_map_entry_in_transition(map, start,
3140                             &end, holes_ok, entry);
3141                         if (next_entry == NULL) {
3142                                 if (entry == first_entry)
3143                                         return (KERN_INVALID_ADDRESS);
3144                                 rv = KERN_INVALID_ADDRESS;
3145                                 goto done;
3146                         }
3147                         first_entry = (entry == first_entry) ?
3148                             next_entry : NULL;
3149                         continue;
3150                 }
3151                 vm_map_clip_start(map, entry, start);
3152                 vm_map_clip_end(map, entry, end);
3153                 /*
3154                  * Mark the entry in case the map lock is released.  (See
3155                  * above.)
3156                  */
3157                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3158                     entry->wiring_thread == NULL,
3159                     ("owned map entry %p", entry));
3160                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3161                 entry->wiring_thread = curthread;
3162                 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3163                     || (entry->protection & prot) != prot) {
3164                         entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3165                         if (!holes_ok) {
3166                                 end = entry->end;
3167                                 rv = KERN_INVALID_ADDRESS;
3168                                 goto done;
3169                         }
3170                 } else if (entry->wired_count == 0) {
3171                         entry->wired_count++;
3172
3173                         npages = atop(entry->end - entry->start);
3174                         if (user_wire && !vm_map_wire_user_count_add(npages)) {
3175                                 vm_map_wire_entry_failure(map, entry,
3176                                     entry->start);
3177                                 end = entry->end;
3178                                 rv = KERN_RESOURCE_SHORTAGE;
3179                                 goto done;
3180                         }
3181
3182                         /*
3183                          * Release the map lock, relying on the in-transition
3184                          * mark.  Mark the map busy for fork.
3185                          */
3186                         saved_start = entry->start;
3187                         saved_end = entry->end;
3188                         last_timestamp = map->timestamp;
3189                         vm_map_busy(map);
3190                         vm_map_unlock(map);
3191
3192                         faddr = saved_start;
3193                         do {
3194                                 /*
3195                                  * Simulate a fault to get the page and enter
3196                                  * it into the physical map.
3197                                  */
3198                                 if ((rv = vm_fault(map, faddr,
3199                                     VM_PROT_NONE, VM_FAULT_WIRE, NULL)) !=
3200                                     KERN_SUCCESS)
3201                                         break;
3202                         } while ((faddr += PAGE_SIZE) < saved_end);
3203                         vm_map_lock(map);
3204                         vm_map_unbusy(map);
3205                         if (last_timestamp + 1 != map->timestamp) {
3206                                 /*
3207                                  * Look again for the entry because the map was
3208                                  * modified while it was unlocked.  The entry
3209                                  * may have been clipped, but NOT merged or
3210                                  * deleted.
3211                                  */
3212                                 if (!vm_map_lookup_entry(map, saved_start,
3213                                     &next_entry))
3214                                         KASSERT(false,
3215                                             ("vm_map_wire: lookup failed"));
3216                                 first_entry = (entry == first_entry) ?
3217                                     next_entry : NULL;
3218                                 for (entry = next_entry; entry->end < saved_end;
3219                                     entry = vm_map_entry_succ(entry)) {
3220                                         /*
3221                                          * In case of failure, handle entries
3222                                          * that were not fully wired here;
3223                                          * fully wired entries are handled
3224                                          * later.
3225                                          */
3226                                         if (rv != KERN_SUCCESS &&
3227                                             faddr < entry->end)
3228                                                 vm_map_wire_entry_failure(map,
3229                                                     entry, faddr);
3230                                 }
3231                         }
3232                         if (rv != KERN_SUCCESS) {
3233                                 vm_map_wire_entry_failure(map, entry, faddr);
3234                                 if (user_wire)
3235                                         vm_map_wire_user_count_sub(npages);
3236                                 end = entry->end;
3237                                 goto done;
3238                         }
3239                 } else if (!user_wire ||
3240                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3241                         entry->wired_count++;
3242                 }
3243                 /*
3244                  * Check the map for holes in the specified region.
3245                  * If holes_ok was specified, skip this check.
3246                  */
3247                 next_entry = vm_map_entry_succ(entry);
3248                 if (!holes_ok &&
3249                     entry->end < end && next_entry->start > entry->end) {
3250                         end = entry->end;
3251                         rv = KERN_INVALID_ADDRESS;
3252                         goto done;
3253                 }
3254         }
3255         rv = KERN_SUCCESS;
3256 done:
3257         need_wakeup = false;
3258         if (first_entry == NULL &&
3259             !vm_map_lookup_entry(map, start, &first_entry)) {
3260                 KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3261                 prev_entry = first_entry;
3262                 entry = vm_map_entry_succ(first_entry);
3263         } else {
3264                 prev_entry = vm_map_entry_pred(first_entry);
3265                 entry = first_entry;
3266         }
3267         for (; entry->start < end;
3268             prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3269                 /*
3270                  * If holes_ok was specified, an empty
3271                  * space in the unwired region could have been mapped
3272                  * while the map lock was dropped for faulting in the
3273                  * pages or draining MAP_ENTRY_IN_TRANSITION.
3274                  * Moreover, another thread could be simultaneously
3275                  * wiring this new mapping entry.  Detect these cases
3276                  * and skip any entries marked as in transition not by us.
3277                  */
3278                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3279                     entry->wiring_thread != curthread) {
3280                         KASSERT(holes_ok,
3281                             ("vm_map_wire: !HOLESOK and new/changed entry"));
3282                         continue;
3283                 }
3284
3285                 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3286                         /* do nothing */
3287                 } else if (rv == KERN_SUCCESS) {
3288                         if (user_wire)
3289                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
3290                 } else if (entry->wired_count == -1) {
3291                         /*
3292                          * Wiring failed on this entry.  Thus, unwiring is
3293                          * unnecessary.
3294                          */
3295                         entry->wired_count = 0;
3296                 } else if (!user_wire ||
3297                     (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3298                         /*
3299                          * Undo the wiring.  Wiring succeeded on this entry
3300                          * but failed on a later entry.  
3301                          */
3302                         if (entry->wired_count == 1) {
3303                                 vm_map_entry_unwire(map, entry);
3304                                 if (user_wire)
3305                                         vm_map_wire_user_count_sub(
3306                                             atop(entry->end - entry->start));
3307                         } else
3308                                 entry->wired_count--;
3309                 }
3310                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3311                     ("vm_map_wire: in-transition flag missing %p", entry));
3312                 KASSERT(entry->wiring_thread == curthread,
3313                     ("vm_map_wire: alien wire %p", entry));
3314                 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3315                     MAP_ENTRY_WIRE_SKIPPED);
3316                 entry->wiring_thread = NULL;
3317                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3318                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3319                         need_wakeup = true;
3320                 }
3321                 vm_map_try_merge_entries(map, prev_entry, entry);
3322         }
3323         vm_map_try_merge_entries(map, prev_entry, entry);
3324         if (need_wakeup)
3325                 vm_map_wakeup(map);
3326         return (rv);
3327 }
3328
3329 /*
3330  * vm_map_sync
3331  *
3332  * Push any dirty cached pages in the address range to their pager.
3333  * If syncio is TRUE, dirty pages are written synchronously.
3334  * If invalidate is TRUE, any cached pages are freed as well.
3335  *
3336  * If the size of the region from start to end is zero, we are
3337  * supposed to flush all modified pages within the region containing
3338  * start.  Unfortunately, a region can be split or coalesced with
3339  * neighboring regions, making it difficult to determine what the
3340  * original region was.  Therefore, we approximate this requirement by
3341  * flushing the current region containing start.
3342  *
3343  * Returns an error if any part of the specified range is not mapped.
3344  */
3345 int
3346 vm_map_sync(
3347         vm_map_t map,
3348         vm_offset_t start,
3349         vm_offset_t end,
3350         boolean_t syncio,
3351         boolean_t invalidate)
3352 {
3353         vm_map_entry_t entry, first_entry, next_entry;
3354         vm_size_t size;
3355         vm_object_t object;
3356         vm_ooffset_t offset;
3357         unsigned int last_timestamp;
3358         boolean_t failed;
3359
3360         vm_map_lock_read(map);
3361         VM_MAP_RANGE_CHECK(map, start, end);
3362         if (!vm_map_lookup_entry(map, start, &first_entry)) {
3363                 vm_map_unlock_read(map);
3364                 return (KERN_INVALID_ADDRESS);
3365         } else if (start == end) {
3366                 start = first_entry->start;
3367                 end = first_entry->end;
3368         }
3369         /*
3370          * Make a first pass to check for user-wired memory and holes.
3371          */
3372         for (entry = first_entry; entry->start < end; entry = next_entry) {
3373                 if (invalidate &&
3374                     (entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3375                         vm_map_unlock_read(map);
3376                         return (KERN_INVALID_ARGUMENT);
3377                 }
3378                 next_entry = vm_map_entry_succ(entry);
3379                 if (end > entry->end &&
3380                     entry->end != next_entry->start) {
3381                         vm_map_unlock_read(map);
3382                         return (KERN_INVALID_ADDRESS);
3383                 }
3384         }
3385
3386         if (invalidate)
3387                 pmap_remove(map->pmap, start, end);
3388         failed = FALSE;
3389
3390         /*
3391          * Make a second pass, cleaning/uncaching pages from the indicated
3392          * objects as we go.
3393          */
3394         for (entry = first_entry; entry->start < end;) {
3395                 offset = entry->offset + (start - entry->start);
3396                 size = (end <= entry->end ? end : entry->end) - start;
3397                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3398                         vm_map_t smap;
3399                         vm_map_entry_t tentry;
3400                         vm_size_t tsize;
3401
3402                         smap = entry->object.sub_map;
3403                         vm_map_lock_read(smap);
3404                         (void) vm_map_lookup_entry(smap, offset, &tentry);
3405                         tsize = tentry->end - offset;
3406                         if (tsize < size)
3407                                 size = tsize;
3408                         object = tentry->object.vm_object;
3409                         offset = tentry->offset + (offset - tentry->start);
3410                         vm_map_unlock_read(smap);
3411                 } else {
3412                         object = entry->object.vm_object;
3413                 }
3414                 vm_object_reference(object);
3415                 last_timestamp = map->timestamp;
3416                 vm_map_unlock_read(map);
3417                 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3418                         failed = TRUE;
3419                 start += size;
3420                 vm_object_deallocate(object);
3421                 vm_map_lock_read(map);
3422                 if (last_timestamp == map->timestamp ||
3423                     !vm_map_lookup_entry(map, start, &entry))
3424                         entry = vm_map_entry_succ(entry);
3425         }
3426
3427         vm_map_unlock_read(map);
3428         return (failed ? KERN_FAILURE : KERN_SUCCESS);
3429 }
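
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * an msync(2)-style caller requesting a synchronous writeback without
 * invalidation.  The function name, map, and range are hypothetical.
 */
static int
example_flush_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        /*
         * With start == end, vm_map_sync() flushes the whole region
         * containing start, as described in the comment above.
         */
        return (vm_map_sync(map, start, end, TRUE, FALSE));
}
#endif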
3430
3431 /*
3432  *      vm_map_entry_unwire:    [ internal use only ]
3433  *
3434  *      Make the region specified by this entry pageable.
3435  *
3436  *      The map in question should be locked.
3437  *      [This is the reason for this routine's existence.]
3438  */
3439 static void
3440 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3441 {
3442         vm_size_t size;
3443
3444         VM_MAP_ASSERT_LOCKED(map);
3445         KASSERT(entry->wired_count > 0,
3446             ("vm_map_entry_unwire: entry %p isn't wired", entry));
3447
3448         size = entry->end - entry->start;
3449         if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3450                 vm_map_wire_user_count_sub(atop(size));
3451         pmap_unwire(map->pmap, entry->start, entry->end);
3452         vm_object_unwire(entry->object.vm_object, entry->offset, size,
3453             PQ_ACTIVE);
3454         entry->wired_count = 0;
3455 }
3456
3457 static void
3458 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3459 {
3460
3461         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3462                 vm_object_deallocate(entry->object.vm_object);
3463         uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3464 }
3465
3466 /*
3467  *      vm_map_entry_delete:    [ internal use only ]
3468  *
3469  *      Deallocate the given entry from the target map.
3470  */
3471 static void
3472 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3473 {
3474         vm_object_t object;
3475         vm_pindex_t offidxstart, offidxend, count, size1;
3476         vm_size_t size;
3477
3478         vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3479         object = entry->object.vm_object;
3480
3481         if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3482                 MPASS(entry->cred == NULL);
3483                 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3484                 MPASS(object == NULL);
3485                 vm_map_entry_deallocate(entry, map->system_map);
3486                 return;
3487         }
3488
3489         size = entry->end - entry->start;
3490         map->size -= size;
3491
3492         if (entry->cred != NULL) {
3493                 swap_release_by_cred(size, entry->cred);
3494                 crfree(entry->cred);
3495         }
3496
3497         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3498                 entry->object.vm_object = NULL;
3499         } else if ((object->flags & OBJ_ANON) != 0 ||
3500             object == kernel_object) {
3501                 KASSERT(entry->cred == NULL || object->cred == NULL ||
3502                     (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3503                     ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3504                 count = atop(size);
3505                 offidxstart = OFF_TO_IDX(entry->offset);
3506                 offidxend = offidxstart + count;
3507                 VM_OBJECT_WLOCK(object);
3508                 if (object->ref_count != 1 &&
3509                     ((object->flags & OBJ_ONEMAPPING) != 0 ||
3510                     object == kernel_object)) {
3511                         vm_object_collapse(object);
3512
3513                         /*
3514                          * The option OBJPR_NOTMAPPED can be passed here
3515                          * because vm_map_delete() already performed
3516                          * pmap_remove() on the only mapping to this range
3517                          * of pages. 
3518                          */
3519                         vm_object_page_remove(object, offidxstart, offidxend,
3520                             OBJPR_NOTMAPPED);
3521                         if (object->type == OBJT_SWAP)
3522                                 swap_pager_freespace(object, offidxstart,
3523                                     count);
3524                         if (offidxend >= object->size &&
3525                             offidxstart < object->size) {
3526                                 size1 = object->size;
3527                                 object->size = offidxstart;
3528                                 if (object->cred != NULL) {
3529                                         size1 -= object->size;
3530                                         KASSERT(object->charge >= ptoa(size1),
3531                                             ("object %p charge < 0", object));
3532                                         swap_release_by_cred(ptoa(size1),
3533                                             object->cred);
3534                                         object->charge -= ptoa(size1);
3535                                 }
3536                         }
3537                 }
3538                 VM_OBJECT_WUNLOCK(object);
3539         }
3540         if (map->system_map)
3541                 vm_map_entry_deallocate(entry, TRUE);
3542         else {
3543                 entry->defer_next = curthread->td_map_def_user;
3544                 curthread->td_map_def_user = entry;
3545         }
3546 }
3547
3548 /*
3549  *      vm_map_delete:  [ internal use only ]
3550  *
3551  *      Deallocates the given address range from the target
3552  *      map.
3553  */
3554 int
3555 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3556 {
3557         vm_map_entry_t entry;
3558         vm_map_entry_t first_entry;
3559
3560         VM_MAP_ASSERT_LOCKED(map);
3561         if (start == end)
3562                 return (KERN_SUCCESS);
3563
3564         /*
3565          * Find the start of the region, and clip it
3566          */
3567         if (!vm_map_lookup_entry(map, start, &first_entry))
3568                 entry = vm_map_entry_succ(first_entry);
3569         else {
3570                 entry = first_entry;
3571                 vm_map_clip_start(map, entry, start);
3572         }
3573
3574         /*
3575          * Step through all entries in this region
3576          */
3577         while (entry->start < end) {
3578                 vm_map_entry_t next;
3579
3580                 /*
3581                  * Wait for wiring or unwiring of an entry to complete.
3582                  * Also wait for any system wirings to disappear on
3583                  * user maps.
3584                  */
3585                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3586                     (vm_map_pmap(map) != kernel_pmap &&
3587                     vm_map_entry_system_wired_count(entry) != 0)) {
3588                         unsigned int last_timestamp;
3589                         vm_offset_t saved_start;
3590                         vm_map_entry_t tmp_entry;
3591
3592                         saved_start = entry->start;
3593                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3594                         last_timestamp = map->timestamp;
3595                         (void) vm_map_unlock_and_wait(map, 0);
3596                         vm_map_lock(map);
3597                         if (last_timestamp + 1 != map->timestamp) {
3598                                 /*
3599                                  * Look again for the entry because the map was
3600                                  * modified while it was unlocked.
3601                                  * Specifically, the entry may have been
3602                                  * clipped, merged, or deleted.
3603                                  */
3604                                 if (!vm_map_lookup_entry(map, saved_start,
3605                                                          &tmp_entry))
3606                                         entry = vm_map_entry_succ(tmp_entry);
3607                                 else {
3608                                         entry = tmp_entry;
3609                                         vm_map_clip_start(map, entry,
3610                                                           saved_start);
3611                                 }
3612                         }
3613                         continue;
3614                 }
3615                 vm_map_clip_end(map, entry, end);
3616
3617                 next = vm_map_entry_succ(entry);
3618
3619                 /*
3620                  * Unwire before removing addresses from the pmap; otherwise,
3621                  * unwiring will put the entries back in the pmap.
3622                  */
3623                 if (entry->wired_count != 0)
3624                         vm_map_entry_unwire(map, entry);
3625
3626                 /*
3627                  * Remove mappings for the pages, but only if the
3628                  * mappings could exist.  For instance, it does not
3629                  * make sense to call pmap_remove() for guard entries.
3630                  */
3631                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
3632                     entry->object.vm_object != NULL)
3633                         pmap_remove(map->pmap, entry->start, entry->end);
3634
3635                 if (entry->end == map->anon_loc)
3636                         map->anon_loc = entry->start;
3637
3638                 /*
3639                  * Delete the entry only after removing all pmap
3640                  * entries pointing to its pages.  (Otherwise, its
3641                  * page frames may be reallocated, and any modify bits
3642                  * will be set in the wrong object!)
3643                  */
3644                 vm_map_entry_delete(map, entry);
3645                 entry = next;
3646         }
3647         return (KERN_SUCCESS);
3648 }
3649
3650 /*
3651  *      vm_map_remove:
3652  *
3653  *      Remove the given address range from the target map.
3654  *      This is the exported form of vm_map_delete.
3655  */
3656 int
3657 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3658 {
3659         int result;
3660
3661         vm_map_lock(map);
3662         VM_MAP_RANGE_CHECK(map, start, end);
3663         result = vm_map_delete(map, start, end);
3664         vm_map_unlock(map);
3665         return (result);
3666 }
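
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * vm_map_delete() is the primitive for callers that already hold the map
 * lock, while vm_map_remove() is the exported wrapper that takes the lock
 * itself.  The function name, map, and range are hypothetical.
 */
static int
example_remove_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
    bool map_is_locked)
{

        if (map_is_locked)
                return (vm_map_delete(map, start, end));
        return (vm_map_remove(map, start, end));
}
#endif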
3667
3668 /*
3669  *      vm_map_check_protection:
3670  *
3671  *      Assert that the target map allows the specified privilege on the
3672  *      entire address region given.  The entire region must be allocated.
3673  *
3674  *      WARNING!  This code does not and should not check whether the
3675  *      contents of the region is accessible.  For example a smaller file
3676  *      contents of the region are accessible.  For example, a smaller file
3677  *
3678  *      NOTE!  This code is also called by munmap().
3679  *
3680  *      The map must be locked.  A read lock is sufficient.
3681  */
3682 boolean_t
3683 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3684                         vm_prot_t protection)
3685 {
3686         vm_map_entry_t entry;
3687         vm_map_entry_t tmp_entry;
3688
3689         if (!vm_map_lookup_entry(map, start, &tmp_entry))
3690                 return (FALSE);
3691         entry = tmp_entry;
3692
3693         while (start < end) {
3694                 /*
3695                  * No holes allowed!
3696                  */
3697                 if (start < entry->start)
3698                         return (FALSE);
3699                 /*
3700                  * Check protection associated with entry.
3701                  */
3702                 if ((entry->protection & protection) != protection)
3703                         return (FALSE);
3704                 /* go to next entry */
3705                 start = entry->end;
3706                 entry = vm_map_entry_succ(entry);
3707         }
3708         return (TRUE);
3709 }
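
#if 0
/*
 * Illustrative sketch added by the editor, not part of the original file:
 * checking that an entire range is mapped with read/write permission
 * before operating on it.  A read lock is sufficient, per the comment
 * above.  The function name, map, and range are hypothetical.
 */
static boolean_t
example_range_is_writable(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        boolean_t ok;

        vm_map_lock_read(map);
        ok = vm_map_check_protection(map, start, end,
            VM_PROT_READ | VM_PROT_WRITE);
        vm_map_unlock_read(map);
        return (ok);
}
#endif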
3710
3711
3712 /*
3713  *
3714  *      vm_map_copy_anon_object:
3715  *
3716  *      Copies an anonymous object from an existing map entry to a
3717  *      new one.  Carries forward the swap charge.  May change the
3718  *      src object on return.
3719  */
3720 static void
3721 vm_map_copy_anon_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
3722     vm_offset_t size, vm_ooffset_t *fork_charge)
3723 {
3724         vm_object_t src_object;
3725         struct ucred *cred;
3726         int charged;
3727
3728         src_object = src_entry->object.vm_object;
3729         VM_OBJECT_WLOCK(src_object);
3730         charged = ENTRY_CHARGED(src_entry);
3731         vm_object_collapse(src_object);
3732         if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
3733                 vm_object_split(src_entry);
3734                 src_object = src_entry->object.vm_object;
3735         }
3736         vm_object_reference_locked(src_object);
3737         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3738         if (src_entry->cred != NULL &&
3739             !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3740                 KASSERT(src_object->cred == NULL,
3741                     ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
3742                      src_object));
3743                 src_object->cred = src_entry->cred;
3744                 src_object->charge = size;
3745         }
3746         VM_OBJECT_WUNLOCK(src_object);
3747         dst_entry->object.vm_object = src_object;
3748         if (charged) {
3749                 cred = curthread->td_ucred;
3750                 crhold(cred);
3751                 dst_entry->cred = cred;
3752                 *fork_charge += size;
3753                 if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3754                         crhold(cred);
3755                         src_entry->cred = cred;
3756                         *fork_charge += size;
3757                 }
3758         }
3759 }
3760
3761 /*
3762  *      vm_map_copy_entry:
3763  *
3764  *      Copies the contents of the source entry to the destination
3765  *      entry.  The entries *must* be aligned properly.
3766  */
3767 static void
3768 vm_map_copy_entry(
3769         vm_map_t src_map,
3770         vm_map_t dst_map,
3771         vm_map_entry_t src_entry,
3772         vm_map_entry_t dst_entry,
3773         vm_ooffset_t *fork_charge)
3774 {
3775         vm_object_t src_object;
3776         vm_map_entry_t fake_entry;
3777         vm_offset_t size;
3778
3779         VM_MAP_ASSERT_LOCKED(dst_map);
3780
3781         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3782                 return;
3783
3784         if (src_entry->wired_count == 0 ||
3785             (src_entry->protection & VM_PROT_WRITE) == 0) {
3786                 /*
3787                  * If the source entry is marked needs_copy, it is already
3788                  * write-protected.
3789                  */
3790                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3791                     (src_entry->protection & VM_PROT_WRITE) != 0) {
3792                         pmap_protect(src_map->pmap,
3793                             src_entry->start,
3794                             src_entry->end,
3795                             src_entry->protection & ~VM_PROT_WRITE);
3796                 }
3797
3798                 /*
3799                  * Make a copy of the object.
3800                  */
3801                 size = src_entry->end - src_entry->start;
3802                 if ((src_object = src_entry->object.vm_object) != NULL) {
3803                         if ((src_object->flags & OBJ_ANON) != 0) {
3804                                 vm_map_copy_anon_object(src_entry, dst_entry,
3805                                     size, fork_charge);
3806                                 /* May have split/collapsed, reload obj. */
3807                                 src_object = src_entry->object.vm_object;
3808                         } else {
3809                                 vm_object_reference(src_object);
3810                                 dst_entry->object.vm_object = src_object;
3811                         }
3812                         src_entry->eflags |= MAP_ENTRY_COW |
3813                             MAP_ENTRY_NEEDS_COPY;
3814                         dst_entry->eflags |= MAP_ENTRY_COW |
3815                             MAP_ENTRY_NEEDS_COPY;
3816                         dst_entry->offset = src_entry->offset;
3817                         if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
3818                                 /*
3819                                  * MAP_ENTRY_WRITECNT cannot
3820                                  * indicate write reference from
3821                                  * src_entry, since the entry is
3822                                  * marked as needs copy.  Allocate a
3823                                  * fake entry that is used to
3824                                  * decrement object->un_pager writecount
3825                                  * at the appropriate time.  Attach
3826                                  * fake_entry to the deferred list.
3827                                  */
3828                                 fake_entry = vm_map_entry_create(dst_map);
3829                                 fake_entry->eflags = MAP_ENTRY_WRITECNT;
3830                                 src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
3831                                 vm_object_reference(src_object);
3832                                 fake_entry->object.vm_object = src_object;
3833                                 fake_entry->start = src_entry->start;
3834                                 fake_entry->end = src_entry->end;
3835                                 fake_entry->defer_next =
3836                                     curthread->td_map_def_user;
3837                                 curthread->td_map_def_user = fake_entry;
3838                         }
3839
3840                         pmap_copy(dst_map->pmap, src_map->pmap,
3841                             dst_entry->start, dst_entry->end - dst_entry->start,
3842                             src_entry->start);
3843                 } else {
3844                         dst_entry->object.vm_object = NULL;
3845                         dst_entry->offset = 0;
3846                         if (src_entry->cred != NULL) {
3847                                 dst_entry->cred = curthread->td_ucred;
3848                                 crhold(dst_entry->cred);
3849                                 *fork_charge += size;
3850                         }
3851                 }
3852         } else {
3853                 /*
3854                  * We don't want to make writeable wired pages copy-on-write.
3855                  * Immediately copy these pages into the new map by simulating
3856                  * page faults.  The new pages are pageable.
3857                  */
3858                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3859                     fork_charge);
3860         }
3861 }
3862
3863 /*
3864  * vmspace_map_entry_forked:
3865  * Update the newly-forked vmspace each time a map entry is inherited
3866  * or copied.  The values for vm_dsize and vm_tsize are approximate
3867  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3868  */
3869 static void
3870 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3871     vm_map_entry_t entry)
3872 {
3873         vm_size_t entrysize;
3874         vm_offset_t newend;
3875
3876         if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3877                 return;
3878         entrysize = entry->end - entry->start;
3879         vm2->vm_map.size += entrysize;
3880         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3881                 vm2->vm_ssize += btoc(entrysize);
3882         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3883             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3884                 newend = MIN(entry->end,
3885                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3886                 vm2->vm_dsize += btoc(newend - entry->start);
3887         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3888             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3889                 newend = MIN(entry->end,
3890                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3891                 vm2->vm_tsize += btoc(newend - entry->start);
3892         }
3893 }
3894
3895 /*
3896  * vmspace_fork:
3897  * Create a new process vmspace structure and vm_map
3898  * based on those of an existing process.  The new map
3899  * is based on the old map, according to the inheritance
3900  * values on the regions in that map.
3901  *
3902  * XXX It might be worth coalescing the entries added to the new vmspace.
3903  *
3904  * The source map must not be locked.
3905  */
3906 struct vmspace *
3907 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3908 {
3909         struct vmspace *vm2;
3910         vm_map_t new_map, old_map;
3911         vm_map_entry_t new_entry, old_entry;
3912         vm_object_t object;
3913         int error, locked;
3914         vm_inherit_t inh;
3915
3916         old_map = &vm1->vm_map;
3917         /* Copy immutable fields of vm1 to vm2. */
3918         vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
3919             pmap_pinit);
3920         if (vm2 == NULL)
3921                 return (NULL);
3922
3923         vm2->vm_taddr = vm1->vm_taddr;
3924         vm2->vm_daddr = vm1->vm_daddr;
3925         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3926         vm_map_lock(old_map);
3927         if (old_map->busy)
3928                 vm_map_wait_busy(old_map);
3929         new_map = &vm2->vm_map;
3930         locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3931         KASSERT(locked, ("vmspace_fork: lock failed"));
3932
3933         error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
3934         if (error != 0) {
3935                 sx_xunlock(&old_map->lock);
3936                 sx_xunlock(&new_map->lock);
3937                 vm_map_process_deferred();
3938                 vmspace_free(vm2);
3939                 return (NULL);
3940         }
3941
3942         new_map->anon_loc = old_map->anon_loc;
3943
3944         VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
3945                 if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3946                         panic("vm_map_fork: encountered a submap");
3947
3948                 inh = old_entry->inheritance;
3949                 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3950                     inh != VM_INHERIT_NONE)
3951                         inh = VM_INHERIT_COPY;
3952
3953                 switch (inh) {
3954                 case VM_INHERIT_NONE:
3955                         break;
3956
3957                 case VM_INHERIT_SHARE:
3958                         /*
3959                          * Clone the entry, creating the shared object if
3960                          * necessary.
3961                          */
3962                         object = old_entry->object.vm_object;
3963                         if (object == NULL) {
3964                                 vm_map_entry_back(old_entry);
3965                                 object = old_entry->object.vm_object;
3966                         }
3967
3968                         /*
3969                          * Add the reference before calling vm_object_shadow
3970                          * to ensure that a shadow object is created.
3971                          */
3972                         vm_object_reference(object);
3973                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3974                                 vm_object_shadow(&old_entry->object.vm_object,
3975                                     &old_entry->offset,
3976                                     old_entry->end - old_entry->start);
3977                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3978                                 /* Transfer the second reference too. */
3979                                 vm_object_reference(
3980                                     old_entry->object.vm_object);
3981
3982                                 /*
3983                                  * As in vm_map_merged_neighbor_dispose(),
3984                                  * the vnode lock will not be acquired in
3985                                  * this call to vm_object_deallocate().
3986                                  */
3987                                 vm_object_deallocate(object);
3988                                 object = old_entry->object.vm_object;
3989                         }
3990                         VM_OBJECT_WLOCK(object);
3991                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3992                         if (old_entry->cred != NULL) {
3993                                 KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3994                                 object->cred = old_entry->cred;
3995                                 object->charge = old_entry->end - old_entry->start;
3996                                 old_entry->cred = NULL;
3997                         }
3998
3999                         /*
4000                          * Assert the correct state of the vnode
4001                          * v_writecount while the object is locked, so
4002                          * that the object does not have to be relocked
4003                          * later just to make the assertion.
4004                          */
4005                         if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4006                             object->type == OBJT_VNODE) {
4007                                 KASSERT(((struct vnode *)object->handle)->
4008                                     v_writecount > 0,
4009                                     ("vmspace_fork: v_writecount %p", object));
4010                                 KASSERT(object->un_pager.vnp.writemappings > 0,
4011                                     ("vmspace_fork: vnp.writecount %p",
4012                                     object));
4013                         }
4014                         VM_OBJECT_WUNLOCK(object);
4015
4016                         /*
4017                          * Clone the entry, referencing the shared object.
4018                          */
4019                         new_entry = vm_map_entry_create(new_map);
4020                         *new_entry = *old_entry;
4021                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4022                             MAP_ENTRY_IN_TRANSITION);
4023                         new_entry->wiring_thread = NULL;
4024                         new_entry->wired_count = 0;
4025                         if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4026                                 vm_pager_update_writecount(object,
4027                                     new_entry->start, new_entry->end);
4028                         }
4029                         vm_map_entry_set_vnode_text(new_entry, true);
4030
4031                         /*
4032                          * Insert the entry into the new map -- we know we're
4033                          * inserting at the end of the new map.
4034                          */
4035                         vm_map_entry_link(new_map, new_entry);
4036                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4037
4038                         /*
4039                          * Update the physical map
4040                          */
4041                         pmap_copy(new_map->pmap, old_map->pmap,
4042                             new_entry->start,
4043                             (old_entry->end - old_entry->start),
4044                             old_entry->start);
4045                         break;
4046
4047                 case VM_INHERIT_COPY:
4048                         /*
4049                          * Clone the entry and link into the map.
4050                          */
4051                         new_entry = vm_map_entry_create(new_map);
4052                         *new_entry = *old_entry;
4053                         /*
4054                          * Copied entry is COW over the old object.
4055                          */
4056                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4057                             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4058                         new_entry->wiring_thread = NULL;
4059                         new_entry->wired_count = 0;
4060                         new_entry->object.vm_object = NULL;
4061                         new_entry->cred = NULL;
4062                         vm_map_entry_link(new_map, new_entry);
4063                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4064                         vm_map_copy_entry(old_map, new_map, old_entry,
4065                             new_entry, fork_charge);
4066                         vm_map_entry_set_vnode_text(new_entry, true);
4067                         break;
4068
4069                 case VM_INHERIT_ZERO:
4070                         /*
4071                          * Create a new anonymous mapping entry modelled on
4072                          * the old one.
4073                          */
4074                         new_entry = vm_map_entry_create(new_map);
4075                         memset(new_entry, 0, sizeof(*new_entry));
4076
4077                         new_entry->start = old_entry->start;
4078                         new_entry->end = old_entry->end;
4079                         new_entry->eflags = old_entry->eflags &
4080                             ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4081                             MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC);
4082                         new_entry->protection = old_entry->protection;
4083                         new_entry->max_protection = old_entry->max_protection;
4084                         new_entry->inheritance = VM_INHERIT_ZERO;
4085
4086                         vm_map_entry_link(new_map, new_entry);
4087                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4088
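                             /*
                              * The whole span of the new anonymous entry is
                              * charged to the current credential; the caller
                              * is responsible for reserving fork_charge in
                              * swap.
                              */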
4089                         new_entry->cred = curthread->td_ucred;
4090                         crhold(new_entry->cred);
4091                         *fork_charge += (new_entry->end - new_entry->start);
4092
4093                         break;
4094                 }
4095         }
4096         /*
4097          * Use inlined vm_map_unlock() to postpone handling the deferred
4098          * map entries, which cannot be done until both old_map and
4099          * new_map locks are released.
4100          */
4101         sx_xunlock(&old_map->lock);
4102         sx_xunlock(&new_map->lock);
4103         vm_map_process_deferred();
4104
4105         return (vm2);
4106 }
4107
4108 /*
4109  * Create a process's stack for exec_new_vmspace().  This function is never
4110  * asked to wire the newly created stack.
4111  */
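     /*
      * Illustrative call only (names such as sv, maxssiz and rv come from
      * the caller's context and are not defined here); exec_new_vmspace()
      * invokes this roughly as follows for a downward-growing stack:
      *
      *      rv = vm_map_stack(map, sv->sv_usrstack - maxssiz, maxssiz,
      *          VM_PROT_ALL, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
      */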
4112 int
4113 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4114     vm_prot_t prot, vm_prot_t max, int cow)
4115 {
4116         vm_size_t growsize, init_ssize;
4117         rlim_t vmemlim;
4118         int rv;
4119
4120         MPASS((map->flags & MAP_WIREFUTURE) == 0);
4121         growsize = sgrowsiz;
4122         init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4123         vm_map_lock(map);
4124         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4125         /* If we would blow our VMEM resource limit, no go */
4126         if (map->size + init_ssize > vmemlim) {
4127                 rv = KERN_NO_SPACE;
4128                 goto out;
4129         }
4130         rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4131             max, cow);
4132 out:
4133         vm_map_unlock(map);
4134         return (rv);
4135 }
4136
4137 static int stack_guard_page = 1;
4138 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4139     &stack_guard_page, 0,
4140     "Specifies the number of guard pages for a stack that grows");
4141
4142 static int
4143 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4144     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4145 {
4146         vm_map_entry_t new_entry, prev_entry;
4147         vm_offset_t bot, gap_bot, gap_top, top;
4148         vm_size_t init_ssize, sgp;
4149         int orient, rv;
4150
4151         /*
4152          * The stack orientation is piggybacked with the cow argument.
4153          * Extract it into orient and mask the cow argument so that we
4154          * don't pass it around further.
4155          */
4156         orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
4157         KASSERT(orient != 0, ("No stack grow direction"));
4158         KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
4159             ("bi-dir stack"));
4160
4161         if (addrbos < vm_map_min(map) ||
4162             addrbos + max_ssize > vm_map_max(map) ||
4163             addrbos + max_ssize <= addrbos)
4164                 return (KERN_INVALID_ADDRESS);
4165         sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4166             (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4167             (vm_size_t)stack_guard_page * PAGE_SIZE;
4168         if (sgp >= max_ssize)
4169                 return (KERN_INVALID_ARGUMENT);
4170
4171         init_ssize = growsize;
4172         if (max_ssize < init_ssize + sgp)
4173                 init_ssize = max_ssize - sgp;
4174
4175         /* If addr is already mapped, no go */
4176         if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4177                 return (KERN_NO_SPACE);
4178
4179         /*
4180          * If we can't accommodate max_ssize in the current mapping, no go.
4181          */
4182         if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4183                 return (KERN_NO_SPACE);
4184
4185         /*
4186          * We initially map a stack of only init_ssize.  We will grow as
4187          * needed later.  Depending on the orientation of the stack (i.e.,
4188          * the grow direction) we map either at the top of the range or at
4189          * the bottom of the range.
4190          *
4191          * Note: we would normally expect prot and max to be VM_PROT_ALL,
4192          * and cow to be 0.  Possibly we should eliminate these as input
4193          * parameters, and just pass these values here in the insert call.
4194          */
4195         if (orient == MAP_STACK_GROWS_DOWN) {
4196                 bot = addrbos + max_ssize - init_ssize;
4197                 top = bot + init_ssize;
4198                 gap_bot = addrbos;
4199                 gap_top = bot;
4200         } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4201                 bot = addrbos;
4202                 top = bot + init_ssize;
4203                 gap_bot = top;
4204                 gap_top = addrbos + max_ssize;
4205         }
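             /*
              * Map only the active init_ssize portion of the stack now; any
              * remainder of the range is reserved below as a stack gap
              * (guard) entry that vm_map_growstack() may later consume.
              */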
4206         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4207         if (rv != KERN_SUCCESS)
4208                 return (rv);
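             /*
              * The new stack entry is expected to be the successor of
              * prev_entry; check its bounds and that it carries the
              * requested grow-direction flag.
              */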
4209         new_entry = vm_map_entry_succ(prev_entry);
4210         KASSERT(new_entry->end == top || new_entry->start == bot,
4211             ("Bad entry start/end for new stack entry"));
4212         KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4213             (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4214             ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4215         KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4216             (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4217             ("new entry lacks MAP_ENTRY_GROWS_UP"));
4218         if (gap_bot == gap_top)
4219                 return (KERN_SUCCESS);
4220         rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4221             VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4222             MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4223         if (rv == KERN_SUCCESS) {
4224                 /*
4225                  * Gap can never successfully handle a fault, so
4226                  * read-ahead logic is never used for it.  Re-use
4227                  * next_read of the gap entry to store
4228                  * stack_guard_page for vm_map_growstack().
4229                  */
4230                 if (orient == MAP_STACK_GROWS_DOWN)
4231                         vm_map_entry_pred(new_entry)->next_read = sgp;
4232                 else
4233                         vm_map_entry_succ(new_entry)->next_read = sgp;
4234         } else {
4235                 (void)vm_map_delete(map, bot, top);
4236         }
4237         return (rv);
4238 }
4239
4240 /*
4241  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
4242  * successfully grow the stack.
4243  */
4244 static int
4245 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4246 {
4247         vm_map_entry_t stack_entry;
4248         struct proc *p;
4249         struct vmspace *vm;
4250         struct ucred *cred;
4251         vm_offset_t gap_end, gap_start, grow_start;
4252         vm_size_t grow_amount, guard, max_grow;
4253         rlim_t lmemlim, stacklim, vmemlim;
4254         int rv, rv1;
4255         bool gap_deleted, grow_down, is_procstack;
4256 #ifdef notyet
4257         uint64_t limit;
4258 #endif
4259 #ifdef RACCT
4260         int error;
4261 #endif
4262
4263         p = curproc;
4264         vm = p->p_vmspace;
4265
4266         /*
4267          * Disallow stack growth when the access is performed by a
4268          * debugger or AIO daemon.  The reason is that the wrong
4269          * resource limits are applied.
4270          */
4271         if (p != initproc && (map != &p->p_vmspace->vm_map ||
4272             p->p_textvp == NULL))
4273                 return (KERN_FAILURE);
4274
4275         MPASS(!map->system_map);
4276
4277         lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4278         stacklim = lim_cur(curthread, RLIMIT_STACK);
4279         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4280 retry:
4281         /* If addr is not in a hole for a stack grow area, no need to grow. */
4282         if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4283                 return (KERN_FAILURE);
4284         if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4285                 return (KERN_SUCCESS);
4286         if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4287                 stack_entry = vm_map_entry_succ(gap_entry);
4288                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4289                     stack_entry->start != gap_entry->end)
4290                         return (KERN_FAILURE);
4291                 grow_amount = round_page(stack_entry->start - addr);
4292                 grow_down = true;
4293         } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4294                 stack_entry = vm_map_entry_pred(gap_entry);
4295                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4296                     stack_entry->end != gap_entry->start)
4297                         return (KERN_FAILURE);
4298                 grow_amount = round_page(addr + 1 - stack_entry->end);
4299                 grow_down = false;
4300         } else {
4301                 return (KERN_FAILURE);
4302         }
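             /*
              * Unless the stack gap is disabled for the process, the guard
              * size stored in the gap entry's next_read field (see
              * vm_map_stack_locked()) must stay unmapped, so growth may
              * consume only the rest of the gap.
              */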
4303         guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4304             (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4305             gap_entry->next_read;
4306         max_grow = gap_entry->end - gap_entry->start;
4307         if (guard > max_grow)
4308                 return (KERN_NO_SPACE);
4309         max_grow -= guard;
4310         if (grow_amount > max_grow)
4311                 return (KERN_NO_SPACE);
4312
4313         /*
4314          * If this is the main process stack, see if we're over the stack
4315          * limit.
4316          */
4317         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4318             addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4319         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4320                 return (KERN_NO_SPACE);
4321
4322 #ifdef RACCT
4323         if (racct_enable) {
4324                 PROC_LOCK(p);
4325                 if (is_procstack && racct_set(p, RACCT_STACK,
4326                     ctob(vm->vm_ssize) + grow_amount)) {
4327                         PROC_UNLOCK(p);
4328                         return (KERN_NO_SPACE);
4329                 }
4330                 PROC_UNLOCK(p);
4331         }
4332 #endif
4333
4334         grow_amount = roundup(grow_amount, sgrowsiz);
4335         if (grow_amount > max_grow)
4336                 grow_amount = max_grow;
4337         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4338                 grow_amount = trunc_page((vm_size_t)stacklim) -
4339                     ctob(vm->vm_ssize);
4340         }
4341
4342 #ifdef notyet
4343         PROC_LOCK(p);
4344         limit = racct_get_available(p, RACCT_STACK);
4345         PROC_UNLOCK(p);
4346         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4347                 grow_amount = limit - ctob(vm->vm_ssize);
4348 #endif
4349
4350         if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4351                 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4352                         rv = KERN_NO_SPACE;
4353                         goto out;
4354                 }
4355 #ifdef RACCT
4356                 if (racct_enable) {
4357                         PROC_LOCK(p);
4358                         if (racct_set(p, RACCT_MEMLOCK,
4359                             ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4360                                 PROC_UNLOCK(p);
4361                                 rv = KERN_NO_SPACE;
4362                                 goto out;
4363                         }
4364                         PROC_UNLOCK(p);
4365                 }
4366 #endif
4367         }
4368
4369         /* If we would blow our VMEM resource limit, no go */
4370         if (map->size + grow_amount > vmemlim) {
4371                 rv = KERN_NO_SPACE;
4372                 goto out;
4373         }
4374 #ifdef RACCT
4375         if (racct_enable) {
4376                 PROC_LOCK(p);
4377                 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4378                         PROC_UNLOCK(p);
4379                         rv = KERN_NO_SPACE;
4380                         goto out;
4381                 }
4382                 PROC_UNLOCK(p);
4383         }
4384 #endif
4385
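             /*
              * A failed upgrade means the map was unlocked and modified in
              * the meantime, so the cached gap_entry may be stale; re-take
              * the read lock and redo the lookup.
              */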
4386         if (vm_map_lock_upgrade(map)) {
4387                 gap_entry = NULL;
4388                 vm_map_lock_read(map);
4389                 goto retry;
4390         }
4391
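             /*
              * Grow the stack downward: take grow_amount from the top of
              * the gap entry (deleting the gap if it is fully consumed) and
              * insert a new stack entry immediately below the existing one.
              */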
4392         if (grow_down) {
4393                 grow_start = gap_entry->end - grow_amount;
4394                 if (gap_entry->start + grow_amount == gap_entry->end) {
4395                         gap_start = gap_entry->start;
4396                         gap_end = gap_entry->end;
4397                         vm_map_entry_delete(map, gap_entry);
4398                         gap_deleted = true;
4399                 } else {
4400                         MPASS(gap_entry->start < gap_entry->end - grow_amount);
4401                         vm_map_entry_resize(map, gap_entry, -grow_amount);
4402                         gap_deleted = false;
4403                 }
4404                 rv = vm_map_insert(map, NULL, 0, grow_start,
4405                     grow_start + grow_amount,
4406                     stack_entry->protection, stack_entry->max_protection,
4407                     MAP_STACK_GROWS_DOWN);
4408                 if (rv != KERN_SUCCESS) {
4409                         if (gap_deleted) {
4410                                 rv1 = vm_map_insert(map, NULL, 0, gap_start,
4411                                     gap_end, VM_PROT_NONE, VM_PROT_NONE,
4412                                     MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4413                                 MPASS(rv1 == KERN_SUCCESS);
4414                         } else
4415                                 vm_map_entry_resize(map, gap_entry,
4416                                     grow_amount);
4417                 }
4418         } else {
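                     /*
                      * Grow the stack upward: reserve swap for the growth
                      * against the stack entry's credential (if any), try to
                      * extend the backing object with vm_object_coalesce(),
                      * and move the boundary between the stack entry and the
                      * gap entry up by grow_amount, deleting the gap entry
                      * if it is fully consumed.
                      */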
4419                 grow_start = stack_entry->end;
4420                 cred = stack_entry->cred;
4421                 if (cred == NULL && stack_entry->object.vm_object != NULL)
4422                         cred = stack_entry->object.vm_object->cred;
4423                 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4424                         rv = KERN_NO_SPACE;
4425                 /* Grow the underlying object if applicable. */
4426                 else if (stack_entry->object.vm_object == NULL ||
4427                     vm_object_coalesce(stack_entry->object.vm_object,
4428                     stack_entry->offset,
4429                     (vm_size_t)(stack_entry->end - stack_entry->start),
4430                     grow_amount, cred != NULL)) {
4431                         if (gap_entry->start + grow_amount == gap_entry->end) {
4432                                 vm_map_entry_delete(map, gap_entry);
4433                                 vm_map_entry_resize(map, stack_entry,
4434                                     grow_amount);
4435                         } else {
4436                                 gap_entry->start += grow_amount;
4437                                 stack_entry->end += grow_amount;
4438                         }
4439                         map->size += grow_amount;
4440                         rv = KERN_SUCCESS;
4441                 } else
4442                         rv = KERN_FAILURE;
4443         }
4444         if (rv == KERN_SUCCESS && is_procstack)
4445                 vm->vm_ssize += btoc(grow_amount);
4446
4447         /*
4448          * Heed the MAP_WIREFUTURE flag if it was set for this process.
4449          */
4450         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4451                 rv = vm_map_wire_locked(map, grow_start,
4452                     grow_start + grow_amount,
4453                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4454         }
4455         vm_map_lock_downgrade(map);
4456
4457 out:
4458 #ifdef RACCT
4459         if (racct_enable && rv != KERN_SUCCESS) {
4460                 PROC_LOCK(p);
4461                 error = racct_set(p, RACCT_VMEM, map->size);
4462                 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4463                 if (!old_mlock) {
4464                         error = racct_set(p, RACCT_MEMLOCK,
4465                             ptoa(pmap_wired_count(map->pmap)));
4466                         KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4467                 }
4468                 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4469                 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4470                 PROC_UNLOCK(p);
4471         }
4472 #endif
4473
4474         return (rv);
4475 }
4476
4477 /*
4478  * Unshare the specified VM space for exec.  If other processes share
4479  * it, then create a new one.  The new vmspace contains no mappings.
4480  */
4481 int
4482 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4483 {
4484         struct vmspace *oldvmspace = p->p_vmspace;
4485         struct vmspace *newvmspace;
4486
4487         KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4488             ("vmspace_exec recursed"));
4489         newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4490         if (newvmspace == NULL)
4491                 return (ENOMEM);
4492         newvmspace->vm_swrss = oldvmspace->vm_swrss;
4493         /*
4494          * This code is written like this for prototype purposes.  The
4495          * goal is to avoid running down the vmspace here, but to let the
4496          * other processes that are still using the vmspace finally run
4497          * it down.  Even though there is little or no chance of blocking
4498          * here, it is a good idea to keep this form for future mods.
4499          */
4500         PROC_VMSPACE_LOCK(p);
4501         p->p_vmspace = newvmspace;
4502         PROC_VMSPACE_UNLOCK(p);
4503         if (p == curthread->td_proc)
4504                 pmap_activate(curthread);
4505         curthread->td_pflags |= TDP_EXECVMSPC;
4506         return (0);
4507 }
4508
4509 /*
4510  * Unshare the specified VM space for forcing COW.  This
4511  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4512  */
4513 int
4514 vmspace_unshare(struct proc *p)
4515 {
4516         struct vmspace *oldvmspace = p->p_vmspace;
4517         struct vmspace *newvmspace;
4518         vm_ooffset_t fork_charge;
4519
4520         if (oldvmspace->vm_refcnt == 1)
4521                 return (0);
4522         fork_charge = 0;
4523         newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4524         if (newvmspace == NULL)
4525                 return (ENOMEM);
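             /*
              * vmspace_fork() accumulated the copy-on-write charge for the
              * new vmspace in fork_charge; reserve that much swap against
              * the process credential before adopting the new vmspace.
              */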
4526         if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4527                 vmspace_free(newvmspace);
4528                 return (ENOMEM);
4529         }
4530         PROC_VMSPACE_LOCK(p);
4531         p->p_vmspace = newvmspace;
4532         PROC_VMSPACE_UNLOCK(p);
4533         if (p == curthread->td_proc)
4534                 pmap_activate(curthread);
4535         vmspace_free(oldvmspace);
4536         return (0);
4537 }
4538
4539 /*
4540  *      vm_map_lookup:
4541  *
4542  *      Finds the VM object, offset, and
4543  *      protection for a given virtual address in the
4544  *      specified map, assuming a page fault of the
4545  *      type specified.
4546  *
4547  *      Leaves the map in question locked for read; return
4548  *      values are guaranteed until a vm_map_lookup_done
4549  *      call is performed.  Note that the map argument
4550  *      is in/out; the returned map must be used in
4551  *      the call to vm_map_lookup_done.
4552  *
4553  *      A handle (out_entry) is returned for use in
4554  *      vm_map_lookup_done, to make that fast.
4555  *
4556  *      If a lookup is requested with "write protection"
4557  *      specified, the map may be changed to perform virtual
4558  *      copying operations, although the data referenced will
4559  *      remain the same.
4560  */
4561 int
4562 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4563               vm_offset_t vaddr,
4564               vm_prot_t fault_typea,
4565               vm_map_entry_t *out_entry,        /* OUT */
4566               vm_object_t *object,              /* OUT */
4567               vm_pindex_t *pindex,              /* OUT */
4568               vm_prot_t *out_prot,              /* OUT */
4569               boolean_t *wired)                 /* OUT */
4570 {
4571         vm_map_entry_t entry;
4572         vm_map_t map = *var_map;
4573         vm_prot_t prot;
4574         vm_prot_t fault_type = fault_typea;
4575         vm_object_t eobject;
4576         vm_size_t size;
4577         struct ucred *cred;
4578
4579 RetryLookup:
4580
4581         vm_map_lock_read(map);
4582
4583 RetryLookupLocked:
4584         /*
4585          * Lookup the faulting address.
4586          */
4587         if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4588                 vm_map_unlock_read(map);
4589                 return (KERN_INVALID_ADDRESS);
4590         }
4591
4592         entry = *out_entry;
4593
4594         /*
4595          * Handle submaps.
4596          */
4597         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4598                 vm_map_t old_map = map;
4599
4600                 *var_map = map = entry->object.sub_map;
4601                 vm_map_unlock_read(old_map);
4602                 goto RetryLookup;
4603         }
4604
4605         /*
4606          * Check whether this task is allowed to have this page.
4607          */
4608         prot = entry->protection;
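             /*
              * When the fault handler passes VM_PROT_FAULT_LOOKUP and the
              * address falls within a stack-gap guard entry, try to grow
              * the stack in place and, on success, redo the lookup without
              * dropping the map lock.
              */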
4609         if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4610                 fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4611                 if (prot == VM_PROT_NONE && map != kernel_map &&
4612                     (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4613                     (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4614                     MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4615                     vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4616                         goto RetryLookupLocked;
4617         }
4618         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4619         if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4620                 vm_map_unlock_read(map);
4621                 return (KERN_PROTECTION_FAILURE);
4622         }
4623         KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4624             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4625             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4626             ("entry %p flags %x", entry, entry->eflags));
4627         if ((fault_typea & VM_PROT_COPY) != 0 &&
4628             (entry->max_protection & VM_PROT_WRITE) == 0 &&
4629             (entry->eflags & MAP_ENTRY_COW) == 0) {
4630                 vm_map_unlock_read(map);
4631                 return (KERN_PROTECTION_FAILURE);
4632         }
4633
4634         /*
4635          * If this page is not pageable, we have to get it for all possible
4636          * accesses.
4637          */
4638         *wired = (entry->wired_count != 0);
4639         if (*wired)
4640                 fault_type = entry->protection;
4641         size = entry->end - entry->start;
4642         /*
4643          * If the entry was copy-on-write, we either ...
4644          */
4645         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4646                 /*
4647                  * If we want to write the page, we may as well handle that
4648                  * now since we've got the map locked.
4649                  *
4650                  * If we don't need to write the page, we just demote the
4651                  * permissions allowed.
4652                  */
4653                 if ((fault_type & VM_PROT_WRITE) != 0 ||
4654                     (fault_typea & VM_PROT_COPY) != 0) {
4655                         /*
4656                          * Make a new object, and place it in the object
4657                          * chain.  Note that no new references have appeared
4658                          * -- one just moved from the map to the new
4659                          * object.
4660                          */
4661                         if (vm_map_lock_upgrade(map))
4662                                 goto RetryLookup;
4663
4664                         if (entry->cred == NULL) {
4665                                 /*
4666                                  * The debugger owner is charged for
4667                                  * the memory.
4668                                  */
4669                                 cred = curthread->td_ucred;
4670                                 crhold(cred);
4671                                 if (!swap_reserve_by_cred(size, cred)) {
4672                                         crfree(cred);
4673                                         vm_map_unlock(map);
4674                                         return (KERN_RESOURCE_SHORTAGE);
4675                                 }
4676                                 entry->cred = cred;
4677                         }
4678                         vm_object_shadow(&entry->object.vm_object,
4679                             &entry->offset, size);
4680                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4681                         eobject = entry->object.vm_object;
4682                         if (eobject->cred != NULL) {
4683                                 /*
4684                                  * The object was not shadowed.
4685                                  */
4686                                 swap_release_by_cred(size, entry->cred);
4687                                 crfree(entry->cred);
4688                                 entry->cred = NULL;
4689                         } else if (entry->cred != NULL) {
4690                                 VM_OBJECT_WLOCK(eobject);
4691                                 eobject->cred = entry->cred;
4692                                 eobject->charge = size;
4693                                 VM_OBJECT_WUNLOCK(eobject);
4694                                 entry->cred = NULL;
4695                         }
4696
4697                         vm_map_lock_downgrade(map);
4698                 } else {
4699                         /*
4700                          * We're attempting to read a copy-on-write page --
4701                          * don't allow writes.
4702                          */
4703                         prot &= ~VM_PROT_WRITE;
4704                 }
4705         }
4706
4707         /*
4708          * Create an object if necessary.
4709          */
4710         if (entry->object.vm_object == NULL &&
4711             !map->system_map) {
4712                 if (vm_map_lock_upgrade(map))
4713                         goto RetryLookup;
4714                 entry->object.vm_object = vm_object_allocate_anon(atop(size));
4715                 entry->offset = 0;
4716                 if (entry->cred != NULL) {
4717                         VM_OBJECT_WLOCK(entry->object.vm_object);
4718                         entry->object.vm_object->cred = entry->cred;
4719                         entry->object.vm_object->charge = size;
4720                         VM_OBJECT_WUNLOCK(entry->object.vm_object);
4721                         entry->cred = NULL;
4722                 }
4723                 vm_map_lock_downgrade(map);
4724         }
4725
4726         /*
4727          * Return the object/offset from this entry.  If the entry was
4728          * copy-on-write or empty, it has been fixed up.
4729          */
4730         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4731         *object = entry->object.vm_object;
4732
4733         *out_prot = prot;
4734         return (KERN_SUCCESS);
4735 }
4736
4737 /*
4738  *      vm_map_lookup_locked:
4739  *
4740  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
4741  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4742  */
4743 int
4744 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
4745                      vm_offset_t vaddr,
4746                      vm_prot_t fault_typea,
4747                      vm_map_entry_t *out_entry, /* OUT */
4748                      vm_object_t *object,       /* OUT */
4749                      vm_pindex_t *pindex,       /* OUT */
4750                      vm_prot_t *out_prot,       /* OUT */
4751                      boolean_t *wired)          /* OUT */
4752 {
4753         vm_map_entry_t entry;
4754         vm_map_t map = *var_map;
4755         vm_prot_t prot;
4756         vm_prot_t fault_type = fault_typea;
4757
4758         /*
4759          * Lookup the faulting address.
4760          */
4761         if (!vm_map_lookup_entry(map, vaddr, out_entry))
4762                 return (KERN_INVALID_ADDRESS);
4763
4764         entry = *out_entry;
4765
4766         /*
4767          * Fail if the entry refers to a submap.
4768          */
4769         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4770                 return (KERN_FAILURE);
4771
4772         /*
4773          * Check whether this task is allowed to have this page.
4774          */
4775         prot = entry->protection;
4776         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4777         if ((fault_type & prot) != fault_type)
4778                 return (KERN_PROTECTION_FAILURE);
4779
4780         /*
4781          * If this page is not pageable, we have to get it for all possible
4782          * accesses.
4783          */
4784         *wired = (entry->wired_count != 0);
4785         if (*wired)
4786                 fault_type = entry->protection;
4787
4788         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4789                 /*
4790                  * Fail if the entry was copy-on-write for a write fault.
4791                  */
4792                 if (fault_type & VM_PROT_WRITE)
4793                         return (KERN_FAILURE);
4794                 /*
4795                  * We're attempting to read a copy-on-write page --
4796                  * don't allow writes.
4797                  */
4798                 prot &= ~VM_PROT_WRITE;
4799         }
4800
4801         /*
4802          * Fail if an object should be created.
4803          */
4804         if (entry->object.vm_object == NULL && !map->system_map)
4805                 return (KERN_FAILURE);
4806
4807         /*
4808          * Return the object/offset from this entry.  If the entry was
4809          * copy-on-write or empty, it has been fixed up.
4810          */
4811         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4812         *object = entry->object.vm_object;
4813
4814         *out_prot = prot;
4815         return (KERN_SUCCESS);
4816 }
4817
4818 /*
4819  *      vm_map_lookup_done:
4820  *
4821  *      Releases locks acquired by a vm_map_lookup
4822  *      (according to the handle returned by that lookup).
4823  */
4824 void
4825 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4826 {
4827         /*
4828          * Unlock the main-level map
4829          */
4830         vm_map_unlock_read(map);
4831 }
4832
4833 vm_offset_t
4834 vm_map_max_KBI(const struct vm_map *map)
4835 {
4836
4837         return (vm_map_max(map));
4838 }
4839
4840 vm_offset_t
4841 vm_map_min_KBI(const struct vm_map *map)
4842 {
4843
4844         return (vm_map_min(map));
4845 }
4846
4847 pmap_t
4848 vm_map_pmap_KBI(vm_map_t map)
4849 {
4850
4851         return (map->pmap);
4852 }
4853
4854 #ifdef INVARIANTS
4855 static void
4856 _vm_map_assert_consistent(vm_map_t map, int check)
4857 {
4858         vm_map_entry_t entry, prev;
4859         vm_size_t max_left, max_right;
4860
4861         if (enable_vmmap_check != check)
4862                 return;
4863
4864         prev = &map->header;
4865         VM_MAP_ENTRY_FOREACH(entry, map) {
4866                 KASSERT(prev->end <= entry->start,
4867                     ("map %p prev->end = %jx, start = %jx", map,
4868                     (uintmax_t)prev->end, (uintmax_t)entry->start));
4869                 KASSERT(entry->start < entry->end,
4870                     ("map %p start = %jx, end = %jx", map,
4871                     (uintmax_t)entry->start, (uintmax_t)entry->end));
4872                 KASSERT(entry->end <= vm_map_entry_succ(entry)->start,
4873                     ("map %p end = %jx, next->start = %jx", map,
4874                      (uintmax_t)entry->end,
4875                      (uintmax_t)vm_map_entry_succ(entry)->start));
4876                 KASSERT(entry->left == NULL ||
4877                     entry->left->start < entry->start,
4878                     ("map %p left->start = %jx, start = %jx", map,
4879                     (uintmax_t)entry->left->start, (uintmax_t)entry->start));
4880                 KASSERT(entry->right == NULL ||
4881                     entry->start < entry->right->start,
4882                     ("map %p start = %jx, right->start = %jx", map,
4883                     (uintmax_t)entry->start, (uintmax_t)entry->right->start));
4884                 max_left = vm_map_entry_max_free_left(entry,
4885                     vm_map_entry_pred(entry));
4886                 max_right = vm_map_entry_max_free_right(entry,
4887                     vm_map_entry_succ(entry));
4888                 KASSERT(entry->max_free == MAX(max_left, max_right),
4889                     ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
4890                     (uintmax_t)entry->max_free,
4891                     (uintmax_t)max_left, (uintmax_t)max_right));
4892                 prev = entry;
4893         }
4894         KASSERT(prev->end <= entry->start,
4895             ("map %p prev->end = %jx, start = %jx", map,
4896             (uintmax_t)prev->end, (uintmax_t)entry->start));
4897 }
4898 #endif
4899
4900 #include "opt_ddb.h"
4901 #ifdef DDB
4902 #include <sys/kernel.h>
4903
4904 #include <ddb/ddb.h>
4905
4906 static void
4907 vm_map_print(vm_map_t map)
4908 {
4909         vm_map_entry_t entry, prev;
4910
4911         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4912             (void *)map,
4913             (void *)map->pmap, map->nentries, map->timestamp);
4914
4915         db_indent += 2;
4916         prev = &map->header;
4917         VM_MAP_ENTRY_FOREACH(entry, map) {
4918                 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
4919                     (void *)entry, (void *)entry->start, (void *)entry->end,
4920                     entry->eflags);
4921                 {
4922                         static char *inheritance_name[4] =
4923                         {"share", "copy", "none", "donate_copy"};
4924
4925                         db_iprintf(" prot=%x/%x/%s",
4926                             entry->protection,
4927                             entry->max_protection,
4928                             inheritance_name[(int)(unsigned char)
4929                             entry->inheritance]);
4930                         if (entry->wired_count != 0)
4931                                 db_printf(", wired");
4932                 }
4933                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4934                         db_printf(", share=%p, offset=0x%jx\n",
4935                             (void *)entry->object.sub_map,
4936                             (uintmax_t)entry->offset);
4937                         if (prev == &map->header ||
4938                             prev->object.sub_map !=
4939                                 entry->object.sub_map) {
4940                                 db_indent += 2;
4941                                 vm_map_print((vm_map_t)entry->object.sub_map);
4942                                 db_indent -= 2;
4943                         }
4944                 } else {
4945                         if (entry->cred != NULL)
4946                                 db_printf(", ruid %d", entry->cred->cr_ruid);
4947                         db_printf(", object=%p, offset=0x%jx",
4948                             (void *)entry->object.vm_object,
4949                             (uintmax_t)entry->offset);
4950                         if (entry->object.vm_object && entry->object.vm_object->cred)
4951                                 db_printf(", obj ruid %d charge %jx",
4952                                     entry->object.vm_object->cred->cr_ruid,
4953                                     (uintmax_t)entry->object.vm_object->charge);
4954                         if (entry->eflags & MAP_ENTRY_COW)
4955                                 db_printf(", copy (%s)",
4956                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4957                         db_printf("\n");
4958
4959                         if (prev == &map->header ||
4960                             prev->object.vm_object !=
4961                                 entry->object.vm_object) {
4962                                 db_indent += 2;
4963                                 vm_object_print((db_expr_t)(intptr_t)
4964                                                 entry->object.vm_object,
4965                                                 0, 0, (char *)0);
4966                                 db_indent -= 2;
4967                         }
4968                 }
4969                 prev = entry;
4970         }
4971         db_indent -= 2;
4972 }
4973
4974 DB_SHOW_COMMAND(map, map)
4975 {
4976
4977         if (!have_addr) {
4978                 db_printf("usage: show map <addr>\n");
4979                 return;
4980         }
4981         vm_map_print((vm_map_t)addr);
4982 }
4983
4984 DB_SHOW_COMMAND(procvm, procvm)
4985 {
4986         struct proc *p;
4987
4988         if (have_addr) {
4989                 p = db_lookup_proc(addr);
4990         } else {
4991                 p = curproc;
4992         }
4993
4994         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4995             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4996             (void *)vmspace_pmap(p->p_vmspace));
4997
4998         vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4999 }
5000
5001 #endif /* DDB */