1 /*-
2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3  *
4  * Copyright (c) 1991, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
35  *
36  *
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62
63 /*
64  *      Virtual memory mapping module.
65  */
66
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/lock.h>
75 #include <sys/mutex.h>
76 #include <sys/proc.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/racct.h>
81 #include <sys/resourcevar.h>
82 #include <sys/rwlock.h>
83 #include <sys/file.h>
84 #include <sys/sysctl.h>
85 #include <sys/sysent.h>
86 #include <sys/shm.h>
87
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_pageout.h>
94 #include <vm/vm_object.h>
95 #include <vm/vm_pager.h>
96 #include <vm/vm_kern.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vnode_pager.h>
99 #include <vm/swap_pager.h>
100 #include <vm/uma.h>
101
102 /*
103  *      Virtual memory maps provide for the mapping, protection,
104  *      and sharing of virtual memory objects.  In addition,
105  *      this module provides for an efficient virtual copy of
106  *      memory from one map to another.
107  *
108  *      Synchronization is required prior to most operations.
109  *
110  *      Maps consist of an ordered doubly-linked list of simple
111  *      entries; a self-adjusting binary search tree of these
112  *      entries is used to speed up lookups.
113  *
114  *      Since portions of maps are specified by start/end addresses,
115  *      which may not align with existing map entries, all
116  *      routines merely "clip" entries to these start/end values.
117  *      [That is, an entry is split into two, bordering at a
118  *      start or end value.]  Note that clipping is not always
119  *      necessary (the two resulting entries may end up unmodified),
120  *      but it is done unconditionally for convenience.
121  *
122  *      As mentioned above, virtual copy operations are performed
123  *      by copying VM object references from one map to
124  *      another, and then marking both regions as copy-on-write.
125  */
126
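/*
 * Example (an illustrative sketch; the helper below is hypothetical and not
 * used in this file): because the entries form a circular, doubly-linked
 * list anchored at &map->header, the total mapped size can be recomputed by
 * a simple walk while the map lock is held.
 */
#if 0	/* illustrative only; not compiled */
static vm_size_t
vm_map_compute_size_example(vm_map_t map)
{
        vm_map_entry_t entry;
        vm_size_t total;

        /* The caller must hold the map lock; a shared lock is sufficient. */
        total = 0;
        for (entry = map->header.next; entry != &map->header;
            entry = entry->next)
                total += entry->end - entry->start;
        return (total);
}
#endif
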
127 static struct mtx map_sleep_mtx;
128 static uma_zone_t mapentzone;
129 static uma_zone_t kmapentzone;
130 static uma_zone_t mapzone;
131 static uma_zone_t vmspace_zone;
132 static int vmspace_zinit(void *mem, int size, int flags);
133 static int vm_map_zinit(void *mem, int size, int flags);
134 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
135     vm_offset_t max);
136 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
137 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
138 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
139 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
140     vm_map_entry_t gap_entry);
141 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
142     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
143 #ifdef INVARIANTS
144 static void vm_map_zdtor(void *mem, int size, void *arg);
145 static void vmspace_zdtor(void *mem, int size, void *arg);
146 #endif
147 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
148     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
149     int cow);
150 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
151     vm_offset_t failed_addr);
152
153 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
154     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
155      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
156
157 /* 
158  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
159  * stable.
160  */
161 #define PROC_VMSPACE_LOCK(p) do { } while (0)
162 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
163
164 /*
165  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
166  *
167  *      Asserts that the starting and ending region
168  *      addresses fall within the valid range of the map.
169  */
170 #define VM_MAP_RANGE_CHECK(map, start, end)             \
171                 {                                       \
172                 if (start < vm_map_min(map))            \
173                         start = vm_map_min(map);        \
174                 if (end > vm_map_max(map))              \
175                         end = vm_map_max(map);          \
176                 if (start > end)                        \
177                         start = end;                    \
178                 }
179
180 /*
181  *      vm_map_startup:
182  *
183  *      Initialize the vm_map module.  Must be called before
184  *      any other vm_map routines.
185  *
186  *      Map and entry structures are allocated from the general
187  *      purpose memory pool with some exceptions:
188  *
189  *      - The kernel map and kmem submap are allocated statically.
190  *      - Kernel map entries are allocated out of a static pool.
191  *
192  *      These restrictions are necessary since malloc() uses the
193  *      maps and requires map entries.
194  */
195
196 void
197 vm_map_startup(void)
198 {
199         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
200         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
201 #ifdef INVARIANTS
202             vm_map_zdtor,
203 #else
204             NULL,
205 #endif
206             vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
207         uma_prealloc(mapzone, MAX_KMAP);
208         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
209             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
210             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
211         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
212             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
213         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
214 #ifdef INVARIANTS
215             vmspace_zdtor,
216 #else
217             NULL,
218 #endif
219             vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
220 }
221
222 static int
223 vmspace_zinit(void *mem, int size, int flags)
224 {
225         struct vmspace *vm;
226
227         vm = (struct vmspace *)mem;
228
229         vm->vm_map.pmap = NULL;
230         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
231         PMAP_LOCK_INIT(vmspace_pmap(vm));
232         return (0);
233 }
234
235 static int
236 vm_map_zinit(void *mem, int size, int flags)
237 {
238         vm_map_t map;
239
240         map = (vm_map_t)mem;
241         memset(map, 0, sizeof(*map));
242         mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
243         sx_init(&map->lock, "vm map (user)");
244         return (0);
245 }
246
247 #ifdef INVARIANTS
248 static void
249 vmspace_zdtor(void *mem, int size, void *arg)
250 {
251         struct vmspace *vm;
252
253         vm = (struct vmspace *)mem;
254
255         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
256 }
257 static void
258 vm_map_zdtor(void *mem, int size, void *arg)
259 {
260         vm_map_t map;
261
262         map = (vm_map_t)mem;
263         KASSERT(map->nentries == 0,
264             ("map %p nentries == %d on free.",
265             map, map->nentries));
266         KASSERT(map->size == 0,
267             ("map %p size == %lu on free.",
268             map, (unsigned long)map->size));
269 }
270 #endif  /* INVARIANTS */
271
272 /*
273  * Allocate a vmspace structure, including a vm_map and pmap,
274  * and initialize those structures.  The refcnt is set to 1.
275  *
276  * The embedded pmap is initialized via the 'pinit' callback (e.g., pmap_pinit()).
277  */
278 struct vmspace *
279 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
280 {
281         struct vmspace *vm;
282
283         vm = uma_zalloc(vmspace_zone, M_WAITOK);
284         KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
285         if (!pinit(vmspace_pmap(vm))) {
286                 uma_zfree(vmspace_zone, vm);
287                 return (NULL);
288         }
289         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
290         _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
291         vm->vm_refcnt = 1;
292         vm->vm_shm = NULL;
293         vm->vm_swrss = 0;
294         vm->vm_tsize = 0;
295         vm->vm_dsize = 0;
296         vm->vm_ssize = 0;
297         vm->vm_taddr = 0;
298         vm->vm_daddr = 0;
299         vm->vm_maxsaddr = 0;
300         return (vm);
301 }
302
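/*
 * Example (illustrative sketch; the helper name is hypothetical): callers
 * typically pass the machine-dependent pmap_pinit() routine as 'pinit' and
 * later drop their reference with vmspace_free().
 */
#if 0	/* illustrative only; not compiled */
static struct vmspace *
vmspace_create_example(vm_offset_t minuser, vm_offset_t maxuser)
{

        /* Returns NULL if the pmap could not be initialized. */
        return (vmspace_alloc(minuser, maxuser, pmap_pinit));
}
#endif
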
303 #ifdef RACCT
304 static void
305 vmspace_container_reset(struct proc *p)
306 {
307
308         PROC_LOCK(p);
309         racct_set(p, RACCT_DATA, 0);
310         racct_set(p, RACCT_STACK, 0);
311         racct_set(p, RACCT_RSS, 0);
312         racct_set(p, RACCT_MEMLOCK, 0);
313         racct_set(p, RACCT_VMEM, 0);
314         PROC_UNLOCK(p);
315 }
316 #endif
317
318 static inline void
319 vmspace_dofree(struct vmspace *vm)
320 {
321
322         CTR1(KTR_VM, "vmspace_free: %p", vm);
323
324         /*
325          * Make sure any SysV shm is freed, it might not have been in
326          * exit1().
327          */
328         shmexit(vm);
329
330         /*
331          * Lock the map, to wait out all other references to it.
332          * Delete all of the mappings and pages they hold, then call
333          * the pmap module to reclaim anything left.
334          */
335         (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
336             vm_map_max(&vm->vm_map));
337
338         pmap_release(vmspace_pmap(vm));
339         vm->vm_map.pmap = NULL;
340         uma_zfree(vmspace_zone, vm);
341 }
342
343 void
344 vmspace_free(struct vmspace *vm)
345 {
346
347         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
348             "vmspace_free() called");
349
350         if (vm->vm_refcnt == 0)
351                 panic("vmspace_free: attempt to free already freed vmspace");
352
353         if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
354                 vmspace_dofree(vm);
355 }
356
357 void
358 vmspace_exitfree(struct proc *p)
359 {
360         struct vmspace *vm;
361
362         PROC_VMSPACE_LOCK(p);
363         vm = p->p_vmspace;
364         p->p_vmspace = NULL;
365         PROC_VMSPACE_UNLOCK(p);
366         KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
367         vmspace_free(vm);
368 }
369
370 void
371 vmspace_exit(struct thread *td)
372 {
373         int refcnt;
374         struct vmspace *vm;
375         struct proc *p;
376
377         /*
378          * Release user portion of address space.
379          * This releases references to vnodes,
380          * which could cause I/O if the file has been unlinked.
381          * Need to do this early enough that we can still sleep.
382          *
383          * The last exiting process to reach this point releases as
384          * much of the environment as it can. vmspace_dofree() is the
385          * slower fallback in case another process had a temporary
386          * reference to the vmspace.
387          */
388
389         p = td->td_proc;
390         vm = p->p_vmspace;
391         atomic_add_int(&vmspace0.vm_refcnt, 1);
392         refcnt = vm->vm_refcnt;
393         do {
394                 if (refcnt > 1 && p->p_vmspace != &vmspace0) {
395                         /* Switch now since other proc might free vmspace */
396                         PROC_VMSPACE_LOCK(p);
397                         p->p_vmspace = &vmspace0;
398                         PROC_VMSPACE_UNLOCK(p);
399                         pmap_activate(td);
400                 }
401         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
402         if (refcnt == 1) {
403                 if (p->p_vmspace != vm) {
404                         /* vmspace not yet freed, switch back */
405                         PROC_VMSPACE_LOCK(p);
406                         p->p_vmspace = vm;
407                         PROC_VMSPACE_UNLOCK(p);
408                         pmap_activate(td);
409                 }
410                 pmap_remove_pages(vmspace_pmap(vm));
411                 /* Switch now since this proc will free vmspace */
412                 PROC_VMSPACE_LOCK(p);
413                 p->p_vmspace = &vmspace0;
414                 PROC_VMSPACE_UNLOCK(p);
415                 pmap_activate(td);
416                 vmspace_dofree(vm);
417         }
418 #ifdef RACCT
419         if (racct_enable)
420                 vmspace_container_reset(p);
421 #endif
422 }
423
424 /* Acquire a reference to the vmspace owned by another process. */
425
426 struct vmspace *
427 vmspace_acquire_ref(struct proc *p)
428 {
429         struct vmspace *vm;
430         int refcnt;
431
432         PROC_VMSPACE_LOCK(p);
433         vm = p->p_vmspace;
434         if (vm == NULL) {
435                 PROC_VMSPACE_UNLOCK(p);
436                 return (NULL);
437         }
438         refcnt = vm->vm_refcnt;
439         do {
440                 if (refcnt <= 0) {      /* Avoid 0->1 transition */
441                         PROC_VMSPACE_UNLOCK(p);
442                         return (NULL);
443                 }
444         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
445         if (vm != p->p_vmspace) {
446                 PROC_VMSPACE_UNLOCK(p);
447                 vmspace_free(vm);
448                 return (NULL);
449         }
450         PROC_VMSPACE_UNLOCK(p);
451         return (vm);
452 }
453
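/*
 * Example (illustrative sketch; the helper name is hypothetical): code that
 * inspects another process's address space takes a reference with
 * vmspace_acquire_ref() and releases it with vmspace_free() when done.
 */
#if 0	/* illustrative only; not compiled */
static vm_size_t
vmspace_mapped_size_example(struct proc *p)
{
        struct vmspace *vm;
        vm_size_t size;

        vm = vmspace_acquire_ref(p);
        if (vm == NULL)
                return (0);		/* No vmspace, or it is being freed. */
        size = vm->vm_map.size;
        vmspace_free(vm);		/* Drop the reference taken above. */
        return (size);
}
#endif
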
454 /*
455  * Switch between vmspaces in an AIO kernel process.
456  *
457  * The new vmspace is either the vmspace of a user process obtained
458  * from an active AIO request or the initial vmspace of the AIO kernel
459  * process (when it is idling).  Because user processes will block to
460  * drain any active AIO requests before proceeding in exit() or
461  * execve(), the reference count for vmspaces from AIO requests can
462  * never be 0.  Similarly, AIO kernel processes hold an extra
463  * reference on their initial vmspace for the life of the process.  As
464  * a result, the 'newvm' vmspace always has a non-zero reference
465  * count.  This permits an additional reference on 'newvm' to be
466  * acquired via a simple atomic increment rather than the loop in
467  * vmspace_acquire_ref() above.
468  */
469 void
470 vmspace_switch_aio(struct vmspace *newvm)
471 {
472         struct vmspace *oldvm;
473
474         /* XXX: Need some way to assert that this is an aio daemon. */
475
476         KASSERT(newvm->vm_refcnt > 0,
477             ("vmspace_switch_aio: newvm unreferenced"));
478
479         oldvm = curproc->p_vmspace;
480         if (oldvm == newvm)
481                 return;
482
483         /*
484          * Point to the new address space and refer to it.
485          */
486         curproc->p_vmspace = newvm;
487         atomic_add_int(&newvm->vm_refcnt, 1);
488
489         /* Activate the new mapping. */
490         pmap_activate(curthread);
491
492         vmspace_free(oldvm);
493 }
494
495 void
496 _vm_map_lock(vm_map_t map, const char *file, int line)
497 {
498
499         if (map->system_map)
500                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
501         else
502                 sx_xlock_(&map->lock, file, line);
503         map->timestamp++;
504 }
505
506 void
507 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
508 {
509         vm_object_t object, object1;
510         struct vnode *vp;
511
512         if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
513                 return;
514         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
515             ("Submap with execs"));
516         object = entry->object.vm_object;
517         KASSERT(object != NULL, ("No object for text, entry %p", entry));
518         VM_OBJECT_RLOCK(object);
519         while ((object1 = object->backing_object) != NULL) {
520                 VM_OBJECT_RLOCK(object1);
521                 VM_OBJECT_RUNLOCK(object);
522                 object = object1;
523         }
524
525         vp = NULL;
526         if (object->type == OBJT_DEAD) {
527                 /*
528                  * For OBJT_DEAD objects, v_writecount was handled in
529                  * vnode_pager_dealloc().
530                  */
531         } else if (object->type == OBJT_VNODE) {
532                 vp = object->handle;
533         } else if (object->type == OBJT_SWAP) {
534                 KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
535                     ("vm_map_entry_set_vnode_text: swap and !TMPFS "
536                     "entry %p, object %p, add %d", entry, object, add));
537                 /*
538                  * Tmpfs VREG node, which was reclaimed, has
539                  * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS.  In
540                  * this case there is no v_writecount to adjust.
541                  */
542                 if ((object->flags & OBJ_TMPFS) != 0)
543                         vp = object->un_pager.swp.swp_tmpfs;
544         } else {
545                 KASSERT(0,
546                     ("vm_map_entry_set_vnode_text: wrong object type, "
547                     "entry %p, object %p, add %d", entry, object, add));
548         }
549         if (vp != NULL) {
550                 if (add) {
551                         VOP_SET_TEXT_CHECKED(vp);
552                         VM_OBJECT_RUNLOCK(object);
553                 } else {
554                         vhold(vp);
555                         VM_OBJECT_RUNLOCK(object);
556                         vn_lock(vp, LK_SHARED | LK_RETRY);
557                         VOP_UNSET_TEXT_CHECKED(vp);
558                         VOP_UNLOCK(vp, 0);
559                         vdrop(vp);
560                 }
561         } else {
562                 VM_OBJECT_RUNLOCK(object);
563         }
564 }
565
566 static void
567 vm_map_process_deferred(void)
568 {
569         struct thread *td;
570         vm_map_entry_t entry, next;
571         vm_object_t object;
572
573         td = curthread;
574         entry = td->td_map_def_user;
575         td->td_map_def_user = NULL;
576         while (entry != NULL) {
577                 next = entry->next;
578                 MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
579                     MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
580                     MAP_ENTRY_VN_EXEC));
581                 if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
582                         /*
583                          * Decrement the object's writemappings and
584                          * possibly the vnode's v_writecount.
585                          */
586                         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
587                             ("Submap with writecount"));
588                         object = entry->object.vm_object;
589                         KASSERT(object != NULL, ("No object for writecount"));
590                         vm_pager_release_writecount(object, entry->start,
591                             entry->end);
592                 }
593                 vm_map_entry_set_vnode_text(entry, false);
594                 vm_map_entry_deallocate(entry, FALSE);
595                 entry = next;
596         }
597 }
598
599 void
600 _vm_map_unlock(vm_map_t map, const char *file, int line)
601 {
602
603         if (map->system_map)
604                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
605         else {
606                 sx_xunlock_(&map->lock, file, line);
607                 vm_map_process_deferred();
608         }
609 }
610
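/*
 * Example (illustrative sketch): callers normally use the vm_map_lock() and
 * vm_map_unlock() wrappers from vm_map.h; mutating operations such as
 * vm_map_insert() require the exclusive lock.  The helper name is
 * hypothetical, and VM_PROT_RW is assumed from vm/vm.h.
 */
#if 0	/* illustrative only; not compiled */
static int
vm_map_insert_anon_example(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        int rv;

        vm_map_lock(map);
        rv = vm_map_insert(map, NULL, 0, start, end, VM_PROT_RW, VM_PROT_RW,
            0);
        vm_map_unlock(map);	/* For user maps, also runs deferred frees. */
        return (rv);
}
#endif
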
611 void
612 _vm_map_lock_read(vm_map_t map, const char *file, int line)
613 {
614
615         if (map->system_map)
616                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
617         else
618                 sx_slock_(&map->lock, file, line);
619 }
620
621 void
622 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
623 {
624
625         if (map->system_map)
626                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
627         else {
628                 sx_sunlock_(&map->lock, file, line);
629                 vm_map_process_deferred();
630         }
631 }
632
633 int
634 _vm_map_trylock(vm_map_t map, const char *file, int line)
635 {
636         int error;
637
638         error = map->system_map ?
639             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
640             !sx_try_xlock_(&map->lock, file, line);
641         if (error == 0)
642                 map->timestamp++;
643         return (error == 0);
644 }
645
646 int
647 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
648 {
649         int error;
650
651         error = map->system_map ?
652             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
653             !sx_try_slock_(&map->lock, file, line);
654         return (error == 0);
655 }
656
657 /*
658  *      _vm_map_lock_upgrade:   [ internal use only ]
659  *
660  *      Tries to upgrade a read (shared) lock on the specified map to a write
661  *      (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
662  *      non-zero value if the upgrade fails.  If the upgrade fails, the map is
663  *      returned without a read or write lock held.
664  *
665  *      Requires that the map be read locked.
666  */
667 int
668 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
669 {
670         unsigned int last_timestamp;
671
672         if (map->system_map) {
673                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
674         } else {
675                 if (!sx_try_upgrade_(&map->lock, file, line)) {
676                         last_timestamp = map->timestamp;
677                         sx_sunlock_(&map->lock, file, line);
678                         vm_map_process_deferred();
679                         /*
680                          * If the map's timestamp does not change while the
681                          * map is unlocked, then the upgrade succeeds.
682                          */
683                         sx_xlock_(&map->lock, file, line);
684                         if (last_timestamp != map->timestamp) {
685                                 sx_xunlock_(&map->lock, file, line);
686                                 return (1);
687                         }
688                 }
689         }
690         map->timestamp++;
691         return (0);
692 }
693
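/*
 * Example (illustrative sketch; the helper name is hypothetical): a reader
 * that decides it must modify the map tries to upgrade; on failure no lock
 * is held, so it must relock and revalidate anything it had looked up.
 */
#if 0	/* illustrative only; not compiled */
static void
vm_map_upgrade_pattern_example(vm_map_t map)
{

        vm_map_lock_read(map);
        if (vm_map_lock_upgrade(map) != 0) {
                /*
                 * The upgrade failed and the lock was dropped; reacquire
                 * it and revalidate any cached entries, since the map may
                 * have changed in the meantime.
                 */
                vm_map_lock(map);
        }
        /* The exclusive lock is held here either way. */
        vm_map_unlock(map);
}
#endif
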
694 void
695 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
696 {
697
698         if (map->system_map) {
699                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
700         } else
701                 sx_downgrade_(&map->lock, file, line);
702 }
703
704 /*
705  *      vm_map_locked:
706  *
707  *      Returns a non-zero value if the caller holds a write (exclusive) lock
708  *      on the specified map and the value "0" otherwise.
709  */
710 int
711 vm_map_locked(vm_map_t map)
712 {
713
714         if (map->system_map)
715                 return (mtx_owned(&map->system_mtx));
716         else
717                 return (sx_xlocked(&map->lock));
718 }
719
720 #ifdef INVARIANTS
721 static void
722 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
723 {
724
725         if (map->system_map)
726                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
727         else
728                 sx_assert_(&map->lock, SA_XLOCKED, file, line);
729 }
730
731 #define VM_MAP_ASSERT_LOCKED(map) \
732     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
733
734 #ifdef DIAGNOSTIC
735 static int enable_vmmap_check = 1;
736 #else
737 static int enable_vmmap_check = 0;
738 #endif
739 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
740     &enable_vmmap_check, 0, "Enable vm map consistency checking");
741
742 static void
743 _vm_map_assert_consistent(vm_map_t map)
744 {
745         vm_map_entry_t child, entry, prev;
746         vm_size_t max_left, max_right;
747
748         if (!enable_vmmap_check)
749                 return;
750
751         for (prev = &map->header; (entry = prev->next) != &map->header;
752             prev = entry) {
753                 KASSERT(prev->end <= entry->start,
754                     ("map %p prev->end = %jx, start = %jx", map,
755                     (uintmax_t)prev->end, (uintmax_t)entry->start));
756                 KASSERT(entry->start < entry->end,
757                     ("map %p start = %jx, end = %jx", map,
758                     (uintmax_t)entry->start, (uintmax_t)entry->end));
759                 KASSERT(entry->end <= entry->next->start,
760                     ("map %p end = %jx, next->start = %jx", map,
761                     (uintmax_t)entry->end, (uintmax_t)entry->next->start));
762                 KASSERT(entry->left == NULL ||
763                     entry->left->start < entry->start,
764                     ("map %p left->start = %jx, start = %jx", map,
765                     (uintmax_t)entry->left->start, (uintmax_t)entry->start));
766                 KASSERT(entry->right == NULL ||
767                     entry->start < entry->right->start,
768                     ("map %p start = %jx, right->start = %jx", map,
769                     (uintmax_t)entry->start, (uintmax_t)entry->right->start));
770                 child = entry->left;
771                 max_left = (child != NULL) ? child->max_free :
772                         entry->start - prev->end;
773                 child = entry->right;
774                 max_right = (child != NULL) ? child->max_free :
775                         entry->next->start - entry->end;
776                 KASSERT(entry->max_free == MAX(max_left, max_right),
777                     ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
778                      (uintmax_t)entry->max_free,
779                      (uintmax_t)max_left, (uintmax_t)max_right));
780         }       
781 }
782
783 #define VM_MAP_ASSERT_CONSISTENT(map) \
784     _vm_map_assert_consistent(map)
785 #else
786 #define VM_MAP_ASSERT_LOCKED(map)
787 #define VM_MAP_ASSERT_CONSISTENT(map)
788 #endif /* INVARIANTS */
789
790 /*
791  *      _vm_map_unlock_and_wait:
792  *
793  *      Atomically releases the lock on the specified map and puts the calling
794  *      thread to sleep.  The calling thread will remain asleep until either
795  *      vm_map_wakeup() is performed on the map or the specified timeout is
796  *      exceeded.
797  *
798  *      WARNING!  This function does not perform deferred deallocations of
799  *      objects and map entries.  Therefore, the calling thread is expected to
800  *      reacquire the map lock after reawakening and later perform an ordinary
801  *      unlock operation, such as vm_map_unlock(), before completing its
802  *      operation on the map.
803  */
804 int
805 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
806 {
807
808         mtx_lock(&map_sleep_mtx);
809         if (map->system_map)
810                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
811         else
812                 sx_xunlock_(&map->lock, file, line);
813         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
814             timo));
815 }
816
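/*
 * Example (illustrative sketch; the helper name is hypothetical): the usual
 * wait protocol, as used when an entry is found in transition, e.g. by the
 * wiring code.  Returns true if the map was not modified while unlocked.
 */
#if 0	/* illustrative only; not compiled */
static bool
vm_map_wait_transition_example(vm_map_t map, vm_map_entry_t entry)
{
        unsigned int last_timestamp;

        /* Called with the map exclusively locked. */
        entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
        last_timestamp = map->timestamp;
        (void)vm_map_unlock_and_wait(map, 0);
        vm_map_lock(map);

        /*
         * vm_map_lock() bumps the timestamp by one; any larger change
         * means the map was modified and cached entries are stale.
         */
        return (last_timestamp + 1 == map->timestamp);
}
#endif
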
817 /*
818  *      vm_map_wakeup:
819  *
820  *      Awaken any threads that have slept on the map using
821  *      vm_map_unlock_and_wait().
822  */
823 void
824 vm_map_wakeup(vm_map_t map)
825 {
826
827         /*
828          * Acquire and release map_sleep_mtx to prevent a wakeup()
829          * from being performed (and lost) between the map unlock
830          * and the msleep() in _vm_map_unlock_and_wait().
831          */
832         mtx_lock(&map_sleep_mtx);
833         mtx_unlock(&map_sleep_mtx);
834         wakeup(&map->root);
835 }
836
837 void
838 vm_map_busy(vm_map_t map)
839 {
840
841         VM_MAP_ASSERT_LOCKED(map);
842         map->busy++;
843 }
844
845 void
846 vm_map_unbusy(vm_map_t map)
847 {
848
849         VM_MAP_ASSERT_LOCKED(map);
850         KASSERT(map->busy, ("vm_map_unbusy: not busy"));
851         if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
852                 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
853                 wakeup(&map->busy);
854         }
855 }
856
857 void 
858 vm_map_wait_busy(vm_map_t map)
859 {
860
861         VM_MAP_ASSERT_LOCKED(map);
862         while (map->busy) {
863                 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
864                 if (map->system_map)
865                         msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
866                 else
867                         sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
868         }
869         map->timestamp++;
870 }
871
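/*
 * Example (illustrative sketch; the helper name is hypothetical): the busy
 * count maintained by the three functions above lets a thread keep the map
 * marked busy across an unlock, so that threads calling vm_map_wait_busy()
 * block until it is drained.
 */
#if 0	/* illustrative only; not compiled */
static void
vm_map_busy_protocol_example(vm_map_t map)
{

        vm_map_lock(map);
        vm_map_busy(map);
        vm_map_unlock(map);
        /* ... work that must not race with vm_map_wait_busy() callers ... */
        vm_map_lock(map);
        vm_map_unbusy(map);	/* Wakes any vm_map_wait_busy() sleepers. */
        vm_map_unlock(map);
}
#endif
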
872 long
873 vmspace_resident_count(struct vmspace *vmspace)
874 {
875         return (pmap_resident_count(vmspace_pmap(vmspace)));
876 }
877
878 /*
879  *      vm_map_create:
880  *
881  *      Creates and returns a new empty VM map with
882  *      the given physical map structure, and having
883  *      the given lower and upper address bounds.
884  */
885 vm_map_t
886 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
887 {
888         vm_map_t result;
889
890         result = uma_zalloc(mapzone, M_WAITOK);
891         CTR1(KTR_VM, "vm_map_create: %p", result);
892         _vm_map_init(result, pmap, min, max);
893         return (result);
894 }
895
896 /*
897  * Initialize an existing vm_map structure
898  * such as that in the vmspace structure.
899  */
900 static void
901 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
902 {
903
904         map->header.next = map->header.prev = &map->header;
905         map->header.eflags = MAP_ENTRY_HEADER;
906         map->needs_wakeup = FALSE;
907         map->system_map = 0;
908         map->pmap = pmap;
909         map->header.end = min;
910         map->header.start = max;
911         map->flags = 0;
912         map->root = NULL;
913         map->timestamp = 0;
914         map->busy = 0;
915         map->anon_loc = 0;
916 }
917
918 void
919 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
920 {
921
922         _vm_map_init(map, pmap, min, max);
923         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
924         sx_init(&map->lock, "user map");
925 }
926
927 /*
928  *      vm_map_entry_dispose:   [ internal use only ]
929  *
930  *      Inverse of vm_map_entry_create.
931  */
932 static void
933 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
934 {
935         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
936 }
937
938 /*
939  *      vm_map_entry_create:    [ internal use only ]
940  *
941  *      Allocates a VM map entry for insertion.
942  *      No entry fields are filled in.
943  */
944 static vm_map_entry_t
945 vm_map_entry_create(vm_map_t map)
946 {
947         vm_map_entry_t new_entry;
948
949         if (map->system_map)
950                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
951         else
952                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
953         if (new_entry == NULL)
954                 panic("vm_map_entry_create: kernel resources exhausted");
955         return (new_entry);
956 }
957
958 /*
959  *      vm_map_entry_set_behavior:
960  *
961  *      Set the expected access behavior, either normal, random, or
962  *      sequential.
963  */
964 static inline void
965 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
966 {
967         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
968             (behavior & MAP_ENTRY_BEHAV_MASK);
969 }
970
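/*
 * Example (illustrative sketch; the helper name is hypothetical): this is
 * how madvise(2)-style advice is recorded as a per-entry hint, e.g. for
 * sequential access.
 */
#if 0	/* illustrative only; not compiled */
static void
vm_map_entry_advise_sequential_example(vm_map_entry_t entry)
{

        vm_map_entry_set_behavior(entry, MAP_ENTRY_BEHAV_SEQUENTIAL);
}
#endif
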
971 /*
972  *      vm_map_entry_max_free_{left,right}:
973  *
974  *      Compute the size of the largest free gap between two entries,
975  *      one the root of a tree and the other the ancestor of that root
976  *      that is the least or greatest ancestor found on the search path.
977  */
978 static inline vm_size_t
979 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
980 {
981
982         return (root->left != NULL ?
983             root->left->max_free : root->start - left_ancestor->end);
984 }
985
986 static inline vm_size_t
987 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
988 {
989
990         return (root->right != NULL ?
991             root->right->max_free : right_ancestor->start - root->end);
992 }
993
994 #define SPLAY_LEFT_STEP(root, y, rlist, test) do {                      \
995         vm_size_t max_free;                                             \
996                                                                         \
997         /*                                                              \
998          * Infer root->right->max_free == root->max_free when           \
999          * y->max_free < root->max_free || root->max_free == 0.         \
1000          * Otherwise, look right to find it.                            \
1001          */                                                             \
1002         y = root->left;                                                 \
1003         max_free = root->max_free;                                      \
1004         KASSERT(max_free >= vm_map_entry_max_free_right(root, rlist),   \
1005             ("%s: max_free invariant fails", __func__));                \
1006         if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free)      \
1007                 max_free = vm_map_entry_max_free_right(root, rlist);    \
1008         if (y != NULL && (test)) {                                      \
1009                 /* Rotate right and make y root. */                     \
1010                 root->left = y->right;                                  \
1011                 y->right = root;                                        \
1012                 if (max_free < y->max_free)                             \
1013                         root->max_free = max_free = MAX(max_free,       \
1014                             vm_map_entry_max_free_left(root, y));       \
1015                 root = y;                                               \
1016                 y = root->left;                                         \
1017         }                                                               \
1018         /* Copy right->max_free.  Put root on rlist. */                 \
1019         root->max_free = max_free;                                      \
1020         KASSERT(max_free == vm_map_entry_max_free_right(root, rlist),   \
1021             ("%s: max_free not copied from right", __func__));          \
1022         root->left = rlist;                                             \
1023         rlist = root;                                                   \
1024         root = y;                                                       \
1025 } while (0)
1026
1027 #define SPLAY_RIGHT_STEP(root, y, llist, test) do {                     \
1028         vm_size_t max_free;                                             \
1029                                                                         \
1030         /*                                                              \
1031          * Infer root->left->max_free == root->max_free when            \
1032          * y->max_free < root->max_free || root->max_free == 0.         \
1033          * Otherwise, look left to find it.                             \
1034          */                                                             \
1035         y = root->right;                                                \
1036         max_free = root->max_free;                                      \
1037         KASSERT(max_free >= vm_map_entry_max_free_left(root, llist),    \
1038             ("%s: max_free invariant fails", __func__));                \
1039         if (y == NULL ? max_free > 0 : max_free - 1 < y->max_free)      \
1040                 max_free = vm_map_entry_max_free_left(root, llist);     \
1041         if (y != NULL && (test)) {                                      \
1042                 /* Rotate left and make y root. */                      \
1043                 root->right = y->left;                                  \
1044                 y->left = root;                                         \
1045                 if (max_free < y->max_free)                             \
1046                         root->max_free = max_free = MAX(max_free,       \
1047                             vm_map_entry_max_free_right(root, y));      \
1048                 root = y;                                               \
1049                 y = root->right;                                        \
1050         }                                                               \
1051         /* Copy left->max_free.  Put root on llist. */                  \
1052         root->max_free = max_free;                                      \
1053         KASSERT(max_free == vm_map_entry_max_free_left(root, llist),    \
1054             ("%s: max_free not copied from left", __func__));           \
1055         root->right = llist;                                            \
1056         llist = root;                                                   \
1057         root = y;                                                       \
1058 } while (0)
1059
1060 /*
1061  * Walk down the tree until we find addr or a NULL pointer where addr would go,
1062  * breaking off left and right subtrees of nodes less than, or greater than
1063  * addr.  Treat pointers to nodes with max_free < length as NULL pointers.
1064  * llist and rlist are the two sides in reverse order (bottom-up), with llist
1065  * linked by the right pointer and rlist linked by the left pointer in the
1066  * vm_map_entry, and both lists terminated by &map->header.  This function, and
1067  * the subsequent call to vm_map_splay_merge, rely on the start and end address
1068  * values in &map->header.
1069  */
1070 static vm_map_entry_t
1071 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1072     vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist)
1073 {
1074         vm_map_entry_t llist, rlist, root, y;
1075
1076         llist = rlist = &map->header;
1077         root = map->root;
1078         while (root != NULL && root->max_free >= length) {
1079                 KASSERT(llist->end <= root->start && root->end <= rlist->start,
1080                     ("%s: root not within tree bounds", __func__));
1081                 if (addr < root->start) {
1082                         SPLAY_LEFT_STEP(root, y, rlist,
1083                             y->max_free >= length && addr < y->start);
1084                 } else if (addr >= root->end) {
1085                         SPLAY_RIGHT_STEP(root, y, llist,
1086                             y->max_free >= length && addr >= y->end);
1087                 } else
1088                         break;
1089         }
1090         *out_llist = llist;
1091         *out_rlist = rlist;
1092         return (root);
1093 }
1094
1095 static void
1096 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist)
1097 {
1098         vm_map_entry_t rlist, y;
1099
1100         root = root->right;
1101         rlist = *iolist;
1102         while (root != NULL)
1103                 SPLAY_LEFT_STEP(root, y, rlist, true);
1104         *iolist = rlist;
1105 }
1106
1107 static void
1108 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist)
1109 {
1110         vm_map_entry_t llist, y;
1111
1112         root = root->left;
1113         llist = *iolist;
1114         while (root != NULL)
1115                 SPLAY_RIGHT_STEP(root, y, llist, true);
1116         *iolist = llist;
1117 }
1118
1119 static inline void
1120 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1121 {
1122         vm_map_entry_t tmp;
1123
1124         tmp = *b;
1125         *b = *a;
1126         *a = tmp;
1127 }
1128
1129 /*
1130  * Walk back up the two spines, flip the pointers and set max_free.  The
1131  * subtrees of the root go at the bottom of llist and rlist.
1132  */
1133 static void
1134 vm_map_splay_merge(vm_map_t map, vm_map_entry_t root,
1135     vm_map_entry_t llist, vm_map_entry_t rlist)
1136 {
1137         vm_map_entry_t prev;
1138         vm_size_t max_free_left, max_free_right;
1139
1140         max_free_left = vm_map_entry_max_free_left(root, llist);
1141         if (llist != &map->header) {
1142                 prev = root->left;
1143                 do {
1144                         /*
1145                          * The max_free values of the children of llist are in
1146                          * llist->max_free and max_free_left.  Update with the
1147                          * max value.
1148                          */
1149                         llist->max_free = max_free_left =
1150                             MAX(llist->max_free, max_free_left);
1151                         vm_map_entry_swap(&llist->right, &prev);
1152                         vm_map_entry_swap(&prev, &llist);
1153                 } while (llist != &map->header);
1154                 root->left = prev;
1155         }
1156         max_free_right = vm_map_entry_max_free_right(root, rlist);
1157         if (rlist != &map->header) {
1158                 prev = root->right;
1159                 do {
1160                         /*
1161                          * The max_free values of the children of rlist are in
1162                          * rlist->max_free and max_free_right.  Update with the
1163                          * max value.
1164                          */
1165                         rlist->max_free = max_free_right =
1166                             MAX(rlist->max_free, max_free_right);
1167                         vm_map_entry_swap(&rlist->left, &prev);
1168                         vm_map_entry_swap(&prev, &rlist);
1169                 } while (rlist != &map->header);
1170                 root->right = prev;
1171         }               
1172         root->max_free = MAX(max_free_left, max_free_right);
1173         map->root = root;
1174 }
1175
1176 /*
1177  *      vm_map_splay:
1178  *
1179  *      The Sleator and Tarjan top-down splay algorithm with the
1180  *      following variation.  Max_free must be computed bottom-up, so
1181  *      on the downward pass, maintain the left and right spines in
1182  *      reverse order.  Then, make a second pass up each side to fix
1183  *      the pointers and compute max_free.  The time bound is O(log n)
1184  *      amortized.
1185  *
1186  *      The new root is the vm_map_entry containing "addr", or else an
1187  *      adjacent entry (lower if possible) if addr is not in the tree.
1188  *
1189  *      The map must be locked, and leaves it so.
1190  *
1191  *      Returns: the new root.
1192  */
1193 static vm_map_entry_t
1194 vm_map_splay(vm_map_t map, vm_offset_t addr)
1195 {
1196         vm_map_entry_t llist, rlist, root;
1197
1198         root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1199         if (root != NULL) {
1200                 /* do nothing */
1201         } else if (llist != &map->header) {
1202                 /*
1203                  * Recover the greatest node in the left
1204                  * subtree and make it the root.
1205                  */
1206                 root = llist;
1207                 llist = root->right;
1208                 root->right = NULL;
1209         } else if (rlist != &map->header) {
1210                 /*
1211                  * Recover the least node in the right
1212                  * subtree and make it the root.
1213                  */
1214                 root = rlist;
1215                 rlist = root->left;
1216                 root->left = NULL;
1217         } else {
1218                 /* There is no root. */
1219                 return (NULL);
1220         }
1221         vm_map_splay_merge(map, root, llist, rlist);
1222         VM_MAP_ASSERT_CONSISTENT(map);
1223         return (root);
1224 }
1225
1226 /*
1227  *      vm_map_entry_{un,}link:
1228  *
1229  *      Insert/remove entries from maps.
1230  */
1231 static void
1232 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1233 {
1234         vm_map_entry_t llist, rlist, root;
1235
1236         CTR3(KTR_VM,
1237             "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1238             map->nentries, entry);
1239         VM_MAP_ASSERT_LOCKED(map);
1240         map->nentries++;
1241         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1242         KASSERT(root == NULL,
1243             ("vm_map_entry_link: link object already mapped"));
1244         entry->prev = llist;
1245         entry->next = rlist;
1246         llist->next = rlist->prev = entry;
1247         entry->left = entry->right = NULL;
1248         vm_map_splay_merge(map, entry, llist, rlist);
1249         VM_MAP_ASSERT_CONSISTENT(map);
1250 }
1251
1252 enum unlink_merge_type {
1253         UNLINK_MERGE_NONE,
1254         UNLINK_MERGE_NEXT
1255 };
1256
1257 static void
1258 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1259     enum unlink_merge_type op)
1260 {
1261         vm_map_entry_t llist, rlist, root, y;
1262
1263         VM_MAP_ASSERT_LOCKED(map);
1264         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1265         KASSERT(root != NULL,
1266             ("vm_map_entry_unlink: unlink object not mapped"));
1267
1268         vm_map_splay_findnext(root, &rlist);
1269         switch (op) {
1270         case UNLINK_MERGE_NEXT:
1271                 rlist->start = root->start;
1272                 rlist->offset = root->offset;
1273                 y = root->left;
1274                 root = rlist;
1275                 rlist = root->left;
1276                 root->left = y;
1277                 break;
1278         case UNLINK_MERGE_NONE:
1279                 vm_map_splay_findprev(root, &llist);
1280                 if (llist != &map->header) {
1281                         root = llist;
1282                         llist = root->right;
1283                         root->right = NULL;
1284                 } else if (rlist != &map->header) {
1285                         root = rlist;
1286                         rlist = root->left;
1287                         root->left = NULL;
1288                 } else
1289                         root = NULL;
1290                 break;
1291         }
1292         y = entry->next;
1293         y->prev = entry->prev;
1294         y->prev->next = y;
1295         if (root != NULL)
1296                 vm_map_splay_merge(map, root, llist, rlist);
1297         else
1298                 map->root = NULL;
1299         VM_MAP_ASSERT_CONSISTENT(map);
1300         map->nentries--;
1301         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1302             map->nentries, entry);
1303 }
1304
1305 /*
1306  *      vm_map_entry_resize:
1307  *
1308  *      Resize a vm_map_entry, recompute the amount of free space that
1309  *      follows it and propagate that value up the tree.
1310  *
1311  *      The map must be locked, and leaves it so.
1312  */
1313 static void
1314 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1315 {
1316         vm_map_entry_t llist, rlist, root;
1317
1318         VM_MAP_ASSERT_LOCKED(map);
1319         root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1320         KASSERT(root != NULL,
1321             ("%s: resize object not mapped", __func__));
1322         vm_map_splay_findnext(root, &rlist);
1323         root->right = NULL;
1324         entry->end += grow_amount;
1325         vm_map_splay_merge(map, root, llist, rlist);
1326         VM_MAP_ASSERT_CONSISTENT(map);
1327         CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1328             __func__, map, map->nentries, entry);
1329 }
1330
1331 /*
1332  *      vm_map_lookup_entry:    [ internal use only ]
1333  *
1334  *      Finds the map entry containing (or
1335  *      immediately preceding) the specified address
1336  *      in the given map; the entry is returned
1337  *      in the "entry" parameter.  The boolean
1338  *      result indicates whether the address is
1339  *      actually contained in the map.
1340  */
1341 boolean_t
1342 vm_map_lookup_entry(
1343         vm_map_t map,
1344         vm_offset_t address,
1345         vm_map_entry_t *entry)  /* OUT */
1346 {
1347         vm_map_entry_t cur, lbound;
1348         boolean_t locked;
1349
1350         /*
1351          * If the map is empty, then the map entry immediately preceding
1352          * "address" is the map's header.
1353          */
1354         cur = map->root;
1355         if (cur == NULL) {
1356                 *entry = &map->header;
1357                 return (FALSE);
1358         }
1359         if (address >= cur->start && cur->end > address) {
1360                 *entry = cur;
1361                 return (TRUE);
1362         }
1363         if ((locked = vm_map_locked(map)) ||
1364             sx_try_upgrade(&map->lock)) {
1365                 /*
1366                  * Splay requires a write lock on the map.  However, it only
1367                  * restructures the binary search tree; it does not otherwise
1368                  * change the map.  Thus, the map's timestamp need not change
1369                  * on a temporary upgrade.
1370                  */
1371                 cur = vm_map_splay(map, address);
1372                 if (!locked)
1373                         sx_downgrade(&map->lock);
1374
1375                 /*
1376                  * If "address" is contained within a map entry, the new root
1377                  * is that map entry.  Otherwise, the new root is a map entry
1378                  * immediately before or after "address".
1379                  */
1380                 if (address < cur->start) {
1381                         *entry = &map->header;
1382                         return (FALSE);
1383                 }
1384                 *entry = cur;
1385                 return (address < cur->end);
1386         }
1387         /*
1388          * Since the map is only locked for read access, perform a
1389          * standard binary search tree lookup for "address".
1390          */
1391         lbound = &map->header;
1392         do {
1393                 if (address < cur->start) {
1394                         cur = cur->left;
1395                 } else if (cur->end <= address) {
1396                         lbound = cur;
1397                         cur = cur->right;
1398                 } else {
1399                         *entry = cur;
1400                         return (TRUE);
1401                 }
1402         } while (cur != NULL);
1403         *entry = lbound;
1404         return (FALSE);
1405 }
1406
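/*
 * Example (illustrative sketch; the helper name is hypothetical): a lookup
 * needs only a read lock; on success *entry contains the address, otherwise
 * *entry is the preceding entry (or the header).
 */
#if 0	/* illustrative only; not compiled */
static vm_prot_t
vm_map_prot_at_example(vm_map_t map, vm_offset_t addr)
{
        vm_map_entry_t entry;
        vm_prot_t prot;

        vm_map_lock_read(map);
        if (vm_map_lookup_entry(map, addr, &entry))
                prot = entry->protection;
        else
                prot = VM_PROT_NONE;
        vm_map_unlock_read(map);
        return (prot);
}
#endif
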
1407 /*
1408  *      vm_map_insert:
1409  *
1410  *      Inserts the given whole VM object into the target
1411  *      map at the specified address range.  The object's
1412  *      size should match that of the address range.
1413  *
1414  *      Requires that the map be locked, and leaves it so.
1415  *
1416  *      If object is non-NULL, ref count must be bumped by caller
1417  *      prior to making call to account for the new entry.
1418  */
1419 int
1420 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1421     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1422 {
1423         vm_map_entry_t new_entry, prev_entry;
1424         struct ucred *cred;
1425         vm_eflags_t protoeflags;
1426         vm_inherit_t inheritance;
1427
1428         VM_MAP_ASSERT_LOCKED(map);
1429         KASSERT(object != kernel_object ||
1430             (cow & MAP_COPY_ON_WRITE) == 0,
1431             ("vm_map_insert: kernel object and COW"));
1432         KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1433             ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1434         KASSERT((prot & ~max) == 0,
1435             ("prot %#x is not subset of max_prot %#x", prot, max));
1436
1437         /*
1438          * Check that the start and end points are not bogus.
1439          */
1440         if (start < vm_map_min(map) || end > vm_map_max(map) ||
1441             start >= end)
1442                 return (KERN_INVALID_ADDRESS);
1443
1444         /*
1445          * Find the entry prior to the proposed starting address; if it's part
1446          * of an existing entry, this range is bogus.
1447          */
1448         if (vm_map_lookup_entry(map, start, &prev_entry))
1449                 return (KERN_NO_SPACE);
1450
1451         /*
1452          * Assert that the next entry doesn't overlap the end point.
1453          */
1454         if (prev_entry->next->start < end)
1455                 return (KERN_NO_SPACE);
1456
1457         if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1458             max != VM_PROT_NONE))
1459                 return (KERN_INVALID_ARGUMENT);
1460
1461         protoeflags = 0;
1462         if (cow & MAP_COPY_ON_WRITE)
1463                 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1464         if (cow & MAP_NOFAULT)
1465                 protoeflags |= MAP_ENTRY_NOFAULT;
1466         if (cow & MAP_DISABLE_SYNCER)
1467                 protoeflags |= MAP_ENTRY_NOSYNC;
1468         if (cow & MAP_DISABLE_COREDUMP)
1469                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1470         if (cow & MAP_STACK_GROWS_DOWN)
1471                 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1472         if (cow & MAP_STACK_GROWS_UP)
1473                 protoeflags |= MAP_ENTRY_GROWS_UP;
1474         if (cow & MAP_WRITECOUNT)
1475                 protoeflags |= MAP_ENTRY_WRITECNT;
1476         if (cow & MAP_VN_EXEC)
1477                 protoeflags |= MAP_ENTRY_VN_EXEC;
1478         if ((cow & MAP_CREATE_GUARD) != 0)
1479                 protoeflags |= MAP_ENTRY_GUARD;
1480         if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1481                 protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1482         if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1483                 protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1484         if (cow & MAP_INHERIT_SHARE)
1485                 inheritance = VM_INHERIT_SHARE;
1486         else
1487                 inheritance = VM_INHERIT_DEFAULT;
1488
1489         cred = NULL;
1490         if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1491                 goto charged;
1492         if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1493             ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1494                 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1495                         return (KERN_RESOURCE_SHORTAGE);
1496                 KASSERT(object == NULL ||
1497                     (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1498                     object->cred == NULL,
1499                     ("overcommit: vm_map_insert o %p", object));
1500                 cred = curthread->td_ucred;
1501         }
1502
1503 charged:
1504         /* Expand the kernel pmap, if necessary. */
1505         if (map == kernel_map && end > kernel_vm_end)
1506                 pmap_growkernel(end);
1507         if (object != NULL) {
1508                 /*
1509                  * OBJ_ONEMAPPING must be cleared unless this mapping
1510                  * is trivially proven to be the only mapping for any
1511                  * of the object's pages.  (Object granularity
1512                  * reference counting is insufficient to recognize
1513                  * aliases with precision.)
1514                  */
1515                 VM_OBJECT_WLOCK(object);
1516                 if (object->ref_count > 1 || object->shadow_count != 0)
1517                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1518                 VM_OBJECT_WUNLOCK(object);
1519         } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1520             protoeflags &&
1521             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
1522             MAP_VN_EXEC)) == 0 &&
1523             prev_entry->end == start && (prev_entry->cred == cred ||
1524             (prev_entry->object.vm_object != NULL &&
1525             prev_entry->object.vm_object->cred == cred)) &&
1526             vm_object_coalesce(prev_entry->object.vm_object,
1527             prev_entry->offset,
1528             (vm_size_t)(prev_entry->end - prev_entry->start),
1529             (vm_size_t)(end - prev_entry->end), cred != NULL &&
1530             (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1531                 /*
1532                  * We were able to extend the object.  Determine if we
1533                  * can extend the previous map entry to include the
1534                  * new range as well.
1535                  */
1536                 if (prev_entry->inheritance == inheritance &&
1537                     prev_entry->protection == prot &&
1538                     prev_entry->max_protection == max &&
1539                     prev_entry->wired_count == 0) {
1540                         KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1541                             0, ("prev_entry %p has incoherent wiring",
1542                             prev_entry));
1543                         if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1544                                 map->size += end - prev_entry->end;
1545                         vm_map_entry_resize(map, prev_entry,
1546                             end - prev_entry->end);
1547                         vm_map_try_merge_entries(map, prev_entry, prev_entry->next);
1548                         return (KERN_SUCCESS);
1549                 }
1550
1551                 /*
1552                  * If we can extend the object but cannot extend the
1553                  * map entry, we have to create a new map entry.  We
1554                  * must bump the ref count on the extended object to
1555                  * account for it.  object may be NULL.
1556                  */
1557                 object = prev_entry->object.vm_object;
1558                 offset = prev_entry->offset +
1559                     (prev_entry->end - prev_entry->start);
1560                 vm_object_reference(object);
1561                 if (cred != NULL && object != NULL && object->cred != NULL &&
1562                     !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1563                         /* Object already accounts for this uid. */
1564                         cred = NULL;
1565                 }
1566         }
1567         if (cred != NULL)
1568                 crhold(cred);
1569
1570         /*
1571          * Create a new entry
1572          */
1573         new_entry = vm_map_entry_create(map);
1574         new_entry->start = start;
1575         new_entry->end = end;
1576         new_entry->cred = NULL;
1577
1578         new_entry->eflags = protoeflags;
1579         new_entry->object.vm_object = object;
1580         new_entry->offset = offset;
1581
1582         new_entry->inheritance = inheritance;
1583         new_entry->protection = prot;
1584         new_entry->max_protection = max;
1585         new_entry->wired_count = 0;
1586         new_entry->wiring_thread = NULL;
1587         new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1588         new_entry->next_read = start;
1589
1590         KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1591             ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1592         new_entry->cred = cred;
1593
1594         /*
1595          * Insert the new entry into the list
1596          */
1597         vm_map_entry_link(map, new_entry);
1598         if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1599                 map->size += new_entry->end - new_entry->start;
1600
1601         /*
1602          * Try to coalesce the new entry with both the previous and next
1603          * entries in the list.  Previously, we only attempted to coalesce
1604          * with the previous entry when object is NULL.  Here, we handle the
1605          * other cases, which are less common.
1606          */
1607         vm_map_try_merge_entries(map, prev_entry, new_entry);
1608         vm_map_try_merge_entries(map, new_entry, new_entry->next);
1609
1610         if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1611                 vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1612                     end - start, cow & MAP_PREFAULT_PARTIAL);
1613         }
1614
1615         return (KERN_SUCCESS);
1616 }
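
/*
 * A minimal caller sketch (hypothetical, illustrative only): map "obj" at
 * [start, start + len) with the map already write-locked, bumping the
 * object's reference count beforehand as the header comment above requires.
 * Error handling (dropping the reference if the insert fails) is omitted.
 */
static int
example_insert(vm_map_t map, vm_object_t obj, vm_offset_t start,
    vm_size_t len)
{

        VM_MAP_ASSERT_LOCKED(map);
        vm_object_reference(obj);
        return (vm_map_insert(map, obj, 0, start, start + len,
            VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}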
1617
1618 /*
1619  *      vm_map_findspace:
1620  *
1621  *      Find the first fit (lowest VM address) for "length" free bytes
1622  *      beginning at address >= start in the given map.
1623  *
1624  *      In a vm_map_entry, "max_free" is the maximum amount of
1625  *      contiguous free space between an entry in its subtree and a
1626  *      neighbor of that entry.  This allows finding a free region in
1627  *      one path down the tree, so O(log n) amortized with splay
1628  *      trees.
1629  *
1630  *      The map must be locked, and is left so on return.
1631  *
1632  *      Returns: starting address if sufficient space,
1633  *               vm_map_max(map)-length+1 if insufficient space.
1634  */
1635 vm_offset_t
1636 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1637 {
1638         vm_map_entry_t llist, rlist, root, y;
1639         vm_size_t left_length;
1640         vm_offset_t gap_end;
1641
1642         /*
1643          * Request must fit within min/max VM address and must avoid
1644          * address wrap.
1645          */
1646         start = MAX(start, vm_map_min(map));
1647         if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1648                 return (vm_map_max(map) - length + 1);
1649
1650         /* Empty tree means wide open address space. */
1651         if (map->root == NULL)
1652                 return (start);
1653
1654         /*
1655          * After splay_split, if start is within an entry, push it to the start
1656          * of the following gap.  If rlist is at the end of the gap containing
1657          * start, save the end of that gap in gap_end to see if the gap is big
1658          * enough; otherwise set gap_end to start, to skip gap-checking and move
1659          * directly to a search of the right subtree.
1660          */
1661         root = vm_map_splay_split(map, start, length, &llist, &rlist);
1662         gap_end = rlist->start;
1663         if (root != NULL) {
1664                 start = root->end;
1665                 if (root->right != NULL)
1666                         gap_end = start;
1667         } else if (rlist != &map->header) {
1668                 root = rlist;
1669                 rlist = root->left;
1670                 root->left = NULL;
1671         } else {
1672                 root = llist;
1673                 llist = root->right;
1674                 root->right = NULL;
1675         }
1676         vm_map_splay_merge(map, root, llist, rlist);
1677         VM_MAP_ASSERT_CONSISTENT(map);
1678         if (length <= gap_end - start)
1679                 return (start);
1680
1681         /* With max_free, can immediately tell if no solution. */
1682         if (root->right == NULL || length > root->right->max_free)
1683                 return (vm_map_max(map) - length + 1);
1684
1685         /*
1686          * Splay for the least large-enough gap in the right subtree.
1687          */
1688         llist = rlist = &map->header;
1689         for (left_length = 0;;
1690             left_length = vm_map_entry_max_free_left(root, llist)) {
1691                 if (length <= left_length)
1692                         SPLAY_LEFT_STEP(root, y, rlist,
1693                             length <= vm_map_entry_max_free_left(y, llist));
1694                 else
1695                         SPLAY_RIGHT_STEP(root, y, llist,
1696                             length > vm_map_entry_max_free_left(y, root));
1697                 if (root == NULL)
1698                         break;
1699         }
1700         root = llist;
1701         llist = root->right;
1702         root->right = NULL;
1703         if (rlist != &map->header) {
1704                 y = rlist;
1705                 rlist = y->left;
1706                 y->left = NULL;
1707                 vm_map_splay_merge(map, y, &map->header, rlist);
1708                 y->max_free = MAX(
1709                     vm_map_entry_max_free_left(y, root),
1710                     vm_map_entry_max_free_right(y, &map->header));
1711                 root->right = y;
1712         }
1713         vm_map_splay_merge(map, root, llist, &map->header);
1714         VM_MAP_ASSERT_CONSISTENT(map);
1715         return (root->end);
1716 }
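
/*
 * A sketch of the return convention (hypothetical helper, illustrative
 * only): success means the returned address plus "length" still fits below
 * vm_map_max(map); the failure value vm_map_max(map) - length + 1 makes
 * that test false, which is exactly how the callers below check it.
 */
static bool
example_findspace_ok(vm_map_t map, vm_offset_t start, vm_size_t length,
    vm_offset_t *addr)
{

        VM_MAP_ASSERT_LOCKED(map);
        *addr = vm_map_findspace(map, start, length);
        return (*addr + length <= vm_map_max(map));
}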
1717
1718 int
1719 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1720     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1721     vm_prot_t max, int cow)
1722 {
1723         vm_offset_t end;
1724         int result;
1725
1726         end = start + length;
1727         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1728             object == NULL,
1729             ("vm_map_fixed: non-NULL backing object for stack"));
1730         vm_map_lock(map);
1731         VM_MAP_RANGE_CHECK(map, start, end);
1732         if ((cow & MAP_CHECK_EXCL) == 0)
1733                 vm_map_delete(map, start, end);
1734         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1735                 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1736                     prot, max, cow);
1737         } else {
1738                 result = vm_map_insert(map, object, offset, start, end,
1739                     prot, max, cow);
1740         }
1741         vm_map_unlock(map);
1742         return (result);
1743 }
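
/*
 * A minimal caller sketch (hypothetical, illustrative only): establish a
 * fixed mapping without clobbering existing entries.  With MAP_CHECK_EXCL
 * the pre-existing range is not deleted, so vm_map_insert() fails with
 * KERN_NO_SPACE if anything already lives there.  Error handling for the
 * object reference is omitted.
 */
static int
example_map_fixed_excl(vm_map_t map, vm_object_t obj, vm_offset_t start,
    vm_size_t length)
{

        vm_object_reference(obj);
        return (vm_map_fixed(map, obj, 0, start, length, VM_PROT_READ,
            VM_PROT_ALL, MAP_CHECK_EXCL));
}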
1744
1745 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
1746 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
1747
1748 static int cluster_anon = 1;
1749 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
1750     &cluster_anon, 0,
1751     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
1752
1753 static bool
1754 clustering_anon_allowed(vm_offset_t addr)
1755 {
1756
1757         switch (cluster_anon) {
1758         case 0:
1759                 return (false);
1760         case 1:
1761                 return (addr == 0);
1762         case 2:
1763         default:
1764                 return (true);
1765         }
1766 }
1767
1768 static long aslr_restarts;
1769 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
1770     &aslr_restarts, 0,
1771     "Number of aslr failures");
1772
1773 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
1774
1775 /*
1776  * Searches for the specified amount of free space in the given map with the
1777  * specified alignment.  Performs an address-ordered, first-fit search from
1778  * the given address "*addr", with an optional upper bound "max_addr".  If the
1779  * parameter "alignment" is zero, then the alignment is computed from the
1780  * given (object, offset) pair so as to enable the greatest possible use of
1781  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
1782  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
1783  *
1784  * The map must be locked.  Initially, there must be at least "length" bytes
1785  * of free space at the given address.
1786  */
1787 static int
1788 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1789     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
1790     vm_offset_t alignment)
1791 {
1792         vm_offset_t aligned_addr, free_addr;
1793
1794         VM_MAP_ASSERT_LOCKED(map);
1795         free_addr = *addr;
1796         KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
1797             ("caller failed to provide space %#jx at address %p",
1798              (uintmax_t)length, (void *)free_addr));
1799         for (;;) {
1800                 /*
1801                  * At the start of every iteration, the free space at address
1802                  * "*addr" is at least "length" bytes.
1803                  */
1804                 if (alignment == 0)
1805                         pmap_align_superpage(object, offset, addr, length);
1806                 else if ((*addr & (alignment - 1)) != 0) {
1807                         *addr &= ~(alignment - 1);
1808                         *addr += alignment;
1809                 }
1810                 aligned_addr = *addr;
1811                 if (aligned_addr == free_addr) {
1812                         /*
1813                          * Alignment did not change "*addr", so "*addr" must
1814                          * still provide sufficient free space.
1815                          */
1816                         return (KERN_SUCCESS);
1817                 }
1818
1819                 /*
1820                  * Test for address wrap on "*addr".  A wrapped "*addr" could
1821                  * be a valid address, in which case vm_map_findspace() cannot
1822                  * be relied upon to fail.
1823                  */
1824                 if (aligned_addr < free_addr)
1825                         return (KERN_NO_SPACE);
1826                 *addr = vm_map_findspace(map, aligned_addr, length);
1827                 if (*addr + length > vm_map_max(map) ||
1828                     (max_addr != 0 && *addr + length > max_addr))
1829                         return (KERN_NO_SPACE);
1830                 free_addr = *addr;
1831                 if (free_addr == aligned_addr) {
1832                         /*
1833                          * If a successful call to vm_map_findspace() did not
1834                          * change "*addr", then "*addr" must still be aligned
1835                          * and provide sufficient free space.
1836                          */
1837                         return (KERN_SUCCESS);
1838                 }
1839         }
1840 }
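
/*
 * The alignment step above, isolated for illustration (hypothetical helper),
 * assuming "alignment" is a nonzero power of two: clear the low bits and,
 * if that moved the address down, bump it up to the next boundary.
 */
static vm_offset_t
example_roundup_pow2(vm_offset_t addr, vm_offset_t alignment)
{

        if ((addr & (alignment - 1)) != 0) {
                addr &= ~(alignment - 1);
                addr += alignment;
        }
        return (addr);
}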
1841
1842 /*
1843  *      vm_map_find finds an unallocated region in the target address
1844  *      map with the given length.  The search is defined to be
1845  *      first-fit from the specified address; the region found is
1846  *      returned in the same parameter.
1847  *
1848  *      If object is non-NULL, the ref count must be bumped by the caller
1849  *      prior to making the call, to account for the new entry.
1850  */
1851 int
1852 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1853             vm_offset_t *addr,  /* IN/OUT */
1854             vm_size_t length, vm_offset_t max_addr, int find_space,
1855             vm_prot_t prot, vm_prot_t max, int cow)
1856 {
1857         vm_offset_t alignment, curr_min_addr, min_addr;
1858         int gap, pidx, rv, try;
1859         bool cluster, en_aslr, update_anon;
1860
1861         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1862             object == NULL,
1863             ("vm_map_find: non-NULL backing object for stack"));
1864         MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
1865             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
1866         if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1867             (object->flags & OBJ_COLORED) == 0))
1868                 find_space = VMFS_ANY_SPACE;
1869         if (find_space >> 8 != 0) {
1870                 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1871                 alignment = (vm_offset_t)1 << (find_space >> 8);
1872         } else
1873                 alignment = 0;
1874         en_aslr = (map->flags & MAP_ASLR) != 0;
1875         update_anon = cluster = clustering_anon_allowed(*addr) &&
1876             (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
1877             find_space != VMFS_NO_SPACE && object == NULL &&
1878             (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
1879             MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
1880         curr_min_addr = min_addr = *addr;
1881         if (en_aslr && min_addr == 0 && !cluster &&
1882             find_space != VMFS_NO_SPACE &&
1883             (map->flags & MAP_ASLR_IGNSTART) != 0)
1884                 curr_min_addr = min_addr = vm_map_min(map);
1885         try = 0;
1886         vm_map_lock(map);
1887         if (cluster) {
1888                 curr_min_addr = map->anon_loc;
1889                 if (curr_min_addr == 0)
1890                         cluster = false;
1891         }
1892         if (find_space != VMFS_NO_SPACE) {
1893                 KASSERT(find_space == VMFS_ANY_SPACE ||
1894                     find_space == VMFS_OPTIMAL_SPACE ||
1895                     find_space == VMFS_SUPER_SPACE ||
1896                     alignment != 0, ("unexpected VMFS flag"));
1897 again:
1898                 /*
1899                  * When creating an anonymous mapping, try clustering
1900                  * with an existing anonymous mapping first.
1901                  *
1902                  * We make up to two attempts to find address space
1903                  * for a given find_space value. The first attempt may
1904                  * apply randomization or may cluster with an existing
1905                  * anonymous mapping. If this first attempt fails,
1906                  * perform a first-fit search of the available address
1907                  * space.
1908                  *
1909                  * If all tries failed, and find_space is
1910          * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE.
1911                  * Again enable clustering and randomization.
1912                  */
1913                 try++;
1914                 MPASS(try <= 2);
1915
1916                 if (try == 2) {
1917                         /*
1918                          * Second try: we failed either to find a
1919                          * suitable region for randomizing the
1920                          * allocation, or to cluster with an existing
1921                          * mapping.  Retry with free run.
1922                          */
1923                         curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
1924                             vm_map_min(map) : min_addr;
1925                         atomic_add_long(&aslr_restarts, 1);
1926                 }
1927
1928                 if (try == 1 && en_aslr && !cluster) {
1929                         /*
1930                          * Find space for allocation, including
1931                          * gap needed for later randomization.
1932                          */
1933                         pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
1934                             (find_space == VMFS_SUPER_SPACE || find_space ==
1935                             VMFS_OPTIMAL_SPACE) ? 1 : 0;
1936                         gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
1937                             (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
1938                             aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
1939                         *addr = vm_map_findspace(map, curr_min_addr,
1940                             length + gap * pagesizes[pidx]);
1941                         if (*addr + length + gap * pagesizes[pidx] >
1942                             vm_map_max(map))
1943                                 goto again;
1944                         /* And randomize the start address. */
1945                         *addr += (arc4random() % gap) * pagesizes[pidx];
1946                         if (max_addr != 0 && *addr + length > max_addr)
1947                                 goto again;
1948                 } else {
1949                         *addr = vm_map_findspace(map, curr_min_addr, length);
1950                         if (*addr + length > vm_map_max(map) ||
1951                             (max_addr != 0 && *addr + length > max_addr)) {
1952                                 if (cluster) {
1953                                         cluster = false;
1954                                         MPASS(try == 1);
1955                                         goto again;
1956                                 }
1957                                 rv = KERN_NO_SPACE;
1958                                 goto done;
1959                         }
1960                 }
1961
1962                 if (find_space != VMFS_ANY_SPACE &&
1963                     (rv = vm_map_alignspace(map, object, offset, addr, length,
1964                     max_addr, alignment)) != KERN_SUCCESS) {
1965                         if (find_space == VMFS_OPTIMAL_SPACE) {
1966                                 find_space = VMFS_ANY_SPACE;
1967                                 curr_min_addr = min_addr;
1968                                 cluster = update_anon;
1969                                 try = 0;
1970                                 goto again;
1971                         }
1972                         goto done;
1973                 }
1974         } else if ((cow & MAP_REMAP) != 0) {
1975                 if (*addr < vm_map_min(map) ||
1976                     *addr + length > vm_map_max(map) ||
1977                     *addr + length <= length) {
1978                         rv = KERN_INVALID_ADDRESS;
1979                         goto done;
1980                 }
1981                 vm_map_delete(map, *addr, *addr + length);
1982         }
1983         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1984                 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
1985                     max, cow);
1986         } else {
1987                 rv = vm_map_insert(map, object, offset, *addr, *addr + length,
1988                     prot, max, cow);
1989         }
1990         if (rv == KERN_SUCCESS && update_anon)
1991                 map->anon_loc = *addr + length;
1992 done:
1993         vm_map_unlock(map);
1994         return (rv);
1995 }
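
/*
 * A minimal caller sketch (hypothetical, illustrative only): ask for
 * "length" bytes of anonymous space anywhere in the map, letting the
 * clustering/ASLR policy above pick the placement.  vm_map_find() takes
 * the map lock itself.
 */
static int
example_find_anywhere(vm_map_t map, vm_size_t length, vm_offset_t *addr)
{

        *addr = 0;      /* No hint; clustering_anon_allowed() permits clustering. */
        return (vm_map_find(map, NULL, 0, addr, length, 0, VMFS_ANY_SPACE,
            VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}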
1996
1997 /*
1998  *      vm_map_find_min() is a variant of vm_map_find() that takes an
1999  *      additional parameter (min_addr) and treats the given address
2000  *      (*addr) differently.  Specifically, it treats *addr as a hint
2001  *      and not as the minimum address where the mapping is created.
2002  *
2003  *      This function works in two phases.  First, it tries to
2004  *      allocate above the hint.  If that fails and the hint is
2005  *      greater than min_addr, it performs a second pass, replacing
2006  *      the hint with min_addr as the minimum address for the
2007  *      allocation.
2008  */
2009 int
2010 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2011     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
2012     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2013     int cow)
2014 {
2015         vm_offset_t hint;
2016         int rv;
2017
2018         hint = *addr;
2019         for (;;) {
2020                 rv = vm_map_find(map, object, offset, addr, length, max_addr,
2021                     find_space, prot, max, cow);
2022                 if (rv == KERN_SUCCESS || min_addr >= hint)
2023                         return (rv);
2024                 *addr = hint = min_addr;
2025         }
2026 }
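
/*
 * A minimal caller sketch (hypothetical, illustrative only): place a mapping
 * at or above "hint" if possible, otherwise retry from "min_addr" as
 * described above.
 */
static int
example_find_with_hint(vm_map_t map, vm_offset_t hint, vm_offset_t min_addr,
    vm_size_t length, vm_offset_t *addr)
{

        *addr = hint;
        return (vm_map_find_min(map, NULL, 0, addr, length, min_addr, 0,
            VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0));
}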
2027
2028 /*
2029  * A map entry with any of the following flags set must not be merged with
2030  * another entry.
2031  */
2032 #define MAP_ENTRY_NOMERGE_MASK  (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2033             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
2034
2035 static bool
2036 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2037 {
2038
2039         KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2040             (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2041             ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2042             prev, entry));
2043         return (prev->end == entry->start &&
2044             prev->object.vm_object == entry->object.vm_object &&
2045             (prev->object.vm_object == NULL ||
2046             prev->offset + (prev->end - prev->start) == entry->offset) &&
2047             prev->eflags == entry->eflags &&
2048             prev->protection == entry->protection &&
2049             prev->max_protection == entry->max_protection &&
2050             prev->inheritance == entry->inheritance &&
2051             prev->wired_count == entry->wired_count &&
2052             prev->cred == entry->cred);
2053 }
2054
2055 static void
2056 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2057 {
2058
2059         /*
2060          * If the backing object is a vnode object, vm_object_deallocate()
2061          * calls vrele().  However, vrele() does not lock the vnode because
2062          * the vnode has additional references.  Thus, the map lock can be
2063          * kept without causing a lock-order reversal with the vnode lock.
2064          *
2065          * Since we count the number of virtual page mappings in
2066          * object->un_pager.vnp.writemappings, the writemappings value
2067          * should not be adjusted when the entry is disposed of.
2068          */
2069         if (entry->object.vm_object != NULL)
2070                 vm_object_deallocate(entry->object.vm_object);
2071         if (entry->cred != NULL)
2072                 crfree(entry->cred);
2073         vm_map_entry_dispose(map, entry);
2074 }
2075
2076 /*
2077  *      vm_map_try_merge_entries:
2078  *
2079  *      Compare the given map entry to its predecessor, and merge the predecessor
2080  *      into it if possible.  The entry remains valid, and may be extended.
2081  *      The predecessor may be deleted.
2082  *
2083  *      The map must be locked.
2084  */
2085 void
2086 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev, vm_map_entry_t entry)
2087 {
2088
2089         VM_MAP_ASSERT_LOCKED(map);
2090         if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2091             vm_map_mergeable_neighbors(prev, entry)) {
2092                 vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT);
2093                 vm_map_merged_neighbor_dispose(map, prev);
2094         }
2095 }
2096
2097 /*
2098  *      vm_map_entry_back:
2099  *
2100  *      Allocate an object to back a map entry.
2101  */
2102 static inline void
2103 vm_map_entry_back(vm_map_entry_t entry)
2104 {
2105         vm_object_t object;
2106
2107         KASSERT(entry->object.vm_object == NULL,
2108             ("map entry %p has backing object", entry));
2109         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2110             ("map entry %p is a submap", entry));
2111         object = vm_object_allocate(OBJT_DEFAULT,
2112             atop(entry->end - entry->start));
2113         entry->object.vm_object = object;
2114         entry->offset = 0;
2115         if (entry->cred != NULL) {
2116                 object->cred = entry->cred;
2117                 object->charge = entry->end - entry->start;
2118                 entry->cred = NULL;
2119         }
2120 }
2121
2122 /*
2123  *      vm_map_entry_charge_object:
2124  *
2125  *      If there is no object backing this entry, create one.  Otherwise, if
2126  *      the entry has cred, give it to the backing object.
2127  */
2128 static inline void
2129 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2130 {
2131
2132         VM_MAP_ASSERT_LOCKED(map);
2133         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2134             ("map entry %p is a submap", entry));
2135         if (entry->object.vm_object == NULL && !map->system_map &&
2136             (entry->eflags & MAP_ENTRY_GUARD) == 0)
2137                 vm_map_entry_back(entry);
2138         else if (entry->object.vm_object != NULL &&
2139             ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2140             entry->cred != NULL) {
2141                 VM_OBJECT_WLOCK(entry->object.vm_object);
2142                 KASSERT(entry->object.vm_object->cred == NULL,
2143                     ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2144                 entry->object.vm_object->cred = entry->cred;
2145                 entry->object.vm_object->charge = entry->end - entry->start;
2146                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2147                 entry->cred = NULL;
2148         }
2149 }
2150
2151 /*
2152  *      vm_map_clip_start:      [ internal use only ]
2153  *
2154  *      Asserts that the given entry begins at or after
2155  *      the specified address; if necessary,
2156  *      it splits the entry into two.
2157  */
2158 #define vm_map_clip_start(map, entry, startaddr) \
2159 { \
2160         if (startaddr > entry->start) \
2161                 _vm_map_clip_start(map, entry, startaddr); \
2162 }
2163
2164 /*
2165  *      This routine is called only when it is known that
2166  *      the entry must be split.
2167  */
2168 static void
2169 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
2170 {
2171         vm_map_entry_t new_entry;
2172
2173         VM_MAP_ASSERT_LOCKED(map);
2174         KASSERT(entry->end > start && entry->start < start,
2175             ("_vm_map_clip_start: invalid clip of entry %p", entry));
2176
2177         /*
2178          * Create a backing object now, if none exists, so that more individual
2179          * objects won't be created after the map entry is split.
2180          */
2181         vm_map_entry_charge_object(map, entry);
2182
2183         /* Clone the entry. */
2184         new_entry = vm_map_entry_create(map);
2185         *new_entry = *entry;
2186
2187         /*
2188          * Split off the front portion.  Insert the new entry BEFORE this one,
2189          * so that this entry has the specified starting address.
2190          */
2191         new_entry->end = start;
2192         entry->offset += (start - entry->start);
2193         entry->start = start;
2194         if (new_entry->cred != NULL)
2195                 crhold(entry->cred);
2196
2197         vm_map_entry_link(map, new_entry);
2198
2199         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2200                 vm_object_reference(new_entry->object.vm_object);
2201                 vm_map_entry_set_vnode_text(new_entry, true);
2202                 /*
2203                  * The object->un_pager.vnp.writemappings value for the
2204                  * object backing a MAP_ENTRY_WRITECNT entry is kept
2205                  * as is here.  The virtual pages are re-distributed
2206                  * among the clipped entries, so the sum is left the
2207                  * same.
2208                  */
2209         }
2210 }
2211
2212 /*
2213  *      vm_map_clip_end:        [ internal use only ]
2214  *
2215  *      Asserts that the given entry ends at or before
2216  *      the specified address; if necessary,
2217  *      it splits the entry into two.
2218  */
2219 #define vm_map_clip_end(map, entry, endaddr) \
2220 { \
2221         if ((endaddr) < (entry->end)) \
2222                 _vm_map_clip_end((map), (entry), (endaddr)); \
2223 }
2224
2225 /*
2226  *      This routine is called only when it is known that
2227  *      the entry must be split.
2228  */
2229 static void
2230 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
2231 {
2232         vm_map_entry_t new_entry;
2233
2234         VM_MAP_ASSERT_LOCKED(map);
2235         KASSERT(entry->start < end && entry->end > end,
2236             ("_vm_map_clip_end: invalid clip of entry %p", entry));
2237
2238         /*
2239          * Create a backing object now, if none exists, so that more individual
2240          * objects won't be created after the map entry is split.
2241          */
2242         vm_map_entry_charge_object(map, entry);
2243
2244         /* Clone the entry. */
2245         new_entry = vm_map_entry_create(map);
2246         *new_entry = *entry;
2247
2248         /*
2249          * Split off the back portion.  Insert the new entry AFTER this one,
2250          * so that this entry has the specified ending address.
2251          */
2252         new_entry->start = entry->end = end;
2253         new_entry->offset += (end - entry->start);
2254         if (new_entry->cred != NULL)
2255                 crhold(entry->cred);
2256
2257         vm_map_entry_link(map, new_entry);
2258
2259         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2260                 vm_object_reference(new_entry->object.vm_object);
2261                 vm_map_entry_set_vnode_text(new_entry, true);
2262         }
2263 }
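
/*
 * The clipping helpers above combine with vm_map_try_merge_entries() into
 * the pattern used by the range operations later in this file
 * (vm_map_protect(), vm_map_madvise(), vm_map_inherit()).  A hypothetical
 * skeleton, assuming the map is write-locked and the per-entry work is
 * supplied by the caller:
 */
static void
example_range_op(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        vm_map_entry_t entry;

        VM_MAP_ASSERT_LOCKED(map);
        if (vm_map_lookup_entry(map, start, &entry))
                vm_map_clip_start(map, entry, start);
        else
                entry = entry->next;
        for (; entry->start < end; entry = entry->next) {
                vm_map_clip_end(map, entry, end);
                /* ... per-entry modification would go here ... */
                vm_map_try_merge_entries(map, entry->prev, entry);
        }
        vm_map_try_merge_entries(map, entry->prev, entry);
}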
2264
2265 /*
2266  *      vm_map_submap:          [ kernel use only ]
2267  *
2268  *      Mark the given range as handled by a subordinate map.
2269  *
2270  *      This range must have been created with vm_map_find,
2271  *      and no other operations may have been performed on this
2272  *      range prior to calling vm_map_submap.
2273  *
2274  *      Only a limited number of operations can be performed
2275  *      within this range after calling vm_map_submap:
2276  *              vm_fault
2277  *      [Don't try vm_map_copy!]
2278  *
2279  *      To remove a submapping, one must first remove the
2280  *      range from the superior map, and then destroy the
2281  *      submap (if desired).  [Better yet, don't try it.]
2282  */
2283 int
2284 vm_map_submap(
2285         vm_map_t map,
2286         vm_offset_t start,
2287         vm_offset_t end,
2288         vm_map_t submap)
2289 {
2290         vm_map_entry_t entry;
2291         int result;
2292
2293         result = KERN_INVALID_ARGUMENT;
2294
2295         vm_map_lock(submap);
2296         submap->flags |= MAP_IS_SUB_MAP;
2297         vm_map_unlock(submap);
2298
2299         vm_map_lock(map);
2300
2301         VM_MAP_RANGE_CHECK(map, start, end);
2302
2303         if (vm_map_lookup_entry(map, start, &entry)) {
2304                 vm_map_clip_start(map, entry, start);
2305         } else
2306                 entry = entry->next;
2307
2308         vm_map_clip_end(map, entry, end);
2309
2310         if ((entry->start == start) && (entry->end == end) &&
2311             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2312             (entry->object.vm_object == NULL)) {
2313                 entry->object.sub_map = submap;
2314                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2315                 result = KERN_SUCCESS;
2316         }
2317         vm_map_unlock(map);
2318
2319         if (result != KERN_SUCCESS) {
2320                 vm_map_lock(submap);
2321                 submap->flags &= ~MAP_IS_SUB_MAP;
2322                 vm_map_unlock(submap);
2323         }
2324         return (result);
2325 }
2326
2327 /*
2328  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2329  */
2330 #define MAX_INIT_PT     96
2331
2332 /*
2333  *      vm_map_pmap_enter:
2334  *
2335  *      Preload the specified map's pmap with mappings to the specified
2336  *      object's memory-resident pages.  No further physical pages are
2337  *      allocated, and no further virtual pages are retrieved from secondary
2338  *      storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
2339  *      limited number of page mappings are created at the low-end of the
2340  *      specified address range.  (For this purpose, a superpage mapping
2341  *      counts as one page mapping.)  Otherwise, all resident pages within
2342  *      the specified address range are mapped.
2343  */
2344 static void
2345 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2346     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2347 {
2348         vm_offset_t start;
2349         vm_page_t p, p_start;
2350         vm_pindex_t mask, psize, threshold, tmpidx;
2351
2352         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2353                 return;
2354         VM_OBJECT_RLOCK(object);
2355         if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2356                 VM_OBJECT_RUNLOCK(object);
2357                 VM_OBJECT_WLOCK(object);
2358                 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2359                         pmap_object_init_pt(map->pmap, addr, object, pindex,
2360                             size);
2361                         VM_OBJECT_WUNLOCK(object);
2362                         return;
2363                 }
2364                 VM_OBJECT_LOCK_DOWNGRADE(object);
2365         }
2366
2367         psize = atop(size);
2368         if (psize + pindex > object->size) {
2369                 if (object->size < pindex) {
2370                         VM_OBJECT_RUNLOCK(object);
2371                         return;
2372                 }
2373                 psize = object->size - pindex;
2374         }
2375
2376         start = 0;
2377         p_start = NULL;
2378         threshold = MAX_INIT_PT;
2379
2380         p = vm_page_find_least(object, pindex);
2381         /*
2382          * Assert: the variable p is either (1) the page with the
2383          * least pindex greater than or equal to the parameter pindex
2384          * or (2) NULL.
2385          */
2386         for (;
2387              p != NULL && (tmpidx = p->pindex - pindex) < psize;
2388              p = TAILQ_NEXT(p, listq)) {
2389                 /*
2390                  * Don't allow madvise to blow away our really free
2391                  * pages by allocating pv entries.
2392                  */
2393                 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2394                     vm_page_count_severe()) ||
2395                     ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2396                     tmpidx >= threshold)) {
2397                         psize = tmpidx;
2398                         break;
2399                 }
2400                 if (p->valid == VM_PAGE_BITS_ALL) {
2401                         if (p_start == NULL) {
2402                                 start = addr + ptoa(tmpidx);
2403                                 p_start = p;
2404                         }
2405                         /* Jump ahead if a superpage mapping is possible. */
2406                         if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2407                             (pagesizes[p->psind] - 1)) == 0) {
2408                                 mask = atop(pagesizes[p->psind]) - 1;
2409                                 if (tmpidx + mask < psize &&
2410                                     vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2411                                         p += mask;
2412                                         threshold += mask;
2413                                 }
2414                         }
2415                 } else if (p_start != NULL) {
2416                         pmap_enter_object(map->pmap, start, addr +
2417                             ptoa(tmpidx), p_start, prot);
2418                         p_start = NULL;
2419                 }
2420         }
2421         if (p_start != NULL)
2422                 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2423                     p_start, prot);
2424         VM_OBJECT_RUNLOCK(object);
2425 }
2426
2427 /*
2428  *      vm_map_protect:
2429  *
2430  *      Sets the protection of the specified address
2431  *      region in the target map.  If "set_max" is
2432  *      specified, the maximum protection is to be set;
2433  *      otherwise, only the current protection is affected.
2434  */
2435 int
2436 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2437                vm_prot_t new_prot, boolean_t set_max)
2438 {
2439         vm_map_entry_t current, entry, in_tran;
2440         vm_object_t obj;
2441         struct ucred *cred;
2442         vm_prot_t old_prot;
2443         int rv;
2444
2445         if (start == end)
2446                 return (KERN_SUCCESS);
2447
2448 again:
2449         in_tran = NULL;
2450         vm_map_lock(map);
2451
2452         /*
2453          * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
2454          * need to fault pages into the map and will drop the map lock while
2455          * doing so, and the VM object may end up in an inconsistent state if we
2456          * update the protection on the map entry in between faults.
2457          */
2458         vm_map_wait_busy(map);
2459
2460         VM_MAP_RANGE_CHECK(map, start, end);
2461
2462         if (!vm_map_lookup_entry(map, start, &entry))
2463                 entry = entry->next;
2464
2465         /*
2466          * Make a first pass to check for protection violations.
2467          */
2468         for (current = entry; current->start < end; current = current->next) {
2469                 if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2470                         continue;
2471                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2472                         vm_map_unlock(map);
2473                         return (KERN_INVALID_ARGUMENT);
2474                 }
2475                 if ((new_prot & current->max_protection) != new_prot) {
2476                         vm_map_unlock(map);
2477                         return (KERN_PROTECTION_FAILURE);
2478                 }
2479                 if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2480                         in_tran = current;
2481         }
2482
2483         /*
2484          * Postpone the operation until all in-transition map entries
2485          * are stabilized.  An in-transition entry might already have its
2486          * pages wired and its wired_count incremented, but not yet have
2487          * MAP_ENTRY_USER_WIRED set, and be visible to other
2488          * threads because the map lock is dropped.  In this case we
2489          * would miss our call to vm_fault_copy_entry().
2490          */
2491         if (in_tran != NULL) {
2492                 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2493                 vm_map_unlock_and_wait(map, 0);
2494                 goto again;
2495         }
2496
2497         /*
2498          * Before changing the protections, try to reserve swap space for any
2499          * private (i.e., copy-on-write) mappings that are transitioning from
2500          * read-only to read/write access.  If a reservation fails, break out
2501          * of this loop early and let the next loop simplify the entries, since
2502          * some may now be mergeable.
2503          */
2504         rv = KERN_SUCCESS;
2505         vm_map_clip_start(map, entry, start);
2506         for (current = entry; current->start < end; current = current->next) {
2507
2508                 vm_map_clip_end(map, current, end);
2509
2510                 if (set_max ||
2511                     ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2512                     ENTRY_CHARGED(current) ||
2513                     (current->eflags & MAP_ENTRY_GUARD) != 0) {
2514                         continue;
2515                 }
2516
2517                 cred = curthread->td_ucred;
2518                 obj = current->object.vm_object;
2519
2520                 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2521                         if (!swap_reserve(current->end - current->start)) {
2522                                 rv = KERN_RESOURCE_SHORTAGE;
2523                                 end = current->end;
2524                                 break;
2525                         }
2526                         crhold(cred);
2527                         current->cred = cred;
2528                         continue;
2529                 }
2530
2531                 VM_OBJECT_WLOCK(obj);
2532                 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2533                         VM_OBJECT_WUNLOCK(obj);
2534                         continue;
2535                 }
2536
2537                 /*
2538                  * Charge for the whole object allocation now, since
2539                  * we cannot distinguish between non-charged and
2540                  * charged clipped mapping of the same object later.
2541                  */
2542                 KASSERT(obj->charge == 0,
2543                     ("vm_map_protect: object %p overcharged (entry %p)",
2544                     obj, current));
2545                 if (!swap_reserve(ptoa(obj->size))) {
2546                         VM_OBJECT_WUNLOCK(obj);
2547                         rv = KERN_RESOURCE_SHORTAGE;
2548                         end = current->end;
2549                         break;
2550                 }
2551
2552                 crhold(cred);
2553                 obj->cred = cred;
2554                 obj->charge = ptoa(obj->size);
2555                 VM_OBJECT_WUNLOCK(obj);
2556         }
2557
2558         /*
2559          * If enough swap space was available, go back and fix up protections.
2560          * Otherwise, just simplify entries, since some may have been modified.
2561          * [Note that clipping is not necessary the second time.]
2562          */
2563         for (current = entry; current->start < end;
2564             vm_map_try_merge_entries(map, current->prev, current),
2565             current = current->next) {
2566                 if (rv != KERN_SUCCESS ||
2567                     (current->eflags & MAP_ENTRY_GUARD) != 0)
2568                         continue;
2569
2570                 old_prot = current->protection;
2571
2572                 if (set_max)
2573                         current->protection =
2574                             (current->max_protection = new_prot) &
2575                             old_prot;
2576                 else
2577                         current->protection = new_prot;
2578
2579                 /*
2580                  * For user wired map entries, the normal lazy evaluation of
2581                  * write access upgrades through soft page faults is
2582                  * undesirable.  Instead, immediately copy any pages that are
2583                  * copy-on-write and enable write access in the physical map.
2584                  */
2585                 if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2586                     (current->protection & VM_PROT_WRITE) != 0 &&
2587                     (old_prot & VM_PROT_WRITE) == 0)
2588                         vm_fault_copy_entry(map, map, current, current, NULL);
2589
2590                 /*
2591                  * When restricting access, update the physical map.  Worry
2592                  * about copy-on-write here.
2593                  */
2594                 if ((old_prot & ~current->protection) != 0) {
2595 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2596                                                         VM_PROT_ALL)
2597                         pmap_protect(map->pmap, current->start,
2598                             current->end,
2599                             current->protection & MASK(current));
2600 #undef  MASK
2601                 }
2602         }
2603         vm_map_try_merge_entries(map, current->prev, current);
2604         vm_map_unlock(map);
2605         return (rv);
2606 }
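
/*
 * A minimal caller sketch (hypothetical, illustrative only): drop write
 * access on a range.  With set_max == FALSE only the current protection
 * changes; with TRUE the new value becomes the maximum protection and the
 * current protection is intersected with it, as implemented above.
 */
static int
example_revoke_write(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        return (vm_map_protect(map, start, end, VM_PROT_READ, FALSE));
}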
2607
2608 /*
2609  *      vm_map_madvise:
2610  *
2611  *      This routine traverses a process's map handling the madvise
2612  *      system call.  Advisories are classified as either those affecting
2613  *      the vm_map_entry structure, or those affecting the underlying
2614  *      objects.
2615  */
2616 int
2617 vm_map_madvise(
2618         vm_map_t map,
2619         vm_offset_t start,
2620         vm_offset_t end,
2621         int behav)
2622 {
2623         vm_map_entry_t current, entry;
2624         bool modify_map;
2625
2626         /*
2627          * Some madvise calls directly modify the vm_map_entry, in which case
2628          * we need to use an exclusive lock on the map and we need to perform
2629          * various clipping operations.  Otherwise we only need a read-lock
2630          * on the map.
2631          */
2632         switch (behav) {
2633         case MADV_NORMAL:
2634         case MADV_SEQUENTIAL:
2635         case MADV_RANDOM:
2636         case MADV_NOSYNC:
2637         case MADV_AUTOSYNC:
2638         case MADV_NOCORE:
2639         case MADV_CORE:
2640                 if (start == end)
2641                         return (0);
2642                 modify_map = true;
2643                 vm_map_lock(map);
2644                 break;
2645         case MADV_WILLNEED:
2646         case MADV_DONTNEED:
2647         case MADV_FREE:
2648                 if (start == end)
2649                         return (0);
2650                 modify_map = false;
2651                 vm_map_lock_read(map);
2652                 break;
2653         default:
2654                 return (EINVAL);
2655         }
2656
2657         /*
2658          * Locate starting entry and clip if necessary.
2659          */
2660         VM_MAP_RANGE_CHECK(map, start, end);
2661
2662         if (vm_map_lookup_entry(map, start, &entry)) {
2663                 if (modify_map)
2664                         vm_map_clip_start(map, entry, start);
2665         } else {
2666                 entry = entry->next;
2667         }
2668
2669         if (modify_map) {
2670                 /*
2671                  * madvise behaviors that are implemented in the vm_map_entry.
2672                  *
2673                  * We clip the vm_map_entry so that behavioral changes are
2674                  * limited to the specified address range.
2675                  */
2676                 for (current = entry; current->start < end;
2677                     current = current->next) {
2678                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2679                                 continue;
2680
2681                         vm_map_clip_end(map, current, end);
2682
2683                         switch (behav) {
2684                         case MADV_NORMAL:
2685                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2686                                 break;
2687                         case MADV_SEQUENTIAL:
2688                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2689                                 break;
2690                         case MADV_RANDOM:
2691                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2692                                 break;
2693                         case MADV_NOSYNC:
2694                                 current->eflags |= MAP_ENTRY_NOSYNC;
2695                                 break;
2696                         case MADV_AUTOSYNC:
2697                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2698                                 break;
2699                         case MADV_NOCORE:
2700                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2701                                 break;
2702                         case MADV_CORE:
2703                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2704                                 break;
2705                         default:
2706                                 break;
2707                         }
2708                         vm_map_try_merge_entries(map, current->prev, current);
2709                 }
2710                 vm_map_try_merge_entries(map, current->prev, current);
2711                 vm_map_unlock(map);
2712         } else {
2713                 vm_pindex_t pstart, pend;
2714
2715                 /*
2716                  * madvise behaviors that are implemented in the underlying
2717                  * vm_object.
2718                  *
2719                  * Since we don't clip the vm_map_entry, we have to clip
2720                  * the vm_object pindex and count.
2721                  */
2722                 for (current = entry; current->start < end;
2723                     current = current->next) {
2724                         vm_offset_t useEnd, useStart;
2725
2726                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2727                                 continue;
2728
2729                         /*
2730                          * MADV_FREE would otherwise rewind time to
2731                          * the creation of the shadow object.  Because
2732                          * we hold the VM map read-locked, neither the
2733                          * entry's object nor the presence of a
2734                          * backing object can change.
2735                          */
2736                         if (behav == MADV_FREE &&
2737                             current->object.vm_object != NULL &&
2738                             current->object.vm_object->backing_object != NULL)
2739                                 continue;
2740
2741                         pstart = OFF_TO_IDX(current->offset);
2742                         pend = pstart + atop(current->end - current->start);
2743                         useStart = current->start;
2744                         useEnd = current->end;
2745
2746                         if (current->start < start) {
2747                                 pstart += atop(start - current->start);
2748                                 useStart = start;
2749                         }
2750                         if (current->end > end) {
2751                                 pend -= atop(current->end - end);
2752                                 useEnd = end;
2753                         }
2754
2755                         if (pstart >= pend)
2756                                 continue;
2757
2758                         /*
2759                          * Perform the pmap_advise() before clearing
2760                          * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2761                          * concurrent pmap operation, such as pmap_remove(),
2762                          * could clear a reference in the pmap and set
2763                          * PGA_REFERENCED on the page before the pmap_advise()
2764                          * had completed.  Consequently, the page would appear
2765                          * referenced based upon an old reference that
2766                          * occurred before this pmap_advise() ran.
2767                          */
2768                         if (behav == MADV_DONTNEED || behav == MADV_FREE)
2769                                 pmap_advise(map->pmap, useStart, useEnd,
2770                                     behav);
2771
2772                         vm_object_madvise(current->object.vm_object, pstart,
2773                             pend, behav);
2774
2775                         /*
2776                          * Pre-populate paging structures in the
2777                          * WILLNEED case.  For wired entries, the
2778                          * paging structures are already populated.
2779                          */
2780                         if (behav == MADV_WILLNEED &&
2781                             current->wired_count == 0) {
2782                                 vm_map_pmap_enter(map,
2783                                     useStart,
2784                                     current->protection,
2785                                     current->object.vm_object,
2786                                     pstart,
2787                                     ptoa(pend - pstart),
2788                                     MAP_PREFAULT_MADVISE
2789                                 );
2790                         }
2791                 }
2792                 vm_map_unlock_read(map);
2793         }
2794         return (0);
2795 }
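
/*
 * Illustrative sketch (not compiled): the function above is the advice
 * dispatcher behind madvise(2).  A hypothetical in-kernel caller is shown
 * below; the helper name and its proc/addr/len parameters are invented for
 * exposition, and vm_map_madvise()'s prototype is assumed to be the usual
 * (map, start, end, behav).
 */
#if 0
static int
example_advise_willneed(struct proc *p, vm_offset_t addr, vm_size_t len)
{

        /* Page-align the range, as the system call layer would. */
        return (vm_map_madvise(&p->p_vmspace->vm_map, trunc_page(addr),
            round_page(addr + len), MADV_WILLNEED));
}
#endif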
2796
2797
2798 /*
2799  *      vm_map_inherit:
2800  *
2801  *      Sets the inheritance of the specified address
2802  *      range in the target map.  Inheritance
2803  *      affects how the map will be shared with
2804  *      child maps at the time of vmspace_fork.
2805  */
2806 int
2807 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2808                vm_inherit_t new_inheritance)
2809 {
2810         vm_map_entry_t entry;
2811         vm_map_entry_t temp_entry;
2812
2813         switch (new_inheritance) {
2814         case VM_INHERIT_NONE:
2815         case VM_INHERIT_COPY:
2816         case VM_INHERIT_SHARE:
2817         case VM_INHERIT_ZERO:
2818                 break;
2819         default:
2820                 return (KERN_INVALID_ARGUMENT);
2821         }
2822         if (start == end)
2823                 return (KERN_SUCCESS);
2824         vm_map_lock(map);
2825         VM_MAP_RANGE_CHECK(map, start, end);
2826         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2827                 entry = temp_entry;
2828                 vm_map_clip_start(map, entry, start);
2829         } else
2830                 entry = temp_entry->next;
2831         while (entry->start < end) {
2832                 vm_map_clip_end(map, entry, end);
2833                 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2834                     new_inheritance != VM_INHERIT_ZERO)
2835                         entry->inheritance = new_inheritance;
2836                 vm_map_try_merge_entries(map, entry->prev, entry);
2837                 entry = entry->next;
2838         }
2839         vm_map_try_merge_entries(map, entry->prev, entry);
2840         vm_map_unlock(map);
2841         return (KERN_SUCCESS);
2842 }
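
/*
 * Illustrative sketch (not compiled): vm_map_inherit() above backs
 * minherit(2).  A hypothetical caller that excludes a range from being
 * inherited across fork is shown below; the helper name and parameters are
 * invented for exposition.
 */
#if 0
static int
example_inherit_none(struct proc *p, vm_offset_t addr, vm_size_t len)
{

        /* Returns a KERN_* status, e.g. KERN_INVALID_ARGUMENT. */
        return (vm_map_inherit(&p->p_vmspace->vm_map, trunc_page(addr),
            round_page(addr + len), VM_INHERIT_NONE));
}
#endif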
2843
2844 /*
2845  *      vm_map_entry_in_transition:
2846  *
2847  *      Release the map lock, and sleep until the entry is no longer in
2848  *      transition.  Wake up and reacquire the map lock.  If the map changed
2849  *      while another held the lock, look up a possibly-changed entry at or
2850  *      after the 'start' position of the old entry.
2851  */
2852 static vm_map_entry_t
2853 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
2854     vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
2855 {
2856         vm_map_entry_t entry;
2857         vm_offset_t start;
2858         u_int last_timestamp;
2859
2860         VM_MAP_ASSERT_LOCKED(map);
2861         KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2862             ("not in-transition map entry %p", in_entry));
2863         /*
2864          * We have not yet clipped the entry.
2865          */
2866         start = MAX(in_start, in_entry->start);
2867         in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2868         last_timestamp = map->timestamp;
2869         if (vm_map_unlock_and_wait(map, 0)) {
2870                 /*
2871                  * Allow interruption of user wiring/unwiring?
2872                  */
2873         }
2874         vm_map_lock(map);
2875         if (last_timestamp + 1 == map->timestamp)
2876                 return (in_entry);
2877
2878         /*
2879          * Look again for the entry because the map was modified while it was
2880          * unlocked.  Specifically, the entry may have been clipped, merged, or
2881          * deleted.
2882          */
2883         if (!vm_map_lookup_entry(map, start, &entry)) {
2884                 if (!holes_ok) {
2885                         *io_end = start;
2886                         return (NULL);
2887                 }
2888                 entry = entry->next;
2889         }
2890         return (entry);
2891 }
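
/*
 * Illustrative sketch (not compiled): the timestamp convention behind the
 * "last_timestamp + 1 == map->timestamp" test above.  Each exclusive lock
 * acquisition is assumed to advance map->timestamp, so the "+ 1" accounts
 * for our own relock; any other value means another thread held the lock
 * in between.  The helper name and the 'cached' parameter are invented for
 * exposition.
 */
#if 0
static vm_map_entry_t
example_relock_lookup(vm_map_t map, vm_offset_t start, vm_map_entry_t cached)
{
        vm_map_entry_t entry;
        u_int saved_ts;

        VM_MAP_ASSERT_LOCKED(map);
        saved_ts = map->timestamp;
        vm_map_unlock(map);
        /* ... sleep or work without the map lock held ... */
        vm_map_lock(map);
        if (saved_ts + 1 == map->timestamp)
                return (cached);        /* Only we relocked the map. */
        /*
         * Another thread modified the map: 'cached' may have been clipped,
         * merged, or deleted, so look the address up again.
         */
        if (!vm_map_lookup_entry(map, start, &entry))
                entry = entry->next;
        return (entry);
}
#endif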
2892
2893 /*
2894  *      vm_map_unwire:
2895  *
2896  *      Implements both kernel and user unwiring.
2897  */
2898 int
2899 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2900     int flags)
2901 {
2902         vm_map_entry_t entry, first_entry;
2903         int rv;
2904         bool first_iteration, holes_ok, need_wakeup, user_unwire;
2905
2906         if (start == end)
2907                 return (KERN_SUCCESS);
2908         holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
2909         user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
2910         vm_map_lock(map);
2911         VM_MAP_RANGE_CHECK(map, start, end);
2912         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2913                 if (holes_ok)
2914                         first_entry = first_entry->next;
2915                 else {
2916                         vm_map_unlock(map);
2917                         return (KERN_INVALID_ADDRESS);
2918                 }
2919         }
2920         first_iteration = true;
2921         entry = first_entry;
2922         rv = KERN_SUCCESS;
2923         while (entry->start < end) {
2924                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2925                         /*
2926                          * We have not yet clipped the entry.
2927                          */
2928                         entry = vm_map_entry_in_transition(map, start, &end,
2929                             holes_ok, entry);
2930                         if (entry == NULL) {
2931                                 if (first_iteration) {
2932                                         vm_map_unlock(map);
2933                                         return (KERN_INVALID_ADDRESS);
2934                                 }
2935                                 rv = KERN_INVALID_ADDRESS;
2936                                 break;
2937                         }
2938                         first_entry = first_iteration ? entry : NULL;
2939                         continue;
2940                 }
2941                 first_iteration = false;
2942                 vm_map_clip_start(map, entry, start);
2943                 vm_map_clip_end(map, entry, end);
2944                 /*
2945                  * Mark the entry in case the map lock is released.  (See
2946                  * above.)
2947                  */
2948                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2949                     entry->wiring_thread == NULL,
2950                     ("owned map entry %p", entry));
2951                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2952                 entry->wiring_thread = curthread;
2953                 /*
2954                  * Check the map for holes in the specified region.
2955                  * If holes_ok, skip this check.
2956                  */
2957                 if (!holes_ok &&
2958                     (entry->end < end && entry->next->start > entry->end)) {
2959                         end = entry->end;
2960                         rv = KERN_INVALID_ADDRESS;
2961                         break;
2962                 }
2963                 /*
2964                  * If system unwiring, require that the entry is system wired.
2965                  */
2966                 if (!user_unwire &&
2967                     vm_map_entry_system_wired_count(entry) == 0) {
2968                         end = entry->end;
2969                         rv = KERN_INVALID_ARGUMENT;
2970                         break;
2971                 }
2972                 entry = entry->next;
2973         }
2974         need_wakeup = false;
2975         if (first_entry == NULL &&
2976             !vm_map_lookup_entry(map, start, &first_entry)) {
2977                 KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
2978                 first_entry = first_entry->next;
2979         }
2980         for (entry = first_entry; entry->start < end; entry = entry->next) {
2981                 /*
2982                  * If holes_ok was specified, an empty
2983                  * space in the unwired region could have been mapped
2984                  * while the map lock was dropped for draining
2985                  * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2986                  * could be simultaneously wiring this new mapping
2987                  * entry.  Detect these cases and skip any entries
2988                          * marked as in transition not by us.
2989                  */
2990                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2991                     entry->wiring_thread != curthread) {
2992                         KASSERT(holes_ok,
2993                             ("vm_map_unwire: !HOLESOK and new/changed entry"));
2994                         continue;
2995                 }
2996
2997                 if (rv == KERN_SUCCESS && (!user_unwire ||
2998                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2999                         if (entry->wired_count == 1)
3000                                 vm_map_entry_unwire(map, entry);
3001                         else
3002                                 entry->wired_count--;
3003                         if (user_unwire)
3004                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3005                 }
3006                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3007                     ("vm_map_unwire: in-transition flag missing %p", entry));
3008                 KASSERT(entry->wiring_thread == curthread,
3009                     ("vm_map_unwire: alien wire %p", entry));
3010                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3011                 entry->wiring_thread = NULL;
3012                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3013                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3014                         need_wakeup = true;
3015                 }
3016                 vm_map_try_merge_entries(map, entry->prev, entry);
3017         }
3018         vm_map_try_merge_entries(map, entry->prev, entry);
3019         vm_map_unlock(map);
3020         if (need_wakeup)
3021                 vm_map_wakeup(map);
3022         return (rv);
3023 }
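
/*
 * Illustrative sketch (not compiled): vm_map_unwire() above serves both
 * kernel and munlock(2)-style user unwiring.  A hypothetical user-path
 * caller is shown below; the helper name and parameters are invented for
 * exposition.
 */
#if 0
static int
example_munlock_range(struct proc *p, vm_offset_t addr, vm_size_t len)
{

        /* Clear only user wirings and fail if the range has holes. */
        return (vm_map_unwire(&p->p_vmspace->vm_map, trunc_page(addr),
            round_page(addr + len),
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES));
}
#endif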
3024
3025 static void
3026 vm_map_wire_user_count_sub(u_long npages)
3027 {
3028
3029         atomic_subtract_long(&vm_user_wire_count, npages);
3030 }
3031
3032 static bool
3033 vm_map_wire_user_count_add(u_long npages)
3034 {
3035         u_long wired;
3036
3037         wired = vm_user_wire_count;
3038         do {
3039                 if (npages + wired > vm_page_max_user_wired)
3040                         return (false);
3041         } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3042             npages + wired));
3043
3044         return (true);
3045 }
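
/*
 * Illustrative sketch (not compiled): the lock-free "check the limit, then
 * compare-and-set" loop used by vm_map_wire_user_count_add() above, applied
 * to a hypothetical counter.  atomic_fcmpset_long() reloads the old value
 * on failure, so the limit is re-checked against the current count on every
 * retry.
 */
#if 0
static u_long example_count;
static u_long example_limit = 1024;

static bool
example_count_add(u_long n)
{
        u_long old;

        old = example_count;
        do {
                if (n + old > example_limit)
                        return (false);
        } while (!atomic_fcmpset_long(&example_count, &old, n + old));
        return (true);
}
#endif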
3046
3047 /*
3048  *      vm_map_wire_entry_failure:
3049  *
3050  *      Handle a wiring failure on the given entry.
3051  *
3052  *      The map should be locked.
3053  */
3054 static void
3055 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3056     vm_offset_t failed_addr)
3057 {
3058
3059         VM_MAP_ASSERT_LOCKED(map);
3060         KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3061             entry->wired_count == 1,
3062             ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3063         KASSERT(failed_addr < entry->end,
3064             ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3065
3066         /*
3067          * If any pages at the start of this entry were successfully wired,
3068          * then unwire them.
3069          */
3070         if (failed_addr > entry->start) {
3071                 pmap_unwire(map->pmap, entry->start, failed_addr);
3072                 vm_object_unwire(entry->object.vm_object, entry->offset,
3073                     failed_addr - entry->start, PQ_ACTIVE);
3074         }
3075
3076         /*
3077          * Assign an out-of-range value to represent the failure to wire this
3078          * entry.
3079          */
3080         entry->wired_count = -1;
3081 }
3082
3083 int
3084 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3085 {
3086         int rv;
3087
3088         vm_map_lock(map);
3089         rv = vm_map_wire_locked(map, start, end, flags);
3090         vm_map_unlock(map);
3091         return (rv);
3092 }
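
/*
 * Illustrative sketch (not compiled): vm_map_wire() above is the wrapper
 * for callers that do not already hold the map lock, e.g. an mlock(2)-style
 * path.  The helper name and parameters below are invented for exposition.
 */
#if 0
static int
example_mlock_range(struct proc *p, vm_offset_t addr, vm_size_t len)
{

        /* Wire as a user wiring and refuse ranges containing holes. */
        return (vm_map_wire(&p->p_vmspace->vm_map, trunc_page(addr),
            round_page(addr + len),
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES));
}
#endif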
3093
3094
3095 /*
3096  *      vm_map_wire_locked:
3097  *
3098  *      Implements both kernel and user wiring.  Returns with the map locked;
3099  *      the map lock may be dropped and reacquired during the call.
3100  */
3101 int
3102 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3103 {
3104         vm_map_entry_t entry, first_entry, tmp_entry;
3105         vm_offset_t faddr, saved_end, saved_start;
3106         u_long npages;
3107         u_int last_timestamp;
3108         int rv;
3109         bool first_iteration, holes_ok, need_wakeup, user_wire;
3110         vm_prot_t prot;
3111
3112         VM_MAP_ASSERT_LOCKED(map);
3113
3114         if (start == end)
3115                 return (KERN_SUCCESS);
3116         prot = 0;
3117         if (flags & VM_MAP_WIRE_WRITE)
3118                 prot |= VM_PROT_WRITE;
3119         holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3120         user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3121         VM_MAP_RANGE_CHECK(map, start, end);
3122         if (!vm_map_lookup_entry(map, start, &first_entry)) {
3123                 if (holes_ok)
3124                         first_entry = first_entry->next;
3125                 else
3126                         return (KERN_INVALID_ADDRESS);
3127         }
3128         first_iteration = true;
3129         entry = first_entry;
3130         while (entry->start < end) {
3131                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3132                         /*
3133                          * We have not yet clipped the entry.
3134                          */
3135                         entry = vm_map_entry_in_transition(map, start, &end,
3136                             holes_ok, entry);
3137                         if (entry == NULL) {
3138                                 if (first_iteration)
3139                                         return (KERN_INVALID_ADDRESS);
3140                                 rv = KERN_INVALID_ADDRESS;
3141                                 goto done;
3142                         }
3143                         first_entry = first_iteration ? entry : NULL;
3144                         continue;
3145                 }
3146                 first_iteration = false;
3147                 vm_map_clip_start(map, entry, start);
3148                 vm_map_clip_end(map, entry, end);
3149                 /*
3150                  * Mark the entry in case the map lock is released.  (See
3151                  * above.)
3152                  */
3153                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3154                     entry->wiring_thread == NULL,
3155                     ("owned map entry %p", entry));
3156                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3157                 entry->wiring_thread = curthread;
3158                 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3159                     || (entry->protection & prot) != prot) {
3160                         entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3161                         if (!holes_ok) {
3162                                 end = entry->end;
3163                                 rv = KERN_INVALID_ADDRESS;
3164                                 goto done;
3165                         }
3166                 } else if (entry->wired_count == 0) {
3167                         entry->wired_count++;
3168
3169                         npages = atop(entry->end - entry->start);
3170                         if (user_wire && !vm_map_wire_user_count_add(npages)) {
3171                                 vm_map_wire_entry_failure(map, entry,
3172                                     entry->start);
3173                                 end = entry->end;
3174                                 rv = KERN_RESOURCE_SHORTAGE;
3175                                 goto done;
3176                         }
3177
3178                         /*
3179                          * Release the map lock, relying on the in-transition
3180                          * mark.  Mark the map busy for fork.
3181                          */
3182                         saved_start = entry->start;
3183                         saved_end = entry->end;
3184                         last_timestamp = map->timestamp;
3185                         vm_map_busy(map);
3186                         vm_map_unlock(map);
3187
3188                         faddr = saved_start;
3189                         do {
3190                                 /*
3191                                  * Simulate a fault to get the page and enter
3192                                  * it into the physical map.
3193                                  */
3194                                 if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
3195                                     VM_FAULT_WIRE)) != KERN_SUCCESS)
3196                                         break;
3197                         } while ((faddr += PAGE_SIZE) < saved_end);
3198                         vm_map_lock(map);
3199                         vm_map_unbusy(map);
3200                         if (last_timestamp + 1 != map->timestamp) {
3201                                 /*
3202                                  * Look again for the entry because the map was
3203                                  * modified while it was unlocked.  The entry
3204                                  * may have been clipped, but NOT merged or
3205                                  * deleted.
3206                                  */
3207                                 if (!vm_map_lookup_entry(map, saved_start,
3208                                     &tmp_entry))
3209                                         KASSERT(false,
3210                                             ("vm_map_wire: lookup failed"));
3211                                 if (entry == first_entry)
3212                                         first_entry = tmp_entry;
3213                                 else
3214                                         first_entry = NULL;
3215                                 entry = tmp_entry;
3216                                 while (entry->end < saved_end) {
3217                                         /*
3218                                          * In case of failure, handle entries
3219                                          * that were not fully wired here;
3220                                          * fully wired entries are handled
3221                                          * later.
3222                                          */
3223                                         if (rv != KERN_SUCCESS &&
3224                                             faddr < entry->end)
3225                                                 vm_map_wire_entry_failure(map,
3226                                                     entry, faddr);
3227                                         entry = entry->next;
3228                                 }
3229                         }
3230                         if (rv != KERN_SUCCESS) {
3231                                 vm_map_wire_entry_failure(map, entry, faddr);
3232                                 if (user_wire)
3233                                         vm_map_wire_user_count_sub(npages);
3234                                 end = entry->end;
3235                                 goto done;
3236                         }
3237                 } else if (!user_wire ||
3238                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3239                         entry->wired_count++;
3240                 }
3241                 /*
3242                  * Check the map for holes in the specified region.
3243                  * If holes_ok was specified, skip this check.
3244                  */
3245                 if (!holes_ok &&
3246                     entry->end < end && entry->next->start > entry->end) {
3247                         end = entry->end;
3248                         rv = KERN_INVALID_ADDRESS;
3249                         goto done;
3250                 }
3251                 entry = entry->next;
3252         }
3253         rv = KERN_SUCCESS;
3254 done:
3255         need_wakeup = false;
3256         if (first_entry == NULL &&
3257             !vm_map_lookup_entry(map, start, &first_entry)) {
3258                 KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3259                 first_entry = first_entry->next;
3260         }
3261         for (entry = first_entry; entry->start < end; entry = entry->next) {
3262                 /*
3263                  * If holes_ok was specified, an empty
3264                  * space in the unwired region could have been mapped
3265                  * while the map lock was dropped for faulting in the
3266                  * pages or draining MAP_ENTRY_IN_TRANSITION.
3267                  * Moreover, another thread could be simultaneously
3268                  * wiring this new mapping entry.  Detect these cases
3269                  * and skip any entries marked as in transition not by us.
3270                  */
3271                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3272                     entry->wiring_thread != curthread) {
3273                         KASSERT(holes_ok,
3274                             ("vm_map_wire: !HOLESOK and new/changed entry"));
3275                         continue;
3276                 }
3277
3278                 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3279                         /* do nothing */
3280                 } else if (rv == KERN_SUCCESS) {
3281                         if (user_wire)
3282                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
3283                 } else if (entry->wired_count == -1) {
3284                         /*
3285                          * Wiring failed on this entry.  Thus, unwiring is
3286                          * unnecessary.
3287                          */
3288                         entry->wired_count = 0;
3289                 } else if (!user_wire ||
3290                     (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3291                         /*
3292                          * Undo the wiring.  Wiring succeeded on this entry
3293                          * but failed on a later entry.  
3294                          */
3295                         if (entry->wired_count == 1) {
3296                                 vm_map_entry_unwire(map, entry);
3297                                 if (user_wire)
3298                                         vm_map_wire_user_count_sub(
3299                                             atop(entry->end - entry->start));
3300                         } else
3301                                 entry->wired_count--;
3302                 }
3303                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3304                     ("vm_map_wire: in-transition flag missing %p", entry));
3305                 KASSERT(entry->wiring_thread == curthread,
3306                     ("vm_map_wire: alien wire %p", entry));
3307                 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3308                     MAP_ENTRY_WIRE_SKIPPED);
3309                 entry->wiring_thread = NULL;
3310                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3311                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3312                         need_wakeup = true;
3313                 }
3314                 vm_map_try_merge_entries(map, entry->prev, entry);
3315         }
3316         vm_map_try_merge_entries(map, entry->prev, entry);
3317         if (need_wakeup)
3318                 vm_map_wakeup(map);
3319         return (rv);
3320 }
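
/*
 * Illustrative sketch (not compiled): VM_MAP_WIRE_WRITE adds VM_PROT_WRITE
 * to the protection checked above, so entries that are not writable are
 * skipped (with HOLESOK) or cause the call to fail.  A hypothetical caller
 * that wires a range it intends to dirty is shown below; the helper name
 * and parameters are invented for exposition.
 */
#if 0
static int
example_wire_for_write(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        int rv;

        vm_map_lock(map);
        rv = vm_map_wire_locked(map, start, end,
            VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES | VM_MAP_WIRE_WRITE);
        vm_map_unlock(map);
        return (rv);
}
#endif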
3321
3322 /*
3323  * vm_map_sync
3324  *
3325  * Push any dirty cached pages in the address range to their pager.
3326  * If syncio is TRUE, dirty pages are written synchronously.
3327  * If invalidate is TRUE, any cached pages are freed as well.
3328  *
3329  * If the size of the region from start to end is zero, we are
3330  * supposed to flush all modified pages within the region containing
3331  * start.  Unfortunately, a region can be split or coalesced with
3332  * neighboring regions, making it difficult to determine what the
3333  * original region was.  Therefore, we approximate this requirement by
3334  * flushing the current region containing start.
3335  *
3336  * Returns an error if any part of the specified range is not mapped.
3337  */
3338 int
3339 vm_map_sync(
3340         vm_map_t map,
3341         vm_offset_t start,
3342         vm_offset_t end,
3343         boolean_t syncio,
3344         boolean_t invalidate)
3345 {
3346         vm_map_entry_t current;
3347         vm_map_entry_t entry;
3348         vm_size_t size;
3349         vm_object_t object;
3350         vm_ooffset_t offset;
3351         unsigned int last_timestamp;
3352         boolean_t failed;
3353
3354         vm_map_lock_read(map);
3355         VM_MAP_RANGE_CHECK(map, start, end);
3356         if (!vm_map_lookup_entry(map, start, &entry)) {
3357                 vm_map_unlock_read(map);
3358                 return (KERN_INVALID_ADDRESS);
3359         } else if (start == end) {
3360                 start = entry->start;
3361                 end = entry->end;
3362         }
3363         /*
3364          * Make a first pass to check for user-wired memory and holes.
3365          */
3366         for (current = entry; current->start < end; current = current->next) {
3367                 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
3368                         vm_map_unlock_read(map);
3369                         return (KERN_INVALID_ARGUMENT);
3370                 }
3371                 if (end > current->end &&
3372                     current->end != current->next->start) {
3373                         vm_map_unlock_read(map);
3374                         return (KERN_INVALID_ADDRESS);
3375                 }
3376         }
3377
3378         if (invalidate)
3379                 pmap_remove(map->pmap, start, end);
3380         failed = FALSE;
3381
3382         /*
3383          * Make a second pass, cleaning/uncaching pages from the indicated
3384          * objects as we go.
3385          */
3386         for (current = entry; current->start < end;) {
3387                 offset = current->offset + (start - current->start);
3388                 size = (end <= current->end ? end : current->end) - start;
3389                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
3390                         vm_map_t smap;
3391                         vm_map_entry_t tentry;
3392                         vm_size_t tsize;
3393
3394                         smap = current->object.sub_map;
3395                         vm_map_lock_read(smap);
3396                         (void) vm_map_lookup_entry(smap, offset, &tentry);
3397                         tsize = tentry->end - offset;
3398                         if (tsize < size)
3399                                 size = tsize;
3400                         object = tentry->object.vm_object;
3401                         offset = tentry->offset + (offset - tentry->start);
3402                         vm_map_unlock_read(smap);
3403                 } else {
3404                         object = current->object.vm_object;
3405                 }
3406                 vm_object_reference(object);
3407                 last_timestamp = map->timestamp;
3408                 vm_map_unlock_read(map);
3409                 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3410                         failed = TRUE;
3411                 start += size;
3412                 vm_object_deallocate(object);
3413                 vm_map_lock_read(map);
3414                 if (last_timestamp == map->timestamp ||
3415                     !vm_map_lookup_entry(map, start, &current))
3416                         current = current->next;
3417         }
3418
3419         vm_map_unlock_read(map);
3420         return (failed ? KERN_FAILURE : KERN_SUCCESS);
3421 }
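
/*
 * Illustrative sketch (not compiled): vm_map_sync() above implements the
 * flushing behind msync(2).  A hypothetical caller requesting a synchronous
 * write-back without invalidation is shown below; the helper name and
 * parameters are invented for exposition.
 */
#if 0
static int
example_msync_range(struct proc *p, vm_offset_t addr, vm_size_t len)
{

        /* syncio = TRUE waits for the writes; invalidate = FALSE. */
        return (vm_map_sync(&p->p_vmspace->vm_map, trunc_page(addr),
            round_page(addr + len), TRUE, FALSE));
}
#endif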
3422
3423 /*
3424  *      vm_map_entry_unwire:    [ internal use only ]
3425  *
3426  *      Make the region specified by this entry pageable.
3427  *
3428  *      The map in question should be locked.
3429  *      [This is the reason for this routine's existence.]
3430  */
3431 static void
3432 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3433 {
3434         vm_size_t size;
3435
3436         VM_MAP_ASSERT_LOCKED(map);
3437         KASSERT(entry->wired_count > 0,
3438             ("vm_map_entry_unwire: entry %p isn't wired", entry));
3439
3440         size = entry->end - entry->start;
3441         if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3442                 vm_map_wire_user_count_sub(atop(size));
3443         pmap_unwire(map->pmap, entry->start, entry->end);
3444         vm_object_unwire(entry->object.vm_object, entry->offset, size,
3445             PQ_ACTIVE);
3446         entry->wired_count = 0;
3447 }
3448
3449 static void
3450 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3451 {
3452
3453         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3454                 vm_object_deallocate(entry->object.vm_object);
3455         uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3456 }
3457
3458 /*
3459  *      vm_map_entry_delete:    [ internal use only ]
3460  *
3461  *      Deallocate the given entry from the target map.
3462  */
3463 static void
3464 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3465 {
3466         vm_object_t object;
3467         vm_pindex_t offidxstart, offidxend, count, size1;
3468         vm_size_t size;
3469
3470         vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3471         object = entry->object.vm_object;
3472
3473         if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3474                 MPASS(entry->cred == NULL);
3475                 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3476                 MPASS(object == NULL);
3477                 vm_map_entry_deallocate(entry, map->system_map);
3478                 return;
3479         }
3480
3481         size = entry->end - entry->start;
3482         map->size -= size;
3483
3484         if (entry->cred != NULL) {
3485                 swap_release_by_cred(size, entry->cred);
3486                 crfree(entry->cred);
3487         }
3488
3489         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
3490             (object != NULL)) {
3491                 KASSERT(entry->cred == NULL || object->cred == NULL ||
3492                     (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3493                     ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3494                 count = atop(size);
3495                 offidxstart = OFF_TO_IDX(entry->offset);
3496                 offidxend = offidxstart + count;
3497                 VM_OBJECT_WLOCK(object);
3498                 if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
3499                     OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
3500                     object == kernel_object)) {
3501                         vm_object_collapse(object);
3502
3503                         /*
3504                          * The option OBJPR_NOTMAPPED can be passed here
3505                          * because vm_map_delete() already performed
3506                          * pmap_remove() on the only mapping to this range
3507                          * of pages. 
3508                          */
3509                         vm_object_page_remove(object, offidxstart, offidxend,
3510                             OBJPR_NOTMAPPED);
3511                         if (object->type == OBJT_SWAP)
3512                                 swap_pager_freespace(object, offidxstart,
3513                                     count);
3514                         if (offidxend >= object->size &&
3515                             offidxstart < object->size) {
3516                                 size1 = object->size;
3517                                 object->size = offidxstart;
3518                                 if (object->cred != NULL) {
3519                                         size1 -= object->size;
3520                                         KASSERT(object->charge >= ptoa(size1),
3521                                             ("object %p charge < 0", object));
3522                                         swap_release_by_cred(ptoa(size1),
3523                                             object->cred);
3524                                         object->charge -= ptoa(size1);
3525                                 }
3526                         }
3527                 }
3528                 VM_OBJECT_WUNLOCK(object);
3529         } else
3530                 entry->object.vm_object = NULL;
3531         if (map->system_map)
3532                 vm_map_entry_deallocate(entry, TRUE);
3533         else {
3534                 entry->next = curthread->td_map_def_user;
3535                 curthread->td_map_def_user = entry;
3536         }
3537 }
3538
3539 /*
3540  *      vm_map_delete:  [ internal use only ]
3541  *
3542  *      Deallocates the given address range from the target
3543  *      map.
3544  */
3545 int
3546 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3547 {
3548         vm_map_entry_t entry;
3549         vm_map_entry_t first_entry;
3550
3551         VM_MAP_ASSERT_LOCKED(map);
3552         if (start == end)
3553                 return (KERN_SUCCESS);
3554
3555         /*
3556          * Find the start of the region, and clip it
3557          */
3558         if (!vm_map_lookup_entry(map, start, &first_entry))
3559                 entry = first_entry->next;
3560         else {
3561                 entry = first_entry;
3562                 vm_map_clip_start(map, entry, start);
3563         }
3564
3565         /*
3566          * Step through all entries in this region
3567          */
3568         while (entry->start < end) {
3569                 vm_map_entry_t next;
3570
3571                 /*
3572                  * Wait for wiring or unwiring of an entry to complete.
3573                  * Also wait for any system wirings to disappear on
3574                  * user maps.
3575                  */
3576                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3577                     (vm_map_pmap(map) != kernel_pmap &&
3578                     vm_map_entry_system_wired_count(entry) != 0)) {
3579                         unsigned int last_timestamp;
3580                         vm_offset_t saved_start;
3581                         vm_map_entry_t tmp_entry;
3582
3583                         saved_start = entry->start;
3584                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3585                         last_timestamp = map->timestamp;
3586                         (void) vm_map_unlock_and_wait(map, 0);
3587                         vm_map_lock(map);
3588                         if (last_timestamp + 1 != map->timestamp) {
3589                                 /*
3590                                  * Look again for the entry because the map was
3591                                  * modified while it was unlocked.
3592                                  * Specifically, the entry may have been
3593                                  * clipped, merged, or deleted.
3594                                  */
3595                                 if (!vm_map_lookup_entry(map, saved_start,
3596                                                          &tmp_entry))
3597                                         entry = tmp_entry->next;
3598                                 else {
3599                                         entry = tmp_entry;
3600                                         vm_map_clip_start(map, entry,
3601                                                           saved_start);
3602                                 }
3603                         }
3604                         continue;
3605                 }
3606                 vm_map_clip_end(map, entry, end);
3607
3608                 next = entry->next;
3609
3610                 /*
3611                  * Unwire before removing addresses from the pmap; otherwise,
3612                  * unwiring will put the entries back in the pmap.
3613                  */
3614                 if (entry->wired_count != 0)
3615                         vm_map_entry_unwire(map, entry);
3616
3617                 /*
3618                  * Remove mappings for the pages, but only if the
3619                  * mappings could exist.  For instance, it does not
3620                  * make sense to call pmap_remove() for guard entries.
3621                  */
3622                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
3623                     entry->object.vm_object != NULL)
3624                         pmap_remove(map->pmap, entry->start, entry->end);
3625
3626                 if (entry->end == map->anon_loc)
3627                         map->anon_loc = entry->start;
3628
3629                 /*
3630                  * Delete the entry only after removing all pmap
3631                  * entries pointing to its pages.  (Otherwise, its
3632                  * page frames may be reallocated, and any modify bits
3633                  * will be set in the wrong object!)
3634                  */
3635                 vm_map_entry_delete(map, entry);
3636                 entry = next;
3637         }
3638         return (KERN_SUCCESS);
3639 }
3640
3641 /*
3642  *      vm_map_remove:
3643  *
3644  *      Remove the given address range from the target map.
3645  *      This is the exported form of vm_map_delete.
3646  */
3647 int
3648 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3649 {
3650         int result;
3651
3652         vm_map_lock(map);
3653         VM_MAP_RANGE_CHECK(map, start, end);
3654         result = vm_map_delete(map, start, end);
3655         vm_map_unlock(map);
3656         return (result);
3657 }
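
/*
 * Illustrative sketch (not compiled): vm_map_remove() above is the exported
 * unmap primitive; it takes the map lock, range-checks, and calls
 * vm_map_delete().  The helper name and parameters below are invented for
 * exposition.
 */
#if 0
static int
example_unmap_range(vm_map_t map, vm_offset_t addr, vm_size_t len)
{

        return (vm_map_remove(map, trunc_page(addr),
            round_page(addr + len)));
}
#endif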
3658
3659 /*
3660  *      vm_map_check_protection:
3661  *
3662  *      Assert that the target map allows the specified privilege on the
3663  *      entire address region given.  The entire region must be allocated.
3664  *
3665  *      WARNING!  This code does not and should not check whether the
3666  *      contents of the region are accessible.  For example, a smaller file
3667  *      might be mapped into a larger address space.
3668  *
3669  *      NOTE!  This code is also called by munmap().
3670  *
3671  *      The map must be locked.  A read lock is sufficient.
3672  */
3673 boolean_t
3674 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3675                         vm_prot_t protection)
3676 {
3677         vm_map_entry_t entry;
3678         vm_map_entry_t tmp_entry;
3679
3680         if (!vm_map_lookup_entry(map, start, &tmp_entry))
3681                 return (FALSE);
3682         entry = tmp_entry;
3683
3684         while (start < end) {
3685                 /*
3686                  * No holes allowed!
3687                  */
3688                 if (start < entry->start)
3689                         return (FALSE);
3690                 /*
3691                  * Check protection associated with entry.
3692                  */
3693                 if ((entry->protection & protection) != protection)
3694                         return (FALSE);
3695                 /* go to next entry */
3696                 start = entry->end;
3697                 entry = entry->next;
3698         }
3699         return (TRUE);
3700 }
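
/*
 * Illustrative sketch (not compiled): vm_map_check_protection() above only
 * inspects map entries, so a read lock suffices.  A hypothetical caller
 * verifying that a buffer is readable and writable is shown below; the
 * helper name and parameters are invented for exposition.
 */
#if 0
static boolean_t
example_range_is_rw(vm_map_t map, vm_offset_t addr, vm_size_t len)
{
        boolean_t ok;

        vm_map_lock_read(map);
        ok = vm_map_check_protection(map, trunc_page(addr),
            round_page(addr + len), VM_PROT_READ | VM_PROT_WRITE);
        vm_map_unlock_read(map);
        return (ok);
}
#endif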
3701
3702 /*
3703  *      vm_map_copy_entry:
3704  *
3705  *      Copies the contents of the source entry to the destination
3706  *      entry.  The entries *must* be aligned properly.
3707  */
3708 static void
3709 vm_map_copy_entry(
3710         vm_map_t src_map,
3711         vm_map_t dst_map,
3712         vm_map_entry_t src_entry,
3713         vm_map_entry_t dst_entry,
3714         vm_ooffset_t *fork_charge)
3715 {
3716         vm_object_t src_object;
3717         vm_map_entry_t fake_entry;
3718         vm_offset_t size;
3719         struct ucred *cred;
3720         int charged;
3721
3722         VM_MAP_ASSERT_LOCKED(dst_map);
3723
3724         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3725                 return;
3726
3727         if (src_entry->wired_count == 0 ||
3728             (src_entry->protection & VM_PROT_WRITE) == 0) {
3729                 /*
3730                  * If the source entry is marked needs_copy, it is already
3731                  * write-protected.
3732                  */
3733                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3734                     (src_entry->protection & VM_PROT_WRITE) != 0) {
3735                         pmap_protect(src_map->pmap,
3736                             src_entry->start,
3737                             src_entry->end,
3738                             src_entry->protection & ~VM_PROT_WRITE);
3739                 }
3740
3741                 /*
3742                  * Make a copy of the object.
3743                  */
3744                 size = src_entry->end - src_entry->start;
3745                 if ((src_object = src_entry->object.vm_object) != NULL) {
3746                         VM_OBJECT_WLOCK(src_object);
3747                         charged = ENTRY_CHARGED(src_entry);
3748                         if (src_object->handle == NULL &&
3749                             (src_object->type == OBJT_DEFAULT ||
3750                             src_object->type == OBJT_SWAP)) {
3751                                 vm_object_collapse(src_object);
3752                                 if ((src_object->flags & (OBJ_NOSPLIT |
3753                                     OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3754                                         vm_object_split(src_entry);
3755                                         src_object =
3756                                             src_entry->object.vm_object;
3757                                 }
3758                         }
3759                         vm_object_reference_locked(src_object);
3760                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3761                         if (src_entry->cred != NULL &&
3762                             !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3763                                 KASSERT(src_object->cred == NULL,
3764                                     ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3765                                      src_object));
3766                                 src_object->cred = src_entry->cred;
3767                                 src_object->charge = size;
3768                         }
3769                         VM_OBJECT_WUNLOCK(src_object);
3770                         dst_entry->object.vm_object = src_object;
3771                         if (charged) {
3772                                 cred = curthread->td_ucred;
3773                                 crhold(cred);
3774                                 dst_entry->cred = cred;
3775                                 *fork_charge += size;
3776                                 if (!(src_entry->eflags &
3777                                       MAP_ENTRY_NEEDS_COPY)) {
3778                                         crhold(cred);
3779                                         src_entry->cred = cred;
3780                                         *fork_charge += size;
3781                                 }
3782                         }
3783                         src_entry->eflags |= MAP_ENTRY_COW |
3784                             MAP_ENTRY_NEEDS_COPY;
3785                         dst_entry->eflags |= MAP_ENTRY_COW |
3786                             MAP_ENTRY_NEEDS_COPY;
3787                         dst_entry->offset = src_entry->offset;
3788                         if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
3789                                 /*
3790                                  * MAP_ENTRY_WRITECNT cannot
3791                                  * indicate write reference from
3792                                  * src_entry, since the entry is
3793                                  * marked as needs copy.  Allocate a
3794                                  * fake entry that is used to
3795                                  * decrement object->un_pager writecount
3796                                  * at the appropriate time.  Attach
3797                                  * fake_entry to the deferred list.
3798                                  */
3799                                 fake_entry = vm_map_entry_create(dst_map);
3800                                 fake_entry->eflags = MAP_ENTRY_WRITECNT;
3801                                 src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
3802                                 vm_object_reference(src_object);
3803                                 fake_entry->object.vm_object = src_object;
3804                                 fake_entry->start = src_entry->start;
3805                                 fake_entry->end = src_entry->end;
3806                                 fake_entry->next = curthread->td_map_def_user;
3807                                 curthread->td_map_def_user = fake_entry;
3808                         }
3809
3810                         pmap_copy(dst_map->pmap, src_map->pmap,
3811                             dst_entry->start, dst_entry->end - dst_entry->start,
3812                             src_entry->start);
3813                 } else {
3814                         dst_entry->object.vm_object = NULL;
3815                         dst_entry->offset = 0;
3816                         if (src_entry->cred != NULL) {
3817                                 dst_entry->cred = curthread->td_ucred;
3818                                 crhold(dst_entry->cred);
3819                                 *fork_charge += size;
3820                         }
3821                 }
3822         } else {
3823                 /*
3824                  * We don't want to make writeable wired pages copy-on-write.
3825                  * Immediately copy these pages into the new map by simulating
3826                  * page faults.  The new pages are pageable.
3827                  */
3828                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3829                     fork_charge);
3830         }
3831 }
3832
3833 /*
3834  * vmspace_map_entry_forked:
3835  * Update the newly-forked vmspace each time a map entry is inherited
3836  * or copied.  The values for vm_dsize and vm_tsize are approximate
3837  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3838  */
3839 static void
3840 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3841     vm_map_entry_t entry)
3842 {
3843         vm_size_t entrysize;
3844         vm_offset_t newend;
3845
3846         if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3847                 return;
3848         entrysize = entry->end - entry->start;
3849         vm2->vm_map.size += entrysize;
3850         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3851                 vm2->vm_ssize += btoc(entrysize);
3852         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3853             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3854                 newend = MIN(entry->end,
3855                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3856                 vm2->vm_dsize += btoc(newend - entry->start);
3857         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3858             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3859                 newend = MIN(entry->end,
3860                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3861                 vm2->vm_tsize += btoc(newend - entry->start);
3862         }
3863 }
3864
3865 /*
3866  * vmspace_fork:
3867  * Create a new process vmspace structure and vm_map
3868  * based on those of an existing process.  The new map
3869  * is based on the old map, according to the inheritance
3870  * values on the regions in that map.
3871  *
3872  * XXX It might be worth coalescing the entries added to the new vmspace.
3873  *
3874  * The source map must not be locked.
3875  */
3876 struct vmspace *
3877 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3878 {
3879         struct vmspace *vm2;
3880         vm_map_t new_map, old_map;
3881         vm_map_entry_t new_entry, old_entry;
3882         vm_object_t object;
3883         int error, locked;
3884         vm_inherit_t inh;
3885
3886         old_map = &vm1->vm_map;
3887         /* Copy immutable fields of vm1 to vm2. */
3888         vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
3889             pmap_pinit);
3890         if (vm2 == NULL)
3891                 return (NULL);
3892
3893         vm2->vm_taddr = vm1->vm_taddr;
3894         vm2->vm_daddr = vm1->vm_daddr;
3895         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3896         vm_map_lock(old_map);
3897         if (old_map->busy)
3898                 vm_map_wait_busy(old_map);
3899         new_map = &vm2->vm_map;
3900         locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3901         KASSERT(locked, ("vmspace_fork: lock failed"));
3902
3903         error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
3904         if (error != 0) {
3905                 sx_xunlock(&old_map->lock);
3906                 sx_xunlock(&new_map->lock);
3907                 vm_map_process_deferred();
3908                 vmspace_free(vm2);
3909                 return (NULL);
3910         }
3911
3912         new_map->anon_loc = old_map->anon_loc;
3913
3914         old_entry = old_map->header.next;
3915
3916         while (old_entry != &old_map->header) {
3917                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3918                         panic("vm_map_fork: encountered a submap");
3919
3920                 inh = old_entry->inheritance;
3921                 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3922                     inh != VM_INHERIT_NONE)
3923                         inh = VM_INHERIT_COPY;
3924
3925                 switch (inh) {
3926                 case VM_INHERIT_NONE:
3927                         break;
3928
3929                 case VM_INHERIT_SHARE:
3930                         /*
3931                          * Clone the entry, creating the shared object if necessary.
3932                          */
3933                         object = old_entry->object.vm_object;
3934                         if (object == NULL) {
3935                                 vm_map_entry_back(old_entry);
3936                                 object = old_entry->object.vm_object;
3937                         }
3938
3939                         /*
3940                          * Add the reference before calling vm_object_shadow
3941                          * to ensure that a shadow object is created.
3942                          */
3943                         vm_object_reference(object);
3944                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3945                                 vm_object_shadow(&old_entry->object.vm_object,
3946                                     &old_entry->offset,
3947                                     old_entry->end - old_entry->start);
3948                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3949                                 /* Transfer the second reference too. */
3950                                 vm_object_reference(
3951                                     old_entry->object.vm_object);
3952
3953                                 /*
3954                                  * As in vm_map_merged_neighbor_dispose(),
3955                                  * the vnode lock will not be acquired in
3956                                  * this call to vm_object_deallocate().
3957                                  */
3958                                 vm_object_deallocate(object);
3959                                 object = old_entry->object.vm_object;
3960                         }
3961                         VM_OBJECT_WLOCK(object);
3962                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3963                         if (old_entry->cred != NULL) {
3964                                 KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3965                                 object->cred = old_entry->cred;
3966                                 object->charge = old_entry->end - old_entry->start;
3967                                 old_entry->cred = NULL;
3968                         }
3969
3970                         /*
3971                          * Assert the correct state of the vnode
3972                          * v_writecount while the object is locked, so
3973                          * that it does not have to be relocked later
3974                          * just for the assertion.
3975                          */
3976                         if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
3977                             object->type == OBJT_VNODE) {
3978                                 KASSERT(((struct vnode *)object->handle)->
3979                                     v_writecount > 0,
3980                                     ("vmspace_fork: v_writecount %p", object));
3981                                 KASSERT(object->un_pager.vnp.writemappings > 0,
3982                                     ("vmspace_fork: vnp.writecount %p",
3983                                     object));
3984                         }
3985                         VM_OBJECT_WUNLOCK(object);
3986
3987                         /*
3988                          * Clone the entry, referencing the shared object.
3989                          */
3990                         new_entry = vm_map_entry_create(new_map);
3991                         *new_entry = *old_entry;
3992                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3993                             MAP_ENTRY_IN_TRANSITION);
3994                         new_entry->wiring_thread = NULL;
3995                         new_entry->wired_count = 0;
3996                         if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
3997                                 vm_pager_update_writecount(object,
3998                                     new_entry->start, new_entry->end);
3999                         }
4000                         vm_map_entry_set_vnode_text(new_entry, true);
4001
4002                         /*
4003                          * Insert the entry into the new map -- we know we're
4004                          * inserting at the end of the new map.
4005                          */
4006                         vm_map_entry_link(new_map, new_entry);
4007                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4008
4009                         /*
4010                          * Update the physical map
4011                          */
4012                         pmap_copy(new_map->pmap, old_map->pmap,
4013                             new_entry->start,
4014                             (old_entry->end - old_entry->start),
4015                             old_entry->start);
4016                         break;
4017
4018                 case VM_INHERIT_COPY:
4019                         /*
4020                          * Clone the entry and link into the map.
4021                          */
4022                         new_entry = vm_map_entry_create(new_map);
4023                         *new_entry = *old_entry;
4024                         /*
4025                          * Copied entry is COW over the old object.
4026                          */
4027                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4028                             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4029                         new_entry->wiring_thread = NULL;
4030                         new_entry->wired_count = 0;
4031                         new_entry->object.vm_object = NULL;
4032                         new_entry->cred = NULL;
4033                         vm_map_entry_link(new_map, new_entry);
4034                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4035                         vm_map_copy_entry(old_map, new_map, old_entry,
4036                             new_entry, fork_charge);
4037                         vm_map_entry_set_vnode_text(new_entry, true);
4038                         break;
4039
4040                 case VM_INHERIT_ZERO:
4041                         /*
4042                          * Create a new anonymous mapping entry modelled on
4043                          * the old one.
4044                          */
4045                         new_entry = vm_map_entry_create(new_map);
4046                         memset(new_entry, 0, sizeof(*new_entry));
4047
4048                         new_entry->start = old_entry->start;
4049                         new_entry->end = old_entry->end;
4050                         new_entry->eflags = old_entry->eflags &
4051                             ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4052                             MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC);
4053                         new_entry->protection = old_entry->protection;
4054                         new_entry->max_protection = old_entry->max_protection;
4055                         new_entry->inheritance = VM_INHERIT_ZERO;
4056
4057                         vm_map_entry_link(new_map, new_entry);
4058                         vmspace_map_entry_forked(vm1, vm2, new_entry);
4059
4060                         new_entry->cred = curthread->td_ucred;
4061                         crhold(new_entry->cred);
4062                         *fork_charge += (new_entry->end - new_entry->start);
4063
4064                         break;
4065                 }
4066                 old_entry = old_entry->next;
4067         }
4068         /*
4069          * Use inlined vm_map_unlock() to postpone handling the deferred
4070          * map entries, which cannot be done until both old_map and
4071          * new_map locks are released.
4072          */
4073         sx_xunlock(&old_map->lock);
4074         sx_xunlock(&new_map->lock);
4075         vm_map_process_deferred();
4076
4077         return (vm2);
4078 }
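
/*
 * The inheritance cases handled above correspond to the modes a process can
 * request with minherit(2) before calling fork().  A minimal userland sketch;
 * len is an assumed page-aligned size and the INHERIT_ZERO choice is only
 * illustrative:
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	(void)minherit(p, len, INHERIT_ZERO);
 *	if (fork() == 0) {
 *		// The child's [p, p + len) range is backed by the fresh,
 *		// zero-filled entry built in the VM_INHERIT_ZERO case.
 *	}
 */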
4079
4080 /*
4081  * Create a process's stack for exec_new_vmspace().  This function is never
4082  * asked to wire the newly created stack.
4083  */
4084 int
4085 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4086     vm_prot_t prot, vm_prot_t max, int cow)
4087 {
4088         vm_size_t growsize, init_ssize;
4089         rlim_t vmemlim;
4090         int rv;
4091
4092         MPASS((map->flags & MAP_WIREFUTURE) == 0);
4093         growsize = sgrowsiz;
4094         init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4095         vm_map_lock(map);
4096         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4097         /* If we would blow our VMEM resource limit, no go */
4098         if (map->size + init_ssize > vmemlim) {
4099                 rv = KERN_NO_SPACE;
4100                 goto out;
4101         }
4102         rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4103             max, cow);
4104 out:
4105         vm_map_unlock(map);
4106         return (rv);
4107 }
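
/*
 * A minimal sketch of the usual caller, modelled on exec_new_vmspace(); the
 * sysentvec pointer sv, the vmspace pointer and the use of maxssiz for the
 * reservation size are assumptions of this sketch:
 *
 *	vm_size_t ssiz = maxssiz;
 *	vm_offset_t stack_addr = sv->sv_usrstack - ssiz;
 *	int error;
 *
 *	error = vm_map_stack(&vmspace->vm_map, stack_addr, ssiz,
 *	    VM_PROT_ALL, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 *	if (error != KERN_SUCCESS)
 *		return (vm_mmap_to_errno(error));
 */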
4108
4109 static int stack_guard_page = 1;
4110 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4111     &stack_guard_page, 0,
4112     "Specifies the number of guard pages for a stack that grows");
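
/*
 * Since the knob above is CTLFLAG_RWTUN, it can be changed at runtime or set
 * as a loader tunable; the value 512 below is only an illustration:
 *
 *	# sysctl security.bsd.stack_guard_page=512
 *	# echo 'security.bsd.stack_guard_page=512' >> /boot/loader.conf
 */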
4113
4114 static int
4115 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4116     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4117 {
4118         vm_map_entry_t new_entry, prev_entry;
4119         vm_offset_t bot, gap_bot, gap_top, top;
4120         vm_size_t init_ssize, sgp;
4121         int orient, rv;
4122
4123         /*
4124          * The stack orientation is piggybacked with the cow argument.
4125          * Extract it into orient and mask the cow argument so that we
4126          * don't pass it around further.
4127          */
4128         orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
4129         KASSERT(orient != 0, ("No stack grow direction"));
4130         KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
4131             ("bi-dir stack"));
4132
4133         if (addrbos < vm_map_min(map) ||
4134             addrbos + max_ssize > vm_map_max(map) ||
4135             addrbos + max_ssize <= addrbos)
4136                 return (KERN_INVALID_ADDRESS);
4137         sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
4138             (vm_size_t)stack_guard_page * PAGE_SIZE;
4139         if (sgp >= max_ssize)
4140                 return (KERN_INVALID_ARGUMENT);
4141
4142         init_ssize = growsize;
4143         if (max_ssize < init_ssize + sgp)
4144                 init_ssize = max_ssize - sgp;
4145
4146         /* If addr is already mapped, no go */
4147         if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4148                 return (KERN_NO_SPACE);
4149
4150         /*
4151          * If we can't accommodate max_ssize in the current mapping, no go.
4152          */
4153         if (prev_entry->next->start < addrbos + max_ssize)
4154                 return (KERN_NO_SPACE);
4155
4156         /*
4157          * We initially map a stack of only init_ssize.  We will grow as
4158          * needed later.  Depending on the orientation of the stack (i.e.
4159          * the grow direction) we either map at the top of the range, the
4160          * bottom of the range or in the middle.
4161          *
4162          * Note: we would normally expect prot and max to be VM_PROT_ALL,
4163          * and cow to be 0.  Possibly we should eliminate these as input
4164          * parameters, and just pass these values here in the insert call.
4165          */
4166         if (orient == MAP_STACK_GROWS_DOWN) {
4167                 bot = addrbos + max_ssize - init_ssize;
4168                 top = bot + init_ssize;
4169                 gap_bot = addrbos;
4170                 gap_top = bot;
4171         } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4172                 bot = addrbos;
4173                 top = bot + init_ssize;
4174                 gap_bot = top;
4175                 gap_top = addrbos + max_ssize;
4176         }
4177         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4178         if (rv != KERN_SUCCESS)
4179                 return (rv);
4180         new_entry = prev_entry->next;
4181         KASSERT(new_entry->end == top || new_entry->start == bot,
4182             ("Bad entry start/end for new stack entry"));
4183         KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4184             (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4185             ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4186         KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4187             (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4188             ("new entry lacks MAP_ENTRY_GROWS_UP"));
4189         if (gap_bot == gap_top)
4190                 return (KERN_SUCCESS);
4191         rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4192             VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4193             MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4194         if (rv == KERN_SUCCESS) {
4195                 /*
4196                  * Gap can never successfully handle a fault, so
4197                  * read-ahead logic is never used for it.  Re-use
4198                  * next_read of the gap entry to store
4199                  * stack_guard_page for vm_map_growstack().
4200                  */
4201                 if (orient == MAP_STACK_GROWS_DOWN)
4202                         new_entry->prev->next_read = sgp;
4203                 else
4204                         new_entry->next->next_read = sgp;
4205         } else {
4206                 (void)vm_map_delete(map, bot, top);
4207         }
4208         return (rv);
4209 }
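
/*
 * A worked example of the grows-down layout computed above, using
 * illustrative values: max_ssize = 512 pages, growsize = 32 pages and
 * sgp = 1 guard page.  Then init_ssize = 32 pages and the two
 * vm_map_insert() calls create:
 *
 *	[addrbos, addrbos + 480 pages)       guard/gap entry, VM_PROT_NONE,
 *	                                     MAP_CREATE_STACK_GAP_DN
 *	[addrbos + 480 pages, addrbos + 512 pages)
 *	                                     initial stack entry, grows down
 *
 * The gap entry's next_read field is reused to remember sgp for
 * vm_map_growstack().
 */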
4210
4211 /*
4212  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
4213  * successfully grow the stack.
4214  */
4215 static int
4216 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4217 {
4218         vm_map_entry_t stack_entry;
4219         struct proc *p;
4220         struct vmspace *vm;
4221         struct ucred *cred;
4222         vm_offset_t gap_end, gap_start, grow_start;
4223         vm_size_t grow_amount, guard, max_grow;
4224         rlim_t lmemlim, stacklim, vmemlim;
4225         int rv, rv1;
4226         bool gap_deleted, grow_down, is_procstack;
4227 #ifdef notyet
4228         uint64_t limit;
4229 #endif
4230 #ifdef RACCT
4231         int error;
4232 #endif
4233
4234         p = curproc;
4235         vm = p->p_vmspace;
4236
4237         /*
4238          * Disallow stack growth when the access is performed by a
4239          * debugger or AIO daemon, because the wrong resource limits
4240          * would be applied.
4241          */
4242         if (p != initproc && (map != &p->p_vmspace->vm_map ||
4243             p->p_textvp == NULL))
4244                 return (KERN_FAILURE);
4245
4246         MPASS(!map->system_map);
4247
4248         lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4249         stacklim = lim_cur(curthread, RLIMIT_STACK);
4250         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4251 retry:
4252         /* If addr is not in a hole for a stack grow area, no need to grow. */
4253         if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4254                 return (KERN_FAILURE);
4255         if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4256                 return (KERN_SUCCESS);
4257         if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4258                 stack_entry = gap_entry->next;
4259                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4260                     stack_entry->start != gap_entry->end)
4261                         return (KERN_FAILURE);
4262                 grow_amount = round_page(stack_entry->start - addr);
4263                 grow_down = true;
4264         } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4265                 stack_entry = gap_entry->prev;
4266                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4267                     stack_entry->end != gap_entry->start)
4268                         return (KERN_FAILURE);
4269                 grow_amount = round_page(addr + 1 - stack_entry->end);
4270                 grow_down = false;
4271         } else {
4272                 return (KERN_FAILURE);
4273         }
4274         guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
4275             gap_entry->next_read;
4276         max_grow = gap_entry->end - gap_entry->start;
4277         if (guard > max_grow)
4278                 return (KERN_NO_SPACE);
4279         max_grow -= guard;
4280         if (grow_amount > max_grow)
4281                 return (KERN_NO_SPACE);
4282
4283         /*
4284          * If this is the main process stack, see if we're over the stack
4285          * limit.
4286          */
4287         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4288             addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4289         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4290                 return (KERN_NO_SPACE);
4291
4292 #ifdef RACCT
4293         if (racct_enable) {
4294                 PROC_LOCK(p);
4295                 if (is_procstack && racct_set(p, RACCT_STACK,
4296                     ctob(vm->vm_ssize) + grow_amount)) {
4297                         PROC_UNLOCK(p);
4298                         return (KERN_NO_SPACE);
4299                 }
4300                 PROC_UNLOCK(p);
4301         }
4302 #endif
4303
4304         grow_amount = roundup(grow_amount, sgrowsiz);
4305         if (grow_amount > max_grow)
4306                 grow_amount = max_grow;
4307         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4308                 grow_amount = trunc_page((vm_size_t)stacklim) -
4309                     ctob(vm->vm_ssize);
4310         }
4311
4312 #ifdef notyet
4313         PROC_LOCK(p);
4314         limit = racct_get_available(p, RACCT_STACK);
4315         PROC_UNLOCK(p);
4316         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4317                 grow_amount = limit - ctob(vm->vm_ssize);
4318 #endif
4319
4320         if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4321                 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4322                         rv = KERN_NO_SPACE;
4323                         goto out;
4324                 }
4325 #ifdef RACCT
4326                 if (racct_enable) {
4327                         PROC_LOCK(p);
4328                         if (racct_set(p, RACCT_MEMLOCK,
4329                             ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4330                                 PROC_UNLOCK(p);
4331                                 rv = KERN_NO_SPACE;
4332                                 goto out;
4333                         }
4334                         PROC_UNLOCK(p);
4335                 }
4336 #endif
4337         }
4338
4339         /* If we would blow our VMEM resource limit, no go */
4340         if (map->size + grow_amount > vmemlim) {
4341                 rv = KERN_NO_SPACE;
4342                 goto out;
4343         }
4344 #ifdef RACCT
4345         if (racct_enable) {
4346                 PROC_LOCK(p);
4347                 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4348                         PROC_UNLOCK(p);
4349                         rv = KERN_NO_SPACE;
4350                         goto out;
4351                 }
4352                 PROC_UNLOCK(p);
4353         }
4354 #endif
4355
4356         if (vm_map_lock_upgrade(map)) {
4357                 gap_entry = NULL;
4358                 vm_map_lock_read(map);
4359                 goto retry;
4360         }
4361
4362         if (grow_down) {
4363                 grow_start = gap_entry->end - grow_amount;
4364                 if (gap_entry->start + grow_amount == gap_entry->end) {
4365                         gap_start = gap_entry->start;
4366                         gap_end = gap_entry->end;
4367                         vm_map_entry_delete(map, gap_entry);
4368                         gap_deleted = true;
4369                 } else {
4370                         MPASS(gap_entry->start < gap_entry->end - grow_amount);
4371                         vm_map_entry_resize(map, gap_entry, -grow_amount);
4372                         gap_deleted = false;
4373                 }
4374                 rv = vm_map_insert(map, NULL, 0, grow_start,
4375                     grow_start + grow_amount,
4376                     stack_entry->protection, stack_entry->max_protection,
4377                     MAP_STACK_GROWS_DOWN);
4378                 if (rv != KERN_SUCCESS) {
4379                         if (gap_deleted) {
4380                                 rv1 = vm_map_insert(map, NULL, 0, gap_start,
4381                                     gap_end, VM_PROT_NONE, VM_PROT_NONE,
4382                                     MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4383                                 MPASS(rv1 == KERN_SUCCESS);
4384                         } else
4385                                 vm_map_entry_resize(map, gap_entry,
4386                                     grow_amount);
4387                 }
4388         } else {
4389                 grow_start = stack_entry->end;
4390                 cred = stack_entry->cred;
4391                 if (cred == NULL && stack_entry->object.vm_object != NULL)
4392                         cred = stack_entry->object.vm_object->cred;
4393                 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4394                         rv = KERN_NO_SPACE;
4395                 /* Grow the underlying object if applicable. */
4396                 else if (stack_entry->object.vm_object == NULL ||
4397                     vm_object_coalesce(stack_entry->object.vm_object,
4398                     stack_entry->offset,
4399                     (vm_size_t)(stack_entry->end - stack_entry->start),
4400                     grow_amount, cred != NULL)) {
4401                         if (gap_entry->start + grow_amount == gap_entry->end) {
4402                                 vm_map_entry_delete(map, gap_entry);
4403                                 vm_map_entry_resize(map, stack_entry,
4404                                     grow_amount);
4405                         } else {
4406                                 gap_entry->start += grow_amount;
4407                                 stack_entry->end += grow_amount;
4408                         }
4409                         map->size += grow_amount;
4410                         rv = KERN_SUCCESS;
4411                 } else
4412                         rv = KERN_FAILURE;
4413         }
4414         if (rv == KERN_SUCCESS && is_procstack)
4415                 vm->vm_ssize += btoc(grow_amount);
4416
4417         /*
4418          * Heed the MAP_WIREFUTURE flag if it was set for this process.
4419          */
4420         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4421                 rv = vm_map_wire_locked(map, grow_start,
4422                     grow_start + grow_amount,
4423                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4424         }
4425         vm_map_lock_downgrade(map);
4426
4427 out:
4428 #ifdef RACCT
4429         if (racct_enable && rv != KERN_SUCCESS) {
4430                 PROC_LOCK(p);
4431                 error = racct_set(p, RACCT_VMEM, map->size);
4432                 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4433                 if (!old_mlock) {
4434                         error = racct_set(p, RACCT_MEMLOCK,
4435                             ptoa(pmap_wired_count(map->pmap)));
4436                         KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4437                 }
4438                 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4439                 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4440                 PROC_UNLOCK(p);
4441         }
4442 #endif
4443
4444         return (rv);
4445 }
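
/*
 * For the main process stack the growth above is bounded by RLIMIT_STACK.
 * A minimal userland sketch of tightening that bound; the 8 MB figure is
 * only an illustration:
 *
 *	struct rlimit rl = { .rlim_cur = 8UL * 1024 * 1024,
 *	    .rlim_max = 8UL * 1024 * 1024 };
 *	(void)setrlimit(RLIMIT_STACK, &rl);
 *	// Faults that would grow the stack past the limit now make this
 *	// function return KERN_NO_SPACE, which the fault path typically
 *	// turns into SIGSEGV.
 */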
4446
4447 /*
4448  * Unshare the specified VM space for exec.  A new, empty vmspace is
4449  * created and installed; releasing the old one is deferred to the caller.
4450  */
4451 int
4452 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4453 {
4454         struct vmspace *oldvmspace = p->p_vmspace;
4455         struct vmspace *newvmspace;
4456
4457         KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4458             ("vmspace_exec recursed"));
4459         newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4460         if (newvmspace == NULL)
4461                 return (ENOMEM);
4462         newvmspace->vm_swrss = oldvmspace->vm_swrss;
4463         /*
4464          * This code is written this way for prototype purposes.  The
4465          * goal is to avoid running down the vmspace here, and instead
4466          * to let the other processes still using it run it down
4467          * eventually.  Even though there is little or no chance of
4468          * blocking here, keeping this form is a good idea for future changes.
4469          */
4470         PROC_VMSPACE_LOCK(p);
4471         p->p_vmspace = newvmspace;
4472         PROC_VMSPACE_UNLOCK(p);
4473         if (p == curthread->td_proc)
4474                 pmap_activate(curthread);
4475         curthread->td_pflags |= TDP_EXECVMSPC;
4476         return (0);
4477 }
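
/*
 * Setting TDP_EXECVMSPC above defers the release of the old vmspace to the
 * exec path.  A minimal sketch of the cleanup the caller is expected to
 * perform once the switch is committed (oldvmspace is the caller's saved
 * pointer):
 *
 *	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
 *		vmspace_free(oldvmspace);
 *		td->td_pflags &= ~TDP_EXECVMSPC;
 *	}
 */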
4478
4479 /*
4480  * Unshare the specified VM space for forcing COW.  This
4481  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4482  */
4483 int
4484 vmspace_unshare(struct proc *p)
4485 {
4486         struct vmspace *oldvmspace = p->p_vmspace;
4487         struct vmspace *newvmspace;
4488         vm_ooffset_t fork_charge;
4489
4490         if (oldvmspace->vm_refcnt == 1)
4491                 return (0);
4492         fork_charge = 0;
4493         newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4494         if (newvmspace == NULL)
4495                 return (ENOMEM);
4496         if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4497                 vmspace_free(newvmspace);
4498                 return (ENOMEM);
4499         }
4500         PROC_VMSPACE_LOCK(p);
4501         p->p_vmspace = newvmspace;
4502         PROC_VMSPACE_UNLOCK(p);
4503         if (p == curthread->td_proc)
4504                 pmap_activate(curthread);
4505         vmspace_free(oldvmspace);
4506         return (0);
4507 }
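
/*
 * A minimal sketch of the caller, modelled on fork1(): when rfork(2) is
 * invoked with neither RFPROC nor RFMEM, the calling process keeps running
 * but receives a private copy of its address space:
 *
 *	if ((flags & (RFMEM | RFPROC)) == 0) {
 *		error = vmspace_unshare(p1);
 *		if (error)
 *			return (error);
 *	}
 */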
4508
4509 /*
4510  *      vm_map_lookup:
4511  *
4512  *      Finds the VM object, offset, and
4513  *      protection for a given virtual address in the
4514  *      specified map, assuming a page fault of the
4515  *      type specified.
4516  *
4517  *      Leaves the map in question locked for read; return
4518  *      values are guaranteed until a vm_map_lookup_done
4519  *      call is performed.  Note that the map argument
4520  *      is in/out; the returned map must be used in
4521  *      the call to vm_map_lookup_done.
4522  *
4523  *      A handle (out_entry) is returned for use in
4524  *      vm_map_lookup_done, to make that fast.
4525  *
4526  *      If a lookup is requested with "write protection"
4527  *      specified, the map may be changed to perform virtual
4528  *      copying operations, although the data referenced will
4529  *      remain the same.
4530  */
4531 int
4532 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4533               vm_offset_t vaddr,
4534               vm_prot_t fault_typea,
4535               vm_map_entry_t *out_entry,        /* OUT */
4536               vm_object_t *object,              /* OUT */
4537               vm_pindex_t *pindex,              /* OUT */
4538               vm_prot_t *out_prot,              /* OUT */
4539               boolean_t *wired)                 /* OUT */
4540 {
4541         vm_map_entry_t entry;
4542         vm_map_t map = *var_map;
4543         vm_prot_t prot;
4544         vm_prot_t fault_type = fault_typea;
4545         vm_object_t eobject;
4546         vm_size_t size;
4547         struct ucred *cred;
4548
4549 RetryLookup:
4550
4551         vm_map_lock_read(map);
4552
4553 RetryLookupLocked:
4554         /*
4555          * Lookup the faulting address.
4556          */
4557         if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4558                 vm_map_unlock_read(map);
4559                 return (KERN_INVALID_ADDRESS);
4560         }
4561
4562         entry = *out_entry;
4563
4564         /*
4565          * Handle submaps.
4566          */
4567         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4568                 vm_map_t old_map = map;
4569
4570                 *var_map = map = entry->object.sub_map;
4571                 vm_map_unlock_read(old_map);
4572                 goto RetryLookup;
4573         }
4574
4575         /*
4576          * Check whether this task is allowed to have this page.
4577          */
4578         prot = entry->protection;
4579         if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4580                 fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4581                 if (prot == VM_PROT_NONE && map != kernel_map &&
4582                     (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4583                     (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4584                     MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4585                     vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4586                         goto RetryLookupLocked;
4587         }
4588         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4589         if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4590                 vm_map_unlock_read(map);
4591                 return (KERN_PROTECTION_FAILURE);
4592         }
4593         KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4594             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4595             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4596             ("entry %p flags %x", entry, entry->eflags));
4597         if ((fault_typea & VM_PROT_COPY) != 0 &&
4598             (entry->max_protection & VM_PROT_WRITE) == 0 &&
4599             (entry->eflags & MAP_ENTRY_COW) == 0) {
4600                 vm_map_unlock_read(map);
4601                 return (KERN_PROTECTION_FAILURE);
4602         }
4603
4604         /*
4605          * If this page is not pageable, we have to get it for all possible
4606          * accesses.
4607          */
4608         *wired = (entry->wired_count != 0);
4609         if (*wired)
4610                 fault_type = entry->protection;
4611         size = entry->end - entry->start;
4612         /*
4613          * If the entry was copy-on-write, we either resolve the copy or demote access.
4614          */
4615         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4616                 /*
4617                  * If we want to write the page, we may as well handle that
4618                  * now since we've got the map locked.
4619                  *
4620                  * If we don't need to write the page, we just demote the
4621                  * permissions allowed.
4622                  */
4623                 if ((fault_type & VM_PROT_WRITE) != 0 ||
4624                     (fault_typea & VM_PROT_COPY) != 0) {
4625                         /*
4626                          * Make a new object, and place it in the object
4627                          * chain.  Note that no new references have appeared
4628                          * -- one just moved from the map to the new
4629                          * object.
4630                          */
4631                         if (vm_map_lock_upgrade(map))
4632                                 goto RetryLookup;
4633
4634                         if (entry->cred == NULL) {
4635                                 /*
4636                                  * The debugger owner is charged for
4637                                  * the memory.
4638                                  */
4639                                 cred = curthread->td_ucred;
4640                                 crhold(cred);
4641                                 if (!swap_reserve_by_cred(size, cred)) {
4642                                         crfree(cred);
4643                                         vm_map_unlock(map);
4644                                         return (KERN_RESOURCE_SHORTAGE);
4645                                 }
4646                                 entry->cred = cred;
4647                         }
4648                         vm_object_shadow(&entry->object.vm_object,
4649                             &entry->offset, size);
4650                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4651                         eobject = entry->object.vm_object;
4652                         if (eobject->cred != NULL) {
4653                                 /*
4654                                  * The object was not shadowed.
4655                                  */
4656                                 swap_release_by_cred(size, entry->cred);
4657                                 crfree(entry->cred);
4658                                 entry->cred = NULL;
4659                         } else if (entry->cred != NULL) {
4660                                 VM_OBJECT_WLOCK(eobject);
4661                                 eobject->cred = entry->cred;
4662                                 eobject->charge = size;
4663                                 VM_OBJECT_WUNLOCK(eobject);
4664                                 entry->cred = NULL;
4665                         }
4666
4667                         vm_map_lock_downgrade(map);
4668                 } else {
4669                         /*
4670                          * We're attempting to read a copy-on-write page --
4671                          * don't allow writes.
4672                          */
4673                         prot &= ~VM_PROT_WRITE;
4674                 }
4675         }
4676
4677         /*
4678          * Create an object if necessary.
4679          */
4680         if (entry->object.vm_object == NULL &&
4681             !map->system_map) {
4682                 if (vm_map_lock_upgrade(map))
4683                         goto RetryLookup;
4684                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4685                     atop(size));
4686                 entry->offset = 0;
4687                 if (entry->cred != NULL) {
4688                         VM_OBJECT_WLOCK(entry->object.vm_object);
4689                         entry->object.vm_object->cred = entry->cred;
4690                         entry->object.vm_object->charge = size;
4691                         VM_OBJECT_WUNLOCK(entry->object.vm_object);
4692                         entry->cred = NULL;
4693                 }
4694                 vm_map_lock_downgrade(map);
4695         }
4696
4697         /*
4698          * Return the object/offset from this entry.  If the entry was
4699          * copy-on-write or empty, it has been fixed up.
4700          */
4701         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4702         *object = entry->object.vm_object;
4703
4704         *out_prot = prot;
4705         return (KERN_SUCCESS);
4706 }
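
/*
 * A minimal sketch of the caller protocol described above, modelled on the
 * page-fault path; map and vaddr are assumed to come from the faulting
 * context.  The map argument is in/out because the lookup may descend into a
 * submap, and the returned map/entry pair must be handed back to
 * vm_map_lookup_done():
 *
 *	vm_map_entry_t entry;
 *	vm_object_t object;
 *	vm_pindex_t pindex;
 *	vm_prot_t prot;
 *	boolean_t wired;
 *	int rv;
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv != KERN_SUCCESS)
 *		return (rv);
 *	// ... resolve the fault against (object, pindex) ...
 *	vm_map_lookup_done(map, entry);
 */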
4707
4708 /*
4709  *      vm_map_lookup_locked:
4710  *
4711  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
4712  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4713  */
4714 int
4715 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
4716                      vm_offset_t vaddr,
4717                      vm_prot_t fault_typea,
4718                      vm_map_entry_t *out_entry, /* OUT */
4719                      vm_object_t *object,       /* OUT */
4720                      vm_pindex_t *pindex,       /* OUT */
4721                      vm_prot_t *out_prot,       /* OUT */
4722                      boolean_t *wired)          /* OUT */
4723 {
4724         vm_map_entry_t entry;
4725         vm_map_t map = *var_map;
4726         vm_prot_t prot;
4727         vm_prot_t fault_type = fault_typea;
4728
4729         /*
4730          * Lookup the faulting address.
4731          */
4732         if (!vm_map_lookup_entry(map, vaddr, out_entry))
4733                 return (KERN_INVALID_ADDRESS);
4734
4735         entry = *out_entry;
4736
4737         /*
4738          * Fail if the entry refers to a submap.
4739          */
4740         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4741                 return (KERN_FAILURE);
4742
4743         /*
4744          * Check whether this task is allowed to have this page.
4745          */
4746         prot = entry->protection;
4747         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4748         if ((fault_type & prot) != fault_type)
4749                 return (KERN_PROTECTION_FAILURE);
4750
4751         /*
4752          * If this page is not pageable, we have to get it for all possible
4753          * accesses.
4754          */
4755         *wired = (entry->wired_count != 0);
4756         if (*wired)
4757                 fault_type = entry->protection;
4758
4759         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4760                 /*
4761                  * Fail if the entry was copy-on-write for a write fault.
4762                  */
4763                 if (fault_type & VM_PROT_WRITE)
4764                         return (KERN_FAILURE);
4765                 /*
4766                  * We're attempting to read a copy-on-write page --
4767                  * don't allow writes.
4768                  */
4769                 prot &= ~VM_PROT_WRITE;
4770         }
4771
4772         /*
4773          * Fail if an object should be created.
4774          */
4775         if (entry->object.vm_object == NULL && !map->system_map)
4776                 return (KERN_FAILURE);
4777
4778         /*
4779          * Return the object/offset from this entry.  If the entry was
4780          * copy-on-write or empty, it has been fixed up.
4781          */
4782         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4783         *object = entry->object.vm_object;
4784
4785         *out_prot = prot;
4786         return (KERN_SUCCESS);
4787 }
4788
4789 /*
4790  *      vm_map_lookup_done:
4791  *
4792  *      Releases locks acquired by a vm_map_lookup
4793  *      (according to the handle returned by that lookup).
4794  */
4795 void
4796 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4797 {
4798         /*
4799          * Unlock the main-level map
4800          */
4801         vm_map_unlock_read(map);
4802 }
4803
4804 vm_offset_t
4805 vm_map_max_KBI(const struct vm_map *map)
4806 {
4807
4808         return (vm_map_max(map));
4809 }
4810
4811 vm_offset_t
4812 vm_map_min_KBI(const struct vm_map *map)
4813 {
4814
4815         return (vm_map_min(map));
4816 }
4817
4818 pmap_t
4819 vm_map_pmap_KBI(vm_map_t map)
4820 {
4821
4822         return (map->pmap);
4823 }
4824
4825 #include "opt_ddb.h"
4826 #ifdef DDB
4827 #include <sys/kernel.h>
4828
4829 #include <ddb/ddb.h>
4830
4831 static void
4832 vm_map_print(vm_map_t map)
4833 {
4834         vm_map_entry_t entry, prev;
4835
4836         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4837             (void *)map,
4838             (void *)map->pmap, map->nentries, map->timestamp);
4839
4840         db_indent += 2;
4841         for (prev = &map->header; (entry = prev->next) != &map->header;
4842             prev = entry) {
4843                 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
4844                     (void *)entry, (void *)entry->start, (void *)entry->end,
4845                     entry->eflags);
4846                 {
4847                         static char *inheritance_name[4] =
4848                         {"share", "copy", "none", "donate_copy"};
4849
4850                         db_iprintf(" prot=%x/%x/%s",
4851                             entry->protection,
4852                             entry->max_protection,
4853                             inheritance_name[(int)(unsigned char)
4854                             entry->inheritance]);
4855                         if (entry->wired_count != 0)
4856                                 db_printf(", wired");
4857                 }
4858                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4859                         db_printf(", share=%p, offset=0x%jx\n",
4860                             (void *)entry->object.sub_map,
4861                             (uintmax_t)entry->offset);
4862                         if (prev == &map->header ||
4863                             prev->object.sub_map !=
4864                                 entry->object.sub_map) {
4865                                 db_indent += 2;
4866                                 vm_map_print((vm_map_t)entry->object.sub_map);
4867                                 db_indent -= 2;
4868                         }
4869                 } else {
4870                         if (entry->cred != NULL)
4871                                 db_printf(", ruid %d", entry->cred->cr_ruid);
4872                         db_printf(", object=%p, offset=0x%jx",
4873                             (void *)entry->object.vm_object,
4874                             (uintmax_t)entry->offset);
4875                         if (entry->object.vm_object && entry->object.vm_object->cred)
4876                                 db_printf(", obj ruid %d charge %jx",
4877                                     entry->object.vm_object->cred->cr_ruid,
4878                                     (uintmax_t)entry->object.vm_object->charge);
4879                         if (entry->eflags & MAP_ENTRY_COW)
4880                                 db_printf(", copy (%s)",
4881                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4882                         db_printf("\n");
4883
4884                         if (prev == &map->header ||
4885                             prev->object.vm_object !=
4886                                 entry->object.vm_object) {
4887                                 db_indent += 2;
4888                                 vm_object_print((db_expr_t)(intptr_t)
4889                                                 entry->object.vm_object,
4890                                                 0, 0, (char *)0);
4891                                 db_indent -= 2;
4892                         }
4893                 }
4894         }
4895         db_indent -= 2;
4896 }
4897
4898 DB_SHOW_COMMAND(map, map)
4899 {
4900
4901         if (!have_addr) {
4902                 db_printf("usage: show map <addr>\n");
4903                 return;
4904         }
4905         vm_map_print((vm_map_t)addr);
4906 }
4907
4908 DB_SHOW_COMMAND(procvm, procvm)
4909 {
4910         struct proc *p;
4911
4912         if (have_addr) {
4913                 p = db_lookup_proc(addr);
4914         } else {
4915                 p = curproc;
4916         }
4917
4918         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4919             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4920             (void *)vmspace_pmap(p->p_vmspace));
4921
4922         vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4923 }
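
/*
 * Example ddb(4) usage of the two commands above; the addresses are
 * illustrative only:
 *
 *	db> show procvm 0xfffff80003a9c000
 *	db> show map 0xfffffe0001234000
 */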
4924
4925 #endif /* DDB */