1 /*-
2  * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3  *
4  * Copyright (c) 1991, 1993
5  *      The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
35  *
36  *
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62
63 /*
64  *      Virtual memory mapping module.
65  */
66
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/lock.h>
75 #include <sys/mutex.h>
76 #include <sys/proc.h>
77 #include <sys/vmmeter.h>
78 #include <sys/mman.h>
79 #include <sys/vnode.h>
80 #include <sys/racct.h>
81 #include <sys/resourcevar.h>
82 #include <sys/rwlock.h>
83 #include <sys/file.h>
84 #include <sys/sysctl.h>
85 #include <sys/sysent.h>
86 #include <sys/shm.h>
87
88 #include <vm/vm.h>
89 #include <vm/vm_param.h>
90 #include <vm/pmap.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_pager.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vnode_pager.h>
98 #include <vm/swap_pager.h>
99 #include <vm/uma.h>
100
101 /*
102  *      Virtual memory maps provide for the mapping, protection,
103  *      and sharing of virtual memory objects.  In addition,
104  *      this module provides for an efficient virtual copy of
105  *      memory from one map to another.
106  *
107  *      Synchronization is required prior to most operations.
108  *
109  *      Maps consist of an ordered doubly-linked list of simple
110  *      entries; a self-adjusting binary search tree of these
111  *      entries is used to speed up lookups.
112  *
113  *      Since portions of maps are specified by start/end addresses,
114  *      which may not align with existing map entries, all
115  *      routines merely "clip" entries to these start/end values.
116  *      [That is, an entry is split into two, bordering at a
117  *      start or end value.]  Note that these clippings may not
118  *      always be necessary (as the two resulting entries are then
119  *      not changed); however, the clipping is done for convenience.
120  *
121  *      As mentioned above, virtual copy operations are performed
122  *      by copying VM object references from one map to
123  *      another, and then marking both regions as copy-on-write.
124  */
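
/*
 * Worked example of clipping (illustrative values only): clipping an
 * entry that spans [0x2000, 0x6000) at address 0x4000 yields two
 * adjacent entries, [0x2000, 0x4000) and [0x4000, 0x6000), which
 * together describe exactly the same range as the original entry.
 */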
125
126 static struct mtx map_sleep_mtx;
127 static uma_zone_t mapentzone;
128 static uma_zone_t kmapentzone;
129 static uma_zone_t mapzone;
130 static uma_zone_t vmspace_zone;
131 static int vmspace_zinit(void *mem, int size, int flags);
132 static int vm_map_zinit(void *mem, int size, int flags);
133 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
134     vm_offset_t max);
135 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
136 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
137 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
138 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
139     vm_map_entry_t gap_entry);
140 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
141     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
142 #ifdef INVARIANTS
143 static void vm_map_zdtor(void *mem, int size, void *arg);
144 static void vmspace_zdtor(void *mem, int size, void *arg);
145 #endif
146 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
147     vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
148     int cow);
149 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
150     vm_offset_t failed_addr);
151
152 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
153     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
154      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
155
156 /* 
157  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
158  * stable.
159  */
160 #define PROC_VMSPACE_LOCK(p) do { } while (0)
161 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
162
163 /*
164  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
165  *
166  *      Asserts that the starting and ending region
167  *      addresses fall within the valid range of the map.
168  */
169 #define VM_MAP_RANGE_CHECK(map, start, end)             \
170                 {                                       \
171                 if (start < vm_map_min(map))            \
172                         start = vm_map_min(map);        \
173                 if (end > vm_map_max(map))              \
174                         end = vm_map_max(map);          \
175                 if (start > end)                        \
176                         start = end;                    \
177                 }
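
/*
 * Illustrative sketch of the clamping performed above (the address
 * values are assumptions, not taken from any particular map):
 *
 *      vm_offset_t start = 0x0800, end = 0x9000;
 *
 *      // With vm_map_min(map) == 0x1000 and vm_map_max(map) == 0x8000:
 *      VM_MAP_RANGE_CHECK(map, start, end);
 *      // Now start == 0x1000 and end == 0x8000; a request that ends
 *      // up inverted instead collapses to an empty range, start == end.
 */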
178
179 /*
180  *      vm_map_startup:
181  *
182  *      Initialize the vm_map module.  Must be called before
183  *      any other vm_map routines.
184  *
185  *      Map and entry structures are allocated from the general
186  *      purpose memory pool with some exceptions:
187  *
188  *      - The kernel map and kmem submap are allocated statically.
189  *      - Kernel map entries are allocated out of a static pool.
190  *
191  *      These restrictions are necessary since malloc() uses the
192  *      maps and requires map entries.
193  */
194
195 void
196 vm_map_startup(void)
197 {
198         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
199         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
200 #ifdef INVARIANTS
201             vm_map_zdtor,
202 #else
203             NULL,
204 #endif
205             vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
206         uma_prealloc(mapzone, MAX_KMAP);
207         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
208             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
209             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
210         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
211             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
212         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
213 #ifdef INVARIANTS
214             vmspace_zdtor,
215 #else
216             NULL,
217 #endif
218             vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
219 }
220
221 static int
222 vmspace_zinit(void *mem, int size, int flags)
223 {
224         struct vmspace *vm;
225
226         vm = (struct vmspace *)mem;
227
228         vm->vm_map.pmap = NULL;
229         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
230         PMAP_LOCK_INIT(vmspace_pmap(vm));
231         return (0);
232 }
233
234 static int
235 vm_map_zinit(void *mem, int size, int flags)
236 {
237         vm_map_t map;
238
239         map = (vm_map_t)mem;
240         memset(map, 0, sizeof(*map));
241         mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
242         sx_init(&map->lock, "vm map (user)");
243         return (0);
244 }
245
246 #ifdef INVARIANTS
247 static void
248 vmspace_zdtor(void *mem, int size, void *arg)
249 {
250         struct vmspace *vm;
251
252         vm = (struct vmspace *)mem;
253
254         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
255 }
256 static void
257 vm_map_zdtor(void *mem, int size, void *arg)
258 {
259         vm_map_t map;
260
261         map = (vm_map_t)mem;
262         KASSERT(map->nentries == 0,
263             ("map %p nentries == %d on free.",
264             map, map->nentries));
265         KASSERT(map->size == 0,
266             ("map %p size == %lu on free.",
267             map, (unsigned long)map->size));
268 }
269 #endif  /* INVARIANTS */
270
271 /*
272  * Allocate a vmspace structure, including a vm_map and pmap,
273  * and initialize those structures.  The refcnt is set to 1.
274  *
275  * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
276  */
277 struct vmspace *
278 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
279 {
280         struct vmspace *vm;
281
282         vm = uma_zalloc(vmspace_zone, M_WAITOK);
283         KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
284         if (!pinit(vmspace_pmap(vm))) {
285                 uma_zfree(vmspace_zone, vm);
286                 return (NULL);
287         }
288         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
289         _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
290         vm->vm_refcnt = 1;
291         vm->vm_shm = NULL;
292         vm->vm_swrss = 0;
293         vm->vm_tsize = 0;
294         vm->vm_dsize = 0;
295         vm->vm_ssize = 0;
296         vm->vm_taddr = 0;
297         vm->vm_daddr = 0;
298         vm->vm_maxsaddr = 0;
299         return (vm);
300 }
301
302 #ifdef RACCT
303 static void
304 vmspace_container_reset(struct proc *p)
305 {
306
307         PROC_LOCK(p);
308         racct_set(p, RACCT_DATA, 0);
309         racct_set(p, RACCT_STACK, 0);
310         racct_set(p, RACCT_RSS, 0);
311         racct_set(p, RACCT_MEMLOCK, 0);
312         racct_set(p, RACCT_VMEM, 0);
313         PROC_UNLOCK(p);
314 }
315 #endif
316
317 static inline void
318 vmspace_dofree(struct vmspace *vm)
319 {
320
321         CTR1(KTR_VM, "vmspace_free: %p", vm);
322
323         /*
324          * Make sure any SysV shm is freed, it might not have been in
325          * exit1().
326          */
327         shmexit(vm);
328
329         /*
330          * Lock the map, to wait out all other references to it.
331          * Delete all of the mappings and pages they hold, then call
332          * the pmap module to reclaim anything left.
333          */
334         (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
335             vm_map_max(&vm->vm_map));
336
337         pmap_release(vmspace_pmap(vm));
338         vm->vm_map.pmap = NULL;
339         uma_zfree(vmspace_zone, vm);
340 }
341
342 void
343 vmspace_free(struct vmspace *vm)
344 {
345
346         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
347             "vmspace_free() called");
348
349         if (vm->vm_refcnt == 0)
350                 panic("vmspace_free: attempt to free already freed vmspace");
351
352         if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
353                 vmspace_dofree(vm);
354 }
355
356 void
357 vmspace_exitfree(struct proc *p)
358 {
359         struct vmspace *vm;
360
361         PROC_VMSPACE_LOCK(p);
362         vm = p->p_vmspace;
363         p->p_vmspace = NULL;
364         PROC_VMSPACE_UNLOCK(p);
365         KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
366         vmspace_free(vm);
367 }
368
369 void
370 vmspace_exit(struct thread *td)
371 {
372         int refcnt;
373         struct vmspace *vm;
374         struct proc *p;
375
376         /*
377          * Release user portion of address space.
378          * This releases references to vnodes,
379          * which could cause I/O if the file has been unlinked.
380          * Need to do this early enough that we can still sleep.
381          *
382          * The last exiting process to reach this point releases as
383          * much of the environment as it can. vmspace_dofree() is the
384          * slower fallback in case another process had a temporary
385          * reference to the vmspace.
386          */
387
388         p = td->td_proc;
389         vm = p->p_vmspace;
390         atomic_add_int(&vmspace0.vm_refcnt, 1);
391         refcnt = vm->vm_refcnt;
392         do {
393                 if (refcnt > 1 && p->p_vmspace != &vmspace0) {
394                         /* Switch now since other proc might free vmspace */
395                         PROC_VMSPACE_LOCK(p);
396                         p->p_vmspace = &vmspace0;
397                         PROC_VMSPACE_UNLOCK(p);
398                         pmap_activate(td);
399                 }
400         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt - 1));
401         if (refcnt == 1) {
402                 if (p->p_vmspace != vm) {
403                         /* vmspace not yet freed, switch back */
404                         PROC_VMSPACE_LOCK(p);
405                         p->p_vmspace = vm;
406                         PROC_VMSPACE_UNLOCK(p);
407                         pmap_activate(td);
408                 }
409                 pmap_remove_pages(vmspace_pmap(vm));
410                 /* Switch now since this proc will free vmspace */
411                 PROC_VMSPACE_LOCK(p);
412                 p->p_vmspace = &vmspace0;
413                 PROC_VMSPACE_UNLOCK(p);
414                 pmap_activate(td);
415                 vmspace_dofree(vm);
416         }
417 #ifdef RACCT
418         if (racct_enable)
419                 vmspace_container_reset(p);
420 #endif
421 }
422
423 /* Acquire reference to vmspace owned by another process. */
424
425 struct vmspace *
426 vmspace_acquire_ref(struct proc *p)
427 {
428         struct vmspace *vm;
429         int refcnt;
430
431         PROC_VMSPACE_LOCK(p);
432         vm = p->p_vmspace;
433         if (vm == NULL) {
434                 PROC_VMSPACE_UNLOCK(p);
435                 return (NULL);
436         }
437         refcnt = vm->vm_refcnt;
438         do {
439                 if (refcnt <= 0) {      /* Avoid 0->1 transition */
440                         PROC_VMSPACE_UNLOCK(p);
441                         return (NULL);
442                 }
443         } while (!atomic_fcmpset_int(&vm->vm_refcnt, &refcnt, refcnt + 1));
444         if (vm != p->p_vmspace) {
445                 PROC_VMSPACE_UNLOCK(p);
446                 vmspace_free(vm);
447                 return (NULL);
448         }
449         PROC_VMSPACE_UNLOCK(p);
450         return (vm);
451 }
452
453 /*
454  * Switch between vmspaces in an AIO kernel process.
455  *
456  * The AIO kernel processes switch to and from a user process's
457  * vmspace while performing an I/O operation on behalf of a user
458  * process.  The new vmspace is either the vmspace of a user process
459  * obtained from an active AIO request or the initial vmspace of the
460  * AIO kernel process (when it is idling).  Because user processes
461  * will block to drain any active AIO requests before proceeding in
462  * exit() or execve(), the vmspace reference count for these vmspaces
463  * can never be 0.  This allows for a much simpler implementation than
464  * the loop in vmspace_acquire_ref() above.  Similarly, AIO kernel
465  * processes hold an extra reference on their initial vmspace for the
466  * life of the process so that this guarantee is true for any vmspace
467  * passed as 'newvm'.
468  */
469 void
470 vmspace_switch_aio(struct vmspace *newvm)
471 {
472         struct vmspace *oldvm;
473
474         /* XXX: Need some way to assert that this is an aio daemon. */
475
476         KASSERT(newvm->vm_refcnt > 0,
477             ("vmspace_switch_aio: newvm unreferenced"));
478
479         oldvm = curproc->p_vmspace;
480         if (oldvm == newvm)
481                 return;
482
483         /*
484          * Point to the new address space and refer to it.
485          */
486         curproc->p_vmspace = newvm;
487         atomic_add_int(&newvm->vm_refcnt, 1);
488
489         /* Activate the new mapping. */
490         pmap_activate(curthread);
491
492         /* Remove the daemon's reference to the old address space. */
493         KASSERT(oldvm->vm_refcnt > 1,
494             ("vmspace_switch_aio: oldvm dropping last reference"));
495         vmspace_free(oldvm);
496 }
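
/*
 * Illustrative calling pattern for an AIO kernel process (the variable
 * names are hypothetical; only vmspace_switch_aio() above is real):
 *
 *      struct vmspace *uservm; // vmspace taken from an active AIO request
 *      struct vmspace *aiovm;  // the daemon's own, initial vmspace
 *
 *      vmspace_switch_aio(uservm);     // do the I/O in the user's vmspace
 *      // ... copyin()/copyout() on behalf of the user process ...
 *      vmspace_switch_aio(aiovm);      // switch back when idling
 *
 * The reference manipulation inside vmspace_switch_aio() keeps both
 * vmspaces alive across the switch, per the guarantees described above.
 */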
497
498 void
499 _vm_map_lock(vm_map_t map, const char *file, int line)
500 {
501
502         if (map->system_map)
503                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
504         else
505                 sx_xlock_(&map->lock, file, line);
506         map->timestamp++;
507 }
508
509 static void
510 vm_map_process_deferred(void)
511 {
512         struct thread *td;
513         vm_map_entry_t entry, next;
514         vm_object_t object;
515
516         td = curthread;
517         entry = td->td_map_def_user;
518         td->td_map_def_user = NULL;
519         while (entry != NULL) {
520                 next = entry->next;
521                 if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
522                         /*
523                          * Decrement the object's writemappings and
524                          * possibly the vnode's v_writecount.
525                          */
526                         KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
527                             ("Submap with writecount"));
528                         object = entry->object.vm_object;
529                         KASSERT(object != NULL, ("No object for writecount"));
530                         vnode_pager_release_writecount(object, entry->start,
531                             entry->end);
532                 }
533                 vm_map_entry_deallocate(entry, FALSE);
534                 entry = next;
535         }
536 }
537
538 void
539 _vm_map_unlock(vm_map_t map, const char *file, int line)
540 {
541
542         if (map->system_map)
543                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
544         else {
545                 sx_xunlock_(&map->lock, file, line);
546                 vm_map_process_deferred();
547         }
548 }
549
550 void
551 _vm_map_lock_read(vm_map_t map, const char *file, int line)
552 {
553
554         if (map->system_map)
555                 mtx_lock_flags_(&map->system_mtx, 0, file, line);
556         else
557                 sx_slock_(&map->lock, file, line);
558 }
559
560 void
561 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
562 {
563
564         if (map->system_map)
565                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
566         else {
567                 sx_sunlock_(&map->lock, file, line);
568                 vm_map_process_deferred();
569         }
570 }
571
572 int
573 _vm_map_trylock(vm_map_t map, const char *file, int line)
574 {
575         int error;
576
577         error = map->system_map ?
578             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
579             !sx_try_xlock_(&map->lock, file, line);
580         if (error == 0)
581                 map->timestamp++;
582         return (error == 0);
583 }
584
585 int
586 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
587 {
588         int error;
589
590         error = map->system_map ?
591             !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
592             !sx_try_slock_(&map->lock, file, line);
593         return (error == 0);
594 }
595
596 /*
597  *      _vm_map_lock_upgrade:   [ internal use only ]
598  *
599  *      Tries to upgrade a read (shared) lock on the specified map to a write
600  *      (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
601  *      non-zero value if the upgrade fails.  If the upgrade fails, the map is
602  *      returned without a read or write lock held.
603  *
604  *      Requires that the map be read locked.
605  */
606 int
607 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
608 {
609         unsigned int last_timestamp;
610
611         if (map->system_map) {
612                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
613         } else {
614                 if (!sx_try_upgrade_(&map->lock, file, line)) {
615                         last_timestamp = map->timestamp;
616                         sx_sunlock_(&map->lock, file, line);
617                         vm_map_process_deferred();
618                         /*
619                          * If the map's timestamp does not change while the
620                          * map is unlocked, then the upgrade succeeds.
621                          */
622                         sx_xlock_(&map->lock, file, line);
623                         if (last_timestamp != map->timestamp) {
624                                 sx_xunlock_(&map->lock, file, line);
625                                 return (1);
626                         }
627                 }
628         }
629         map->timestamp++;
630         return (0);
631 }
632
633 void
634 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
635 {
636
637         if (map->system_map) {
638                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
639         } else
640                 sx_downgrade_(&map->lock, file, line);
641 }
642
643 /*
644  *      vm_map_locked:
645  *
646  *      Returns a non-zero value if the caller holds a write (exclusive) lock
647  *      on the specified map and the value "0" otherwise.
648  */
649 int
650 vm_map_locked(vm_map_t map)
651 {
652
653         if (map->system_map)
654                 return (mtx_owned(&map->system_mtx));
655         else
656                 return (sx_xlocked(&map->lock));
657 }
658
659 #ifdef INVARIANTS
660 static void
661 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
662 {
663
664         if (map->system_map)
665                 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
666         else
667                 sx_assert_(&map->lock, SA_XLOCKED, file, line);
668 }
669
670 #define VM_MAP_ASSERT_LOCKED(map) \
671     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
672
673 #ifdef DIAGNOSTIC
674 static int enable_vmmap_check = 1;
675 #else
676 static int enable_vmmap_check = 0;
677 #endif
678 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
679     &enable_vmmap_check, 0, "Enable vm map consistency checking");
680
681 static void
682 _vm_map_assert_consistent(vm_map_t map)
683 {
684         vm_map_entry_t entry;
685         vm_map_entry_t child;
686         vm_size_t max_left, max_right;
687
688         if (!enable_vmmap_check)
689                 return;
690
691         for (entry = map->header.next; entry != &map->header;
692             entry = entry->next) {
693                 KASSERT(entry->prev->end <= entry->start,
694                     ("map %p prev->end = %jx, start = %jx", map,
695                     (uintmax_t)entry->prev->end, (uintmax_t)entry->start));
696                 KASSERT(entry->start < entry->end,
697                     ("map %p start = %jx, end = %jx", map,
698                     (uintmax_t)entry->start, (uintmax_t)entry->end));
699                 KASSERT(entry->end <= entry->next->start,
700                     ("map %p end = %jx, next->start = %jx", map,
701                     (uintmax_t)entry->end, (uintmax_t)entry->next->start));
702                 KASSERT(entry->left == NULL ||
703                     entry->left->start < entry->start,
704                     ("map %p left->start = %jx, start = %jx", map,
705                     (uintmax_t)entry->left->start, (uintmax_t)entry->start));
706                 KASSERT(entry->right == NULL ||
707                     entry->start < entry->right->start,
708                     ("map %p start = %jx, right->start = %jx", map,
709                     (uintmax_t)entry->start, (uintmax_t)entry->right->start));
710                 child = entry->left;
711                 max_left = (child != NULL) ? child->max_free :
712                         entry->start - entry->prev->end;
713                 child = entry->right;
714                 max_right = (child != NULL) ? child->max_free :
715                         entry->next->start - entry->end;
716                 KASSERT(entry->max_free == MAX(max_left, max_right),
717                     ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
718                      (uintmax_t)entry->max_free,
719                      (uintmax_t)max_left, (uintmax_t)max_right));
720         }       
721 }
722
723 #define VM_MAP_ASSERT_CONSISTENT(map) \
724     _vm_map_assert_consistent(map)
725 #else
726 #define VM_MAP_ASSERT_LOCKED(map)
727 #define VM_MAP_ASSERT_CONSISTENT(map)
728 #endif /* INVARIANTS */
729
730 /*
731  *      _vm_map_unlock_and_wait:
732  *
733  *      Atomically releases the lock on the specified map and puts the calling
734  *      thread to sleep.  The calling thread will remain asleep until either
735  *      vm_map_wakeup() is performed on the map or the specified timeout is
736  *      exceeded.
737  *
738  *      WARNING!  This function does not perform deferred deallocations of
739  *      objects and map entries.  Therefore, the calling thread is expected to
740  *      reacquire the map lock after reawakening and later perform an ordinary
741  *      unlock operation, such as vm_map_unlock(), before completing its
742  *      operation on the map.
743  */
744 int
745 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
746 {
747
748         mtx_lock(&map_sleep_mtx);
749         if (map->system_map)
750                 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
751         else
752                 sx_xunlock_(&map->lock, file, line);
753         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
754             timo));
755 }
756
757 /*
758  *      vm_map_wakeup:
759  *
760  *      Awaken any threads that have slept on the map using
761  *      vm_map_unlock_and_wait().
762  */
763 void
764 vm_map_wakeup(vm_map_t map)
765 {
766
767         /*
768          * Acquire and release map_sleep_mtx to prevent a wakeup()
769          * from being performed (and lost) between the map unlock
770          * and the msleep() in _vm_map_unlock_and_wait().
771          */
772         mtx_lock(&map_sleep_mtx);
773         mtx_unlock(&map_sleep_mtx);
774         wakeup(&map->root);
775 }
776
777 void
778 vm_map_busy(vm_map_t map)
779 {
780
781         VM_MAP_ASSERT_LOCKED(map);
782         map->busy++;
783 }
784
785 void
786 vm_map_unbusy(vm_map_t map)
787 {
788
789         VM_MAP_ASSERT_LOCKED(map);
790         KASSERT(map->busy, ("vm_map_unbusy: not busy"));
791         if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
792                 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
793                 wakeup(&map->busy);
794         }
795 }
796
797 void 
798 vm_map_wait_busy(vm_map_t map)
799 {
800
801         VM_MAP_ASSERT_LOCKED(map);
802         while (map->busy) {
803                 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
804                 if (map->system_map)
805                         msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
806                 else
807                         sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
808         }
809         map->timestamp++;
810 }
811
812 long
813 vmspace_resident_count(struct vmspace *vmspace)
814 {
815         return pmap_resident_count(vmspace_pmap(vmspace));
816 }
817
818 /*
819  *      vm_map_create:
820  *
821  *      Creates and returns a new empty VM map with
822  *      the given physical map structure, and having
823  *      the given lower and upper address bounds.
824  */
825 vm_map_t
826 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
827 {
828         vm_map_t result;
829
830         result = uma_zalloc(mapzone, M_WAITOK);
831         CTR1(KTR_VM, "vm_map_create: %p", result);
832         _vm_map_init(result, pmap, min, max);
833         return (result);
834 }
835
836 /*
837  * Initialize an existing vm_map structure
838  * such as that in the vmspace structure.
839  */
840 static void
841 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
842 {
843
844         map->header.next = map->header.prev = &map->header;
845         map->header.eflags = MAP_ENTRY_HEADER;
846         map->needs_wakeup = FALSE;
847         map->system_map = 0;
848         map->pmap = pmap;
849         map->header.end = min;
850         map->header.start = max;
851         map->flags = 0;
852         map->root = NULL;
853         map->timestamp = 0;
854         map->busy = 0;
855         map->anon_loc = 0;
856 }
857
858 void
859 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
860 {
861
862         _vm_map_init(map, pmap, min, max);
863         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
864         sx_init(&map->lock, "user map");
865 }
866
867 /*
868  *      vm_map_entry_dispose:   [ internal use only ]
869  *
870  *      Inverse of vm_map_entry_create.
871  */
872 static void
873 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
874 {
875         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
876 }
877
878 /*
879  *      vm_map_entry_create:    [ internal use only ]
880  *
881  *      Allocates a VM map entry for insertion.
882  *      No entry fields are filled in.
883  */
884 static vm_map_entry_t
885 vm_map_entry_create(vm_map_t map)
886 {
887         vm_map_entry_t new_entry;
888
889         if (map->system_map)
890                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
891         else
892                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
893         if (new_entry == NULL)
894                 panic("vm_map_entry_create: kernel resources exhausted");
895         return (new_entry);
896 }
897
898 /*
899  *      vm_map_entry_set_behavior:
900  *
901  *      Set the expected access behavior, either normal, random, or
902  *      sequential.
903  */
904 static inline void
905 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
906 {
907         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
908             (behavior & MAP_ENTRY_BEHAV_MASK);
909 }
910
911 /*
912  *      vm_map_entry_set_max_free:
913  *
914  *      Set the max_free field in a vm_map_entry.
915  */
916 static inline void
917 vm_map_entry_set_max_free(vm_map_entry_t entry)
918 {
919         vm_map_entry_t child;
920         vm_size_t max_left, max_right;
921
922         child = entry->left;
923         max_left = (child != NULL) ? child->max_free :
924             entry->start - entry->prev->end;
925         child = entry->right;
926         max_right = (child != NULL) ? child->max_free :
927             entry->next->start - entry->end;
928         entry->max_free = MAX(max_left, max_right);
929 }
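
/*
 * Worked example (assumed values): if entry->left is NULL and the gap
 * below the entry, entry->start - entry->prev->end, is 0x1000, while
 * entry->right->max_free is 0x3000, then the code above sets
 * entry->max_free to MAX(0x1000, 0x3000) == 0x3000.
 */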
930
931 #define SPLAY_LEFT_STEP(root, y, rlist, test) do {      \
932         y = root->left;                                 \
933         if (y != NULL && (test)) {                      \
934                 /* Rotate right and make y root. */     \
935                 root->left = y->right;                  \
936                 y->right = root;                        \
937                 vm_map_entry_set_max_free(root);        \
938                 root = y;                               \
939                 y = root->left;                         \
940         }                                               \
941         /* Put root on rlist. */                        \
942         root->left = rlist;                             \
943         rlist = root;                                   \
944         root = y;                                       \
945 } while (0)
946
947 #define SPLAY_RIGHT_STEP(root, y, llist, test) do {     \
948         y = root->right;                                \
949         if (y != NULL && (test)) {                      \
950                 /* Rotate left and make y root. */      \
951                 root->right = y->left;                  \
952                 y->left = root;                         \
953                 vm_map_entry_set_max_free(root);        \
954                 root = y;                               \
955                 y = root->right;                        \
956         }                                               \
957         /* Put root on llist. */                        \
958         root->right = llist;                            \
959         llist = root;                                   \
960         root = y;                                       \
961 } while (0)
962
963 /*
964  * Walk down the tree until we find addr or a NULL pointer where addr would go,
965  * breaking off left and right subtrees of nodes less than, or greater than
966  * addr.  Treat pointers to nodes with max_free < length as NULL pointers.
967  * llist and rlist are the two sides in reverse order (bottom-up), with llist
968  * linked by the right pointer and rlist linked by the left pointer in the
969  * vm_map_entry.
970  */
971 static vm_map_entry_t
972 vm_map_splay_split(vm_offset_t addr, vm_size_t length,
973     vm_map_entry_t root, vm_map_entry_t *out_llist, vm_map_entry_t *out_rlist)
974 {
975         vm_map_entry_t llist, rlist;
976         vm_map_entry_t y;
977
978         llist = NULL;
979         rlist = NULL;
980         while (root != NULL && root->max_free >= length) {
981                 if (addr < root->start) {
982                         SPLAY_LEFT_STEP(root, y, rlist,
983                             y->max_free >= length && addr < y->start);
984                 } else if (addr >= root->end) {
985                         SPLAY_RIGHT_STEP(root, y, llist,
986                             y->max_free >= length && addr >= y->end);
987                 } else
988                         break;
989         }
990         *out_llist = llist;
991         *out_rlist = rlist;
992         return (root);
993 }
994
995 static void
996 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *iolist)
997 {
998         vm_map_entry_t rlist, y;
999
1000         root = root->right;
1001         rlist = *iolist;
1002         while (root != NULL)
1003                 SPLAY_LEFT_STEP(root, y, rlist, true);
1004         *iolist = rlist;
1005 }
1006
1007 static void
1008 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *iolist)
1009 {
1010         vm_map_entry_t llist, y;
1011
1012         root = root->left;
1013         llist = *iolist;
1014         while (root != NULL)
1015                 SPLAY_RIGHT_STEP(root, y, llist, true);
1016         *iolist = llist;
1017 }
1018
1019 /*
1020  * Walk back up the two spines, flip the pointers and set max_free.  The
1021  * subtrees of the root go at the bottom of llist and rlist.
1022  */
1023 static vm_map_entry_t
1024 vm_map_splay_merge(vm_map_entry_t root,
1025     vm_map_entry_t llist, vm_map_entry_t rlist,
1026     vm_map_entry_t ltree, vm_map_entry_t rtree)
1027 {
1028         vm_map_entry_t y;
1029
1030         while (llist != NULL) {
1031                 y = llist->right;
1032                 llist->right = ltree;
1033                 vm_map_entry_set_max_free(llist);
1034                 ltree = llist;
1035                 llist = y;
1036         }
1037         while (rlist != NULL) {
1038                 y = rlist->left;
1039                 rlist->left = rtree;
1040                 vm_map_entry_set_max_free(rlist);
1041                 rtree = rlist;
1042                 rlist = y;
1043         }
1044
1045         /*
1046          * Final assembly: add ltree and rtree as subtrees of root.
1047          */
1048         root->left = ltree;
1049         root->right = rtree;
1050         vm_map_entry_set_max_free(root);
1051
1052         return (root);
1053 }
1054
1055 /*
1056  *      vm_map_entry_splay:
1057  *
1058  *      The Sleator and Tarjan top-down splay algorithm with the
1059  *      following variation.  Max_free must be computed bottom-up, so
1060  *      on the downward pass, maintain the left and right spines in
1061  *      reverse order.  Then, make a second pass up each side to fix
1062  *      the pointers and compute max_free.  The time bound is O(log n)
1063  *      amortized.
1064  *
1065  *      The new root is the vm_map_entry containing "addr", or else an
1066  *      adjacent entry (lower if possible) if addr is not in the tree.
1067  *
1068  *      The map must be locked, and leaves it so.
1069  *
1070  *      Returns: the new root.
1071  */
1072 static vm_map_entry_t
1073 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
1074 {
1075         vm_map_entry_t llist, rlist;
1076
1077         root = vm_map_splay_split(addr, 0, root, &llist, &rlist);
1078         if (root != NULL) {
1079                 /* do nothing */
1080         } else if (llist != NULL) {
1081                 /*
1082                  * Recover the greatest node in the left
1083                  * subtree and make it the root.
1084                  */
1085                 root = llist;
1086                 llist = root->right;
1087                 root->right = NULL;
1088         } else if (rlist != NULL) {
1089                 /*
1090                  * Recover the least node in the right
1091                  * subtree and make it the root.
1092                  */
1093                 root = rlist;
1094                 rlist = root->left;
1095                 root->left = NULL;
1096         } else {
1097                 /* There is no root. */
1098                 return (NULL);
1099         }
1100         return (vm_map_splay_merge(root, llist, rlist,
1101             root->left, root->right));
1102 }
1103
1104 /*
1105  *      vm_map_entry_{un,}link:
1106  *
1107  *      Insert/remove entries from maps.
1108  */
1109 static void
1110 vm_map_entry_link(vm_map_t map,
1111                   vm_map_entry_t entry)
1112 {
1113         vm_map_entry_t llist, rlist, root;
1114
1115         CTR3(KTR_VM,
1116             "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1117             map->nentries, entry);
1118         VM_MAP_ASSERT_LOCKED(map);
1119         map->nentries++;
1120         root = map->root;
1121         root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1122         KASSERT(root == NULL,
1123             ("vm_map_entry_link: link object already mapped"));
1124         entry->prev = (llist == NULL) ? &map->header : llist;
1125         entry->next = (rlist == NULL) ? &map->header : rlist;
1126         entry->prev->next = entry->next->prev = entry;
1127         root = vm_map_splay_merge(entry, llist, rlist, NULL, NULL);
1128         map->root = entry;
1129         VM_MAP_ASSERT_CONSISTENT(map);
1130 }
1131
1132 enum unlink_merge_type {
1133         UNLINK_MERGE_PREV,
1134         UNLINK_MERGE_NONE,
1135         UNLINK_MERGE_NEXT
1136 };
1137
1138 static void
1139 vm_map_entry_unlink(vm_map_t map,
1140                     vm_map_entry_t entry,
1141                     enum unlink_merge_type op)
1142 {
1143         vm_map_entry_t llist, rlist, root, y;
1144
1145         VM_MAP_ASSERT_LOCKED(map);
1146         llist = entry->prev;
1147         rlist = entry->next;
1148         llist->next = rlist;
1149         rlist->prev = llist;
1150         root = map->root;
1151         root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1152         KASSERT(root != NULL,
1153             ("vm_map_entry_unlink: unlink object not mapped"));
1154
1155         switch (op) {
1156         case UNLINK_MERGE_PREV:
1157                 vm_map_splay_findprev(root, &llist);
1158                 llist->end = root->end;
1159                 y = root->right;
1160                 root = llist;
1161                 llist = root->right;
1162                 root->right = y;
1163                 break;
1164         case UNLINK_MERGE_NEXT:
1165                 vm_map_splay_findnext(root, &rlist);
1166                 rlist->start = root->start;
1167                 rlist->offset = root->offset;
1168                 y = root->left;
1169                 root = rlist;
1170                 rlist = root->left;
1171                 root->left = y;
1172                 break;
1173         case UNLINK_MERGE_NONE:
1174                 vm_map_splay_findprev(root, &llist);
1175                 vm_map_splay_findnext(root, &rlist);
1176                 if (llist != NULL) {
1177                         root = llist;
1178                         llist = root->right;
1179                         root->right = NULL;
1180                 } else if (rlist != NULL) {
1181                         root = rlist;
1182                         rlist = root->left;
1183                         root->left = NULL;
1184                 } else
1185                         root = NULL;
1186                 break;
1187         }
1188         if (root != NULL)
1189                 root = vm_map_splay_merge(root, llist, rlist,
1190                     root->left, root->right);
1191         map->root = root;
1192         VM_MAP_ASSERT_CONSISTENT(map);
1193         map->nentries--;
1194         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1195             map->nentries, entry);
1196 }
1197
1198 /*
1199  *      vm_map_entry_resize_free:
1200  *
1201  *      Recompute the amount of free space following a modified vm_map_entry
1202  *      and propagate those values up the tree.  Call this function after
1203  *      resizing a map entry in-place by changing the end value, without a
1204  *      call to vm_map_entry_link() or _unlink().
1205  *
1206  *      The map must be locked, and leaves it so.
1207  */
1208 static void
1209 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1210 {
1211         vm_map_entry_t llist, rlist, root;
1212
1213         VM_MAP_ASSERT_LOCKED(map);
1214         root = map->root;
1215         root = vm_map_splay_split(entry->start, 0, root, &llist, &rlist);
1216         KASSERT(root != NULL,
1217             ("vm_map_entry_resize_free: resize_free object not mapped"));
1218         vm_map_splay_findnext(root, &rlist);
1219         root->right = NULL;
1220         map->root = vm_map_splay_merge(root, llist, rlist,
1221             root->left, root->right);
1222         VM_MAP_ASSERT_CONSISTENT(map);
1223         CTR3(KTR_VM, "vm_map_entry_resize_free: map %p, nentries %d, entry %p", map,
1224             map->nentries, entry);
1225 }
1226
1227 /*
1228  *      vm_map_lookup_entry:    [ internal use only ]
1229  *
1230  *      Finds the map entry containing (or
1231  *      immediately preceding) the specified address
1232  *      in the given map; the entry is returned
1233  *      in the "entry" parameter.  The boolean
1234  *      result indicates whether the address is
1235  *      actually contained in the map.
1236  */
1237 boolean_t
1238 vm_map_lookup_entry(
1239         vm_map_t map,
1240         vm_offset_t address,
1241         vm_map_entry_t *entry)  /* OUT */
1242 {
1243         vm_map_entry_t cur, lbound;
1244         boolean_t locked;
1245
1246         /*
1247          * If the map is empty, then the map entry immediately preceding
1248          * "address" is the map's header.
1249          */
1250         cur = map->root;
1251         if (cur == NULL) {
1252                 *entry = &map->header;
1253                 return (FALSE);
1254         }
1255         if (address >= cur->start && cur->end > address) {
1256                 *entry = cur;
1257                 return (TRUE);
1258         }
1259         if ((locked = vm_map_locked(map)) ||
1260             sx_try_upgrade(&map->lock)) {
1261                 /*
1262                  * Splay requires a write lock on the map.  However, it only
1263                  * restructures the binary search tree; it does not otherwise
1264                  * change the map.  Thus, the map's timestamp need not change
1265                  * on a temporary upgrade.
1266                  */
1267                 map->root = cur = vm_map_entry_splay(address, cur);
1268                 VM_MAP_ASSERT_CONSISTENT(map);
1269                 if (!locked)
1270                         sx_downgrade(&map->lock);
1271
1272                 /*
1273                  * If "address" is contained within a map entry, the new root
1274                  * is that map entry.  Otherwise, the new root is a map entry
1275                  * immediately before or after "address".
1276                  */
1277                 if (address < cur->start) {
1278                         *entry = &map->header;
1279                         return (FALSE);
1280                 }
1281                 *entry = cur;
1282                 return (address < cur->end);
1283         }
1284         /*
1285          * Since the map is only locked for read access, perform a
1286          * standard binary search tree lookup for "address".
1287          */
1288         lbound = &map->header;
1289         do {
1290                 if (address < cur->start) {
1291                         cur = cur->left;
1292                 } else if (cur->end <= address) {
1293                         lbound = cur;
1294                         cur = cur->right;
1295                 } else {
1296                         *entry = cur;
1297                         return (TRUE);
1298                 }
1299         } while (cur != NULL);
1300         *entry = lbound;
1301         return (FALSE);
1302 }
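
/*
 * Illustrative caller-side sketch (names other than
 * vm_map_lookup_entry() are hypothetical; the caller must hold at
 * least a read lock on the map):
 *
 *      vm_map_entry_t entry;
 *
 *      if (vm_map_lookup_entry(map, addr, &entry)) {
 *              // addr lies within [entry->start, entry->end)
 *      } else {
 *              // entry is the closest preceding entry, or
 *              // &map->header if addr precedes every entry
 *      }
 */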
1303
1304 /*
1305  *      vm_map_insert:
1306  *
1307  *      Inserts the given whole VM object into the target
1308  *      map at the specified address range.  The object's
1309  *      size should match that of the address range.
1310  *
1311  *      Requires that the map be locked, and leaves it so.
1312  *
1313  *      If object is non-NULL, ref count must be bumped by caller
1314  *      prior to making call to account for the new entry.
1315  */
1316 int
1317 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1318     vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1319 {
1320         vm_map_entry_t new_entry, prev_entry, temp_entry;
1321         struct ucred *cred;
1322         vm_eflags_t protoeflags;
1323         vm_inherit_t inheritance;
1324
1325         VM_MAP_ASSERT_LOCKED(map);
1326         KASSERT(object != kernel_object ||
1327             (cow & MAP_COPY_ON_WRITE) == 0,
1328             ("vm_map_insert: kernel object and COW"));
1329         KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1330             ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1331         KASSERT((prot & ~max) == 0,
1332             ("prot %#x is not subset of max_prot %#x", prot, max));
1333
1334         /*
1335          * Check that the start and end points are not bogus.
1336          */
1337         if (start < vm_map_min(map) || end > vm_map_max(map) ||
1338             start >= end)
1339                 return (KERN_INVALID_ADDRESS);
1340
1341         /*
1342          * Find the entry prior to the proposed starting address; if it's part
1343          * of an existing entry, this range is bogus.
1344          */
1345         if (vm_map_lookup_entry(map, start, &temp_entry))
1346                 return (KERN_NO_SPACE);
1347
1348         prev_entry = temp_entry;
1349
1350         /*
1351          * Assert that the next entry doesn't overlap the end point.
1352          */
1353         if (prev_entry->next->start < end)
1354                 return (KERN_NO_SPACE);
1355
1356         if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1357             max != VM_PROT_NONE))
1358                 return (KERN_INVALID_ARGUMENT);
1359
1360         protoeflags = 0;
1361         if (cow & MAP_COPY_ON_WRITE)
1362                 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1363         if (cow & MAP_NOFAULT)
1364                 protoeflags |= MAP_ENTRY_NOFAULT;
1365         if (cow & MAP_DISABLE_SYNCER)
1366                 protoeflags |= MAP_ENTRY_NOSYNC;
1367         if (cow & MAP_DISABLE_COREDUMP)
1368                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1369         if (cow & MAP_STACK_GROWS_DOWN)
1370                 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1371         if (cow & MAP_STACK_GROWS_UP)
1372                 protoeflags |= MAP_ENTRY_GROWS_UP;
1373         if (cow & MAP_VN_WRITECOUNT)
1374                 protoeflags |= MAP_ENTRY_VN_WRITECNT;
1375         if ((cow & MAP_CREATE_GUARD) != 0)
1376                 protoeflags |= MAP_ENTRY_GUARD;
1377         if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1378                 protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1379         if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1380                 protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1381         if (cow & MAP_INHERIT_SHARE)
1382                 inheritance = VM_INHERIT_SHARE;
1383         else
1384                 inheritance = VM_INHERIT_DEFAULT;
1385
1386         cred = NULL;
1387         if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1388                 goto charged;
1389         if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1390             ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1391                 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1392                         return (KERN_RESOURCE_SHORTAGE);
1393                 KASSERT(object == NULL ||
1394                     (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1395                     object->cred == NULL,
1396                     ("overcommit: vm_map_insert o %p", object));
1397                 cred = curthread->td_ucred;
1398         }
1399
1400 charged:
1401         /* Expand the kernel pmap, if necessary. */
1402         if (map == kernel_map && end > kernel_vm_end)
1403                 pmap_growkernel(end);
1404         if (object != NULL) {
1405                 /*
1406                  * OBJ_ONEMAPPING must be cleared unless this mapping
1407                  * is trivially proven to be the only mapping for any
1408                  * of the object's pages.  (Object granularity
1409                  * reference counting is insufficient to recognize
1410                  * aliases with precision.)
1411                  */
1412                 VM_OBJECT_WLOCK(object);
1413                 if (object->ref_count > 1 || object->shadow_count != 0)
1414                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1415                 VM_OBJECT_WUNLOCK(object);
1416         } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1417             protoeflags &&
1418             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1419             prev_entry->end == start && (prev_entry->cred == cred ||
1420             (prev_entry->object.vm_object != NULL &&
1421             prev_entry->object.vm_object->cred == cred)) &&
1422             vm_object_coalesce(prev_entry->object.vm_object,
1423             prev_entry->offset,
1424             (vm_size_t)(prev_entry->end - prev_entry->start),
1425             (vm_size_t)(end - prev_entry->end), cred != NULL &&
1426             (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1427                 /*
1428                  * We were able to extend the object.  Determine if we
1429                  * can extend the previous map entry to include the
1430                  * new range as well.
1431                  */
1432                 if (prev_entry->inheritance == inheritance &&
1433                     prev_entry->protection == prot &&
1434                     prev_entry->max_protection == max &&
1435                     prev_entry->wired_count == 0) {
1436                         KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1437                             0, ("prev_entry %p has incoherent wiring",
1438                             prev_entry));
1439                         if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1440                                 map->size += end - prev_entry->end;
1441                         prev_entry->end = end;
1442                         vm_map_entry_resize_free(map, prev_entry);
1443                         vm_map_simplify_entry(map, prev_entry);
1444                         return (KERN_SUCCESS);
1445                 }
1446
1447                 /*
1448                  * If we can extend the object but cannot extend the
1449                  * map entry, we have to create a new map entry.  We
1450                  * must bump the ref count on the extended object to
1451                  * account for it.  object may be NULL.
1452                  */
1453                 object = prev_entry->object.vm_object;
1454                 offset = prev_entry->offset +
1455                     (prev_entry->end - prev_entry->start);
1456                 vm_object_reference(object);
1457                 if (cred != NULL && object != NULL && object->cred != NULL &&
1458                     !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1459                         /* Object already accounts for this uid. */
1460                         cred = NULL;
1461                 }
1462         }
1463         if (cred != NULL)
1464                 crhold(cred);
1465
1466         /*
1467          * Create a new entry
1468          */
1469         new_entry = vm_map_entry_create(map);
1470         new_entry->start = start;
1471         new_entry->end = end;
1472         new_entry->cred = NULL;
1473
1474         new_entry->eflags = protoeflags;
1475         new_entry->object.vm_object = object;
1476         new_entry->offset = offset;
1477
1478         new_entry->inheritance = inheritance;
1479         new_entry->protection = prot;
1480         new_entry->max_protection = max;
1481         new_entry->wired_count = 0;
1482         new_entry->wiring_thread = NULL;
1483         new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1484         new_entry->next_read = start;
1485
1486         KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1487             ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1488         new_entry->cred = cred;
1489
1490         /*
1491          * Insert the new entry into the list
1492          */
1493         vm_map_entry_link(map, new_entry);
1494         if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1495                 map->size += new_entry->end - new_entry->start;
1496
1497         /*
1498          * Try to coalesce the new entry with both the previous and next
1499          * entries in the list.  Previously, we only attempted to coalesce
1500          * with the previous entry when object is NULL.  Here, we handle the
1501          * other cases, which are less common.
1502          */
1503         vm_map_simplify_entry(map, new_entry);
1504
1505         if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1506                 vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1507                     end - start, cow & MAP_PREFAULT_PARTIAL);
1508         }
1509
1510         return (KERN_SUCCESS);
1511 }
1512
1513 /*
1514  *      vm_map_findspace:
1515  *
1516  *      Find the first fit (lowest VM address) for "length" free bytes
1517  *      beginning at address >= start in the given map.
1518  *
1519  *      In a vm_map_entry, "max_free" is the maximum amount of
1520  *      contiguous free space between an entry in its subtree and a
1521  *      neighbor of that entry.  This allows finding a free region in
1522  *      one path down the tree, so O(log n) amortized with splay
1523  *      trees.
1524  *
1525  *      The map must be locked, and leaves it so.
1526  *
1527  *      Returns: starting address if sufficient space,
1528  *               vm_map_max(map)-length+1 if insufficient space.
1529  */
1530 vm_offset_t
1531 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1532 {
1533         vm_map_entry_t llist, rlist, root, y;
1534         vm_size_t left_length;
1535
1536         /*
1537          * Request must fit within min/max VM address and must avoid
1538          * address wrap.
1539          */
1540         start = MAX(start, vm_map_min(map));
1541         if (start + length > vm_map_max(map) || start + length < start)
1542                 return (vm_map_max(map) - length + 1);
1543
1544         /* Empty tree means wide open address space. */
1545         if (map->root == NULL)
1546                 return (start);
1547
1548         /*
1549          * After splay, if start comes before root node, then there
1550          * must be a gap from start to the root.
1551          */
1552         root = vm_map_splay_split(start, length, map->root,
1553             &llist, &rlist);
1554         if (root != NULL)
1555                 start = root->end;
1556         else if (rlist != NULL) {
1557                 root = rlist;
1558                 rlist = root->left;
1559                 root->left = NULL;
1560         } else {
1561                 root = llist;
1562                 llist = root->right;
1563                 root->right = NULL;
1564         }
1565         map->root = vm_map_splay_merge(root, llist, rlist,
1566             root->left, root->right);
1567         VM_MAP_ASSERT_CONSISTENT(map);
1568         if (start + length <= root->start)
1569                 return (start);
1570
1571         /*
1572          * Root is the last node that might begin its gap before
1573          * start, and this is the last comparison where address
1574          * wrap might be a problem.
1575          */
1576         if (root->right == NULL &&
1577             start + length <= vm_map_max(map))
1578                 return (start);
1579
1580         /* With max_free, can immediately tell if no solution. */
1581         if (root->right == NULL || length > root->right->max_free)
1582                 return (vm_map_max(map) - length + 1);
1583
1584         /*
1585          * Splay for the least large-enough gap in the right subtree.
1586          */
1587         llist = NULL;
1588         rlist = NULL;
1589         for (left_length = 0; ;
1590              left_length = root->left != NULL ?
1591              root->left->max_free : root->start - llist->end) {
1592                 if (length <= left_length)
1593                         SPLAY_LEFT_STEP(root, y, rlist,
1594                             length <= (y->left != NULL ?
1595                             y->left->max_free : y->start - llist->end));
1596                 else
1597                         SPLAY_RIGHT_STEP(root, y, llist,
1598                             length > (y->left != NULL ?
1599                             y->left->max_free : y->start - root->end));
1600                 if (root == NULL)
1601                         break;
1602         }
1603         root = llist;
1604         llist = root->right;
1605         if ((y = rlist) == NULL)
1606                 root->right = NULL;
1607         else {
1608                 rlist = y->left;
1609                 y->left = NULL;
1610                 root->right = y->right;
1611         }
1612         root = vm_map_splay_merge(root, llist, rlist,
1613             root->left, root->right);
1614         if (y != NULL) {
1615                 y->right = root->right;
1616                 vm_map_entry_set_max_free(y);
1617                 root->right = y;
1618                 vm_map_entry_set_max_free(root);
1619         }
1620         map->root = root;
1621         VM_MAP_ASSERT_CONSISTENT(map);
1622         return (root->end);
1623 }
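
/*
 * Example of the return convention above, with illustrative values:
 * in a map where vm_map_min(map) == 0x10000 and vm_map_max(map) ==
 * 0xfffff, vm_map_findspace(map, 0, 0x2000) on an empty map returns
 * 0x10000, while an unsatisfiable request returns 0xfffff - 0x2000 + 1,
 * an address at which the request cannot fit.  Callers therefore detect
 * failure as in:
 *
 *	addr = vm_map_findspace(map, start, length);
 *	if (addr + length > vm_map_max(map))
 *		return (KERN_NO_SPACE);
 */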
1624
1625 int
1626 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1627     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1628     vm_prot_t max, int cow)
1629 {
1630         vm_offset_t end;
1631         int result;
1632
1633         end = start + length;
1634         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1635             object == NULL,
1636             ("vm_map_fixed: non-NULL backing object for stack"));
1637         vm_map_lock(map);
1638         VM_MAP_RANGE_CHECK(map, start, end);
1639         if ((cow & MAP_CHECK_EXCL) == 0)
1640                 vm_map_delete(map, start, end);
1641         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1642                 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1643                     prot, max, cow);
1644         } else {
1645                 result = vm_map_insert(map, object, offset, start, end,
1646                     prot, max, cow);
1647         }
1648         vm_map_unlock(map);
1649         return (result);
1650 }
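
/*
 * Illustrative use of vm_map_fixed(), with flag values chosen for the
 * example only: map "length" bytes of "obj" at the caller-chosen
 * address "addr", leaving any existing mappings in the range in place
 * instead of deleting them first:
 *
 *	rv = vm_map_fixed(map, obj, 0, addr, length,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
 *
 * Without MAP_CHECK_EXCL, any prior mappings in [addr, addr + length)
 * are deleted before the new entry is inserted, as the code above shows.
 */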
1651
1652 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
1653 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
1654
1655 static int cluster_anon = 1;
1656 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
1657     &cluster_anon, 0,
1658     "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
1659
1660 static bool
1661 clustering_anon_allowed(vm_offset_t addr)
1662 {
1663
1664         switch (cluster_anon) {
1665         case 0:
1666                 return (false);
1667         case 1:
1668                 return (addr == 0);
1669         case 2:
1670         default:
1671                 return (true);
1672         }
1673 }
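
/*
 * For example, setting the sysctl vm.cluster_anon to 2 requests
 * clustering of anonymous mappings even when a hint address is
 * supplied, while 0 disables clustering entirely; the default of 1
 * clusters only hint-less (addr == 0) requests, as implemented above.
 */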
1674
1675 static long aslr_restarts;
1676 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
1677     &aslr_restarts, 0,
1678     "Number of aslr failures");
1679
1680 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
1681
1682 /*
1683  * Searches for the specified amount of free space in the given map with the
1684  * specified alignment.  Performs an address-ordered, first-fit search from
1685  * the given address "*addr", with an optional upper bound "max_addr".  If the
1686  * parameter "alignment" is zero, then the alignment is computed from the
1687  * given (object, offset) pair so as to enable the greatest possible use of
1688  * superpage mappings.  Returns KERN_SUCCESS and the address of the free space
1689  * in "*addr" if successful.  Otherwise, returns KERN_NO_SPACE.
1690  *
1691  * The map must be locked.  Initially, there must be at least "length" bytes
1692  * of free space at the given address.
1693  */
1694 static int
1695 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1696     vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
1697     vm_offset_t alignment)
1698 {
1699         vm_offset_t aligned_addr, free_addr;
1700
1701         VM_MAP_ASSERT_LOCKED(map);
1702         free_addr = *addr;
1703         KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
1704             ("caller failed to provide space %d at address %p",
1705              (int)length, (void*)free_addr));
1706         for (;;) {
1707                 /*
1708                  * At the start of every iteration, the free space at address
1709                  * "*addr" is at least "length" bytes.
1710                  */
1711                 if (alignment == 0)
1712                         pmap_align_superpage(object, offset, addr, length);
1713                 else if ((*addr & (alignment - 1)) != 0) {
1714                         *addr &= ~(alignment - 1);
1715                         *addr += alignment;
1716                 }
1717                 aligned_addr = *addr;
1718                 if (aligned_addr == free_addr) {
1719                         /*
1720                          * Alignment did not change "*addr", so "*addr" must
1721                          * still provide sufficient free space.
1722                          */
1723                         return (KERN_SUCCESS);
1724                 }
1725
1726                 /*
1727                  * Test for address wrap on "*addr".  A wrapped "*addr" could
1728                  * be a valid address, in which case vm_map_findspace() cannot
1729                  * be relied upon to fail.
1730                  */
1731                 if (aligned_addr < free_addr)
1732                         return (KERN_NO_SPACE);
1733                 *addr = vm_map_findspace(map, aligned_addr, length);
1734                 if (*addr + length > vm_map_max(map) ||
1735                     (max_addr != 0 && *addr + length > max_addr))
1736                         return (KERN_NO_SPACE);
1737                 free_addr = *addr;
1738                 if (free_addr == aligned_addr) {
1739                         /*
1740                          * If a successful call to vm_map_findspace() did not
1741                          * change "*addr", then "*addr" must still be aligned
1742                          * and provide sufficient free space.
1743                          */
1744                         return (KERN_SUCCESS);
1745                 }
1746         }
1747 }
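
/*
 * The alignment step above rounds an unaligned "*addr" up to the next
 * multiple of the power-of-two alignment.  For example, with alignment
 * 0x10000 and *addr == 0x12345:
 *
 *	*addr &= ~(alignment - 1);	yields 0x10000
 *	*addr += alignment;		yields 0x20000
 *
 * after which vm_map_findspace() is consulted again to confirm that the
 * aligned address still has "length" bytes of free space.
 */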
1748
1749 /*
1750  *      vm_map_find finds an unallocated region in the target address
1751  *      map with the given length.  The search is defined to be
1752  *      first-fit from the specified address; the region found is
1753  *      returned in the same parameter.
1754  *
1755  *      If object is non-NULL, its reference count must be bumped by
1756  *      the caller prior to making the call, to account for the new entry.
1757  */
1758 int
1759 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1760             vm_offset_t *addr,  /* IN/OUT */
1761             vm_size_t length, vm_offset_t max_addr, int find_space,
1762             vm_prot_t prot, vm_prot_t max, int cow)
1763 {
1764         vm_offset_t alignment, curr_min_addr, min_addr;
1765         int gap, pidx, rv, try;
1766         bool cluster, en_aslr, update_anon;
1767
1768         KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1769             object == NULL,
1770             ("vm_map_find: non-NULL backing object for stack"));
1771         MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
1772             (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
1773         if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1774             (object->flags & OBJ_COLORED) == 0))
1775                 find_space = VMFS_ANY_SPACE;
1776         if (find_space >> 8 != 0) {
1777                 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1778                 alignment = (vm_offset_t)1 << (find_space >> 8);
1779         } else
1780                 alignment = 0;
1781         en_aslr = (map->flags & MAP_ASLR) != 0;
1782         update_anon = cluster = clustering_anon_allowed(*addr) &&
1783             (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
1784             find_space != VMFS_NO_SPACE && object == NULL &&
1785             (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
1786             MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
1787         curr_min_addr = min_addr = *addr;
1788         if (en_aslr && min_addr == 0 && !cluster &&
1789             find_space != VMFS_NO_SPACE &&
1790             (map->flags & MAP_ASLR_IGNSTART) != 0)
1791                 curr_min_addr = min_addr = vm_map_min(map);
1792         try = 0;
1793         vm_map_lock(map);
1794         if (cluster) {
1795                 curr_min_addr = map->anon_loc;
1796                 if (curr_min_addr == 0)
1797                         cluster = false;
1798         }
1799         if (find_space != VMFS_NO_SPACE) {
1800                 KASSERT(find_space == VMFS_ANY_SPACE ||
1801                     find_space == VMFS_OPTIMAL_SPACE ||
1802                     find_space == VMFS_SUPER_SPACE ||
1803                     alignment != 0, ("unexpected VMFS flag"));
1804 again:
1805                 /*
1806                  * When creating an anonymous mapping, try clustering
1807                  * with an existing anonymous mapping first.
1808                  *
1809                  * We make up to two attempts to find address space
1810                  * for a given find_space value.  The first attempt may
1811                  * apply randomization or may cluster with an existing
1812                  * anonymous mapping.  If this first attempt fails, the
1813                  * second performs a plain first-fit search of the
1814                  * available address space.
1815                  *
1816                  * If both attempts fail and find_space is
1817                  * VMFS_OPTIMAL_SPACE, fall back to VMFS_ANY_SPACE and
1818                  * re-enable clustering and randomization.
1819                  */
1820                 try++;
1821                 MPASS(try <= 2);
1822
1823                 if (try == 2) {
1824                         /*
1825                          * Second try: we failed either to find a
1826                          * suitable region for randomizing the
1827                          * allocation, or to cluster with an existing
1828                          * mapping.  Retry with free run.
1829                          */
1830                         curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
1831                             vm_map_min(map) : min_addr;
1832                         atomic_add_long(&aslr_restarts, 1);
1833                 }
1834
1835                 if (try == 1 && en_aslr && !cluster) {
1836                         /*
1837                          * Find space for allocation, including
1838                          * gap needed for later randomization.
1839                          */
1840                         pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
1841                             (find_space == VMFS_SUPER_SPACE || find_space ==
1842                             VMFS_OPTIMAL_SPACE) ? 1 : 0;
1843                         gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
1844                             (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
1845                             aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
1846                         *addr = vm_map_findspace(map, curr_min_addr,
1847                             length + gap * pagesizes[pidx]);
1848                         if (*addr + length + gap * pagesizes[pidx] >
1849                             vm_map_max(map))
1850                                 goto again;
1851                         /* And randomize the start address. */
1852                         *addr += (arc4random() % gap) * pagesizes[pidx];
1853                         if (max_addr != 0 && *addr + length > max_addr)
1854                                 goto again;
1855                 } else {
1856                         *addr = vm_map_findspace(map, curr_min_addr, length);
1857                         if (*addr + length > vm_map_max(map) ||
1858                             (max_addr != 0 && *addr + length > max_addr)) {
1859                                 if (cluster) {
1860                                         cluster = false;
1861                                         MPASS(try == 1);
1862                                         goto again;
1863                                 }
1864                                 rv = KERN_NO_SPACE;
1865                                 goto done;
1866                         }
1867                 }
1868
1869                 if (find_space != VMFS_ANY_SPACE &&
1870                     (rv = vm_map_alignspace(map, object, offset, addr, length,
1871                     max_addr, alignment)) != KERN_SUCCESS) {
1872                         if (find_space == VMFS_OPTIMAL_SPACE) {
1873                                 find_space = VMFS_ANY_SPACE;
1874                                 curr_min_addr = min_addr;
1875                                 cluster = update_anon;
1876                                 try = 0;
1877                                 goto again;
1878                         }
1879                         goto done;
1880                 }
1881         } else if ((cow & MAP_REMAP) != 0) {
1882                 if (*addr < vm_map_min(map) ||
1883                     *addr + length > vm_map_max(map) ||
1884                     *addr + length <= length) {
1885                         rv = KERN_INVALID_ADDRESS;
1886                         goto done;
1887                 }
1888                 vm_map_delete(map, *addr, *addr + length);
1889         }
1890         if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1891                 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
1892                     max, cow);
1893         } else {
1894                 rv = vm_map_insert(map, object, offset, *addr, *addr + length,
1895                     prot, max, cow);
1896         }
1897         if (rv == KERN_SUCCESS && update_anon)
1898                 map->anon_loc = *addr + length;
1899 done:
1900         vm_map_unlock(map);
1901         return (rv);
1902 }
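
/*
 * Illustrative call, with parameter values chosen for the example only:
 * search anywhere at or above a hint and map an object there, requesting
 * superpage-friendly placement when the backing object supports it:
 *
 *	vm_offset_t addr = hint;
 *	rv = vm_map_find(map, obj, 0, &addr, size, 0, VMFS_OPTIMAL_SPACE,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 *
 * When ASLR applies and assuming a 4 KB base page, the first try above
 * looks for a free run of size + gap * pagesizes[pidx] bytes (with
 * gap == 0x1000, a 16 MB window for pidx == 0) and then advances the
 * chosen start by a random whole number of pages within that window.
 */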
1903
1904 /*
1905  *      vm_map_find_min() is a variant of vm_map_find() that takes an
1906  *      additional parameter (min_addr) and treats the given address
1907  *      (*addr) differently.  Specifically, it treats *addr as a hint
1908  *      and not as the minimum address where the mapping is created.
1909  *
1910  *      This function works in two phases.  First, it tries to
1911  *      allocate above the hint.  If that fails and the hint is
1912  *      greater than min_addr, it performs a second pass, replacing
1913  *      the hint with min_addr as the minimum address for the
1914  *      allocation.
1915  */
1916 int
1917 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1918     vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
1919     vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
1920     int cow)
1921 {
1922         vm_offset_t hint;
1923         int rv;
1924
1925         hint = *addr;
1926         for (;;) {
1927                 rv = vm_map_find(map, object, offset, addr, length, max_addr,
1928                     find_space, prot, max, cow);
1929                 if (rv == KERN_SUCCESS || min_addr >= hint)
1930                         return (rv);
1931                 *addr = hint = min_addr;
1932         }
1933 }
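
/*
 * For example, with an illustrative hint of *addr == 0x30000000 and
 * min_addr == 0x10000, the first vm_map_find() call searches upward
 * from 0x30000000; only if that fails is the search retried once from
 * 0x10000.  A hint at or below min_addr gives up after the first
 * failure, since min_addr >= hint then holds.
 */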
1934
1935 /*
1936  * A map entry with any of the following flags set must not be merged with
1937  * another entry.
1938  */
1939 #define MAP_ENTRY_NOMERGE_MASK  (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
1940             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)
1941
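/*
 * Two neighboring entries may be merged only if neither carries a flag
 * from MAP_ENTRY_NOMERGE_MASK, they are adjacent in the address space,
 * they map the same object at contiguous offsets (or no object at all),
 * and their protections, inheritance, wiring and credentials all match.
 */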
1942 static bool
1943 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
1944 {
1945
1946         KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
1947             (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
1948             ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
1949             prev, entry));
1950         return (prev->end == entry->start &&
1951             prev->object.vm_object == entry->object.vm_object &&
1952             (prev->object.vm_object == NULL ||
1953             prev->offset + (prev->end - prev->start) == entry->offset) &&
1954             prev->eflags == entry->eflags &&
1955             prev->protection == entry->protection &&
1956             prev->max_protection == entry->max_protection &&
1957             prev->inheritance == entry->inheritance &&
1958             prev->wired_count == entry->wired_count &&
1959             prev->cred == entry->cred);
1960 }
1961
1962 static void
1963 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
1964 {
1965
1966         /*
1967          * If the backing object is a vnode object, vm_object_deallocate()
1968          * calls vrele().  However, vrele() does not lock the vnode because
1969          * the vnode has additional references.  Thus, the map lock can be
1970          * kept without causing a lock-order reversal with the vnode lock.
1971          *
1972          * Since we count the number of virtual page mappings in
1973          * object->un_pager.vnp.writemappings, the writemappings value
1974          * should not be adjusted when the entry is disposed of.
1975          */
1976         if (entry->object.vm_object != NULL)
1977                 vm_object_deallocate(entry->object.vm_object);
1978         if (entry->cred != NULL)
1979                 crfree(entry->cred);
1980         vm_map_entry_dispose(map, entry);
1981 }
1982
1983 /*
1984  *      vm_map_simplify_entry:
1985  *
1986  *      Simplify the given map entry by merging with either neighbor.  This
1987  *      routine also has the ability to merge with both neighbors.
1988  *
1989  *      The map must be locked.
1990  *
1991  *      This routine guarantees that the passed entry remains valid (though
1992  *      possibly extended).  When merging, this routine may delete one or
1993  *      both neighbors.
1994  */
1995 void
1996 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1997 {
1998         vm_map_entry_t next, prev;
1999
2000         if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) != 0)
2001                 return;
2002         prev = entry->prev;
2003         if (vm_map_mergeable_neighbors(prev, entry)) {
2004                 vm_map_entry_unlink(map, prev, UNLINK_MERGE_NEXT);
2005                 vm_map_merged_neighbor_dispose(map, prev);
2006         }
2007         next = entry->next;
2008         if (vm_map_mergeable_neighbors(entry, next)) {
2009                 vm_map_entry_unlink(map, next, UNLINK_MERGE_PREV);
2010                 vm_map_merged_neighbor_dispose(map, next);
2011         }
2012 }
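
/*
 * For example, with illustrative addresses, three adjacent entries
 * [0x2000, 0x3000), [0x3000, 0x5000) and [0x5000, 0x6000) that map the
 * same object at offsets 0x0, 0x1000 and 0x3000 and agree in all other
 * attributes collapse into a single entry [0x2000, 0x6000) at offset
 * 0x0; the passed entry survives and absorbs both neighbors.
 */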
2013
2014 /*
2015  *      vm_map_clip_start:      [ internal use only ]
2016  *
2017  *      Asserts that the given entry begins at or after
2018  *      the specified address; if necessary,
2019  *      it splits the entry into two.
2020  */
2021 #define vm_map_clip_start(map, entry, startaddr) \
2022 { \
2023         if (startaddr > entry->start) \
2024                 _vm_map_clip_start(map, entry, startaddr); \
2025 }
2026
2027 /*
2028  *      This routine is called only when it is known that
2029  *      the entry must be split.
2030  */
2031 static void
2032 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
2033 {
2034         vm_map_entry_t new_entry;
2035
2036         VM_MAP_ASSERT_LOCKED(map);
2037         KASSERT(entry->end > start && entry->start < start,
2038             ("_vm_map_clip_start: invalid clip of entry %p", entry));
2039
2040         /*
2041          * Split off the front portion -- note that we must insert the new
2042          * entry BEFORE this one, so that this entry has the specified
2043          * starting address.
2044          */
2045         vm_map_simplify_entry(map, entry);
2046
2047         /*
2048          * If there is no object backing this entry, we might as well create
2049          * one now.  If we defer it, an object can get created after the map
2050          * is clipped, and individual objects will be created for the split-up
2051          * map.  This is a bit of a hack, but is also about the best place to
2052          * put this improvement.
2053          */
2054         if (entry->object.vm_object == NULL && !map->system_map &&
2055             (entry->eflags & MAP_ENTRY_GUARD) == 0) {
2056                 vm_object_t object;
2057                 object = vm_object_allocate(OBJT_DEFAULT,
2058                                 atop(entry->end - entry->start));
2059                 entry->object.vm_object = object;
2060                 entry->offset = 0;
2061                 if (entry->cred != NULL) {
2062                         object->cred = entry->cred;
2063                         object->charge = entry->end - entry->start;
2064                         entry->cred = NULL;
2065                 }
2066         } else if (entry->object.vm_object != NULL &&
2067                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2068                    entry->cred != NULL) {
2069                 VM_OBJECT_WLOCK(entry->object.vm_object);
2070                 KASSERT(entry->object.vm_object->cred == NULL,
2071                     ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
2072                 entry->object.vm_object->cred = entry->cred;
2073                 entry->object.vm_object->charge = entry->end - entry->start;
2074                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2075                 entry->cred = NULL;
2076         }
2077
2078         new_entry = vm_map_entry_create(map);
2079         *new_entry = *entry;
2080
2081         new_entry->end = start;
2082         entry->offset += (start - entry->start);
2083         entry->start = start;
2084         if (new_entry->cred != NULL)
2085                 crhold(entry->cred);
2086
2087         vm_map_entry_link(map, new_entry);
2088
2089         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2090                 vm_object_reference(new_entry->object.vm_object);
2091                 /*
2092                  * The object->un_pager.vnp.writemappings for the
2093                  * object of a MAP_ENTRY_VN_WRITECNT entry shall be
2094                  * kept as is here.  The virtual pages are
2095                  * redistributed among the clipped entries, so the sum
2096                  * is left the same.
2097                  */
2098         }
2099 }
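
/*
 * For example, with illustrative addresses, clipping an entry that spans
 * [0x2000, 0x6000) with offset 0 at start == 0x4000 leaves the original
 * entry as [0x4000, 0x6000) with offset 0x2000 and links a new entry
 * [0x2000, 0x4000) with offset 0 in front of it.
 */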
2100
2101 /*
2102  *      vm_map_clip_end:        [ internal use only ]
2103  *
2104  *      Asserts that the given entry ends at or before
2105  *      the specified address; if necessary,
2106  *      it splits the entry into two.
2107  */
2108 #define vm_map_clip_end(map, entry, endaddr) \
2109 { \
2110         if ((endaddr) < (entry->end)) \
2111                 _vm_map_clip_end((map), (entry), (endaddr)); \
2112 }
2113
2114 /*
2115  *      This routine is called only when it is known that
2116  *      the entry must be split.
2117  */
2118 static void
2119 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
2120 {
2121         vm_map_entry_t new_entry;
2122
2123         VM_MAP_ASSERT_LOCKED(map);
2124         KASSERT(entry->start < end && entry->end > end,
2125             ("_vm_map_clip_end: invalid clip of entry %p", entry));
2126
2127         /*
2128          * If there is no object backing this entry, we might as well create
2129          * one now.  If we defer it, an object can get created after the map
2130          * is clipped, and individual objects will be created for the split-up
2131          * map.  This is a bit of a hack, but is also about the best place to
2132          * put this improvement.
2133          */
2134         if (entry->object.vm_object == NULL && !map->system_map &&
2135             (entry->eflags & MAP_ENTRY_GUARD) == 0) {
2136                 vm_object_t object;
2137                 object = vm_object_allocate(OBJT_DEFAULT,
2138                                 atop(entry->end - entry->start));
2139                 entry->object.vm_object = object;
2140                 entry->offset = 0;
2141                 if (entry->cred != NULL) {
2142                         object->cred = entry->cred;
2143                         object->charge = entry->end - entry->start;
2144                         entry->cred = NULL;
2145                 }
2146         } else if (entry->object.vm_object != NULL &&
2147                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2148                    entry->cred != NULL) {
2149                 VM_OBJECT_WLOCK(entry->object.vm_object);
2150                 KASSERT(entry->object.vm_object->cred == NULL,
2151                     ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
2152                 entry->object.vm_object->cred = entry->cred;
2153                 entry->object.vm_object->charge = entry->end - entry->start;
2154                 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2155                 entry->cred = NULL;
2156         }
2157
2158         /*
2159          * Create a new entry and insert it AFTER the specified entry
2160          */
2161         new_entry = vm_map_entry_create(map);
2162         *new_entry = *entry;
2163
2164         new_entry->start = entry->end = end;
2165         new_entry->offset += (end - entry->start);
2166         if (new_entry->cred != NULL)
2167                 crhold(entry->cred);
2168
2169         vm_map_entry_link(map, new_entry);
2170
2171         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2172                 vm_object_reference(new_entry->object.vm_object);
2173         }
2174 }
2175
2176 /*
2177  *      vm_map_submap:          [ kernel use only ]
2178  *
2179  *      Mark the given range as handled by a subordinate map.
2180  *
2181  *      This range must have been created with vm_map_find,
2182  *      and no other operations may have been performed on this
2183  *      range prior to calling vm_map_submap.
2184  *
2185  *      Only a limited number of operations can be performed
2186  *      within this range after calling vm_map_submap:
2187  *              vm_fault
2188  *      [Don't try vm_map_copy!]
2189  *
2190  *      To remove a submapping, one must first remove the
2191  *      range from the superior map, and then destroy the
2192  *      submap (if desired).  [Better yet, don't try it.]
2193  */
2194 int
2195 vm_map_submap(
2196         vm_map_t map,
2197         vm_offset_t start,
2198         vm_offset_t end,
2199         vm_map_t submap)
2200 {
2201         vm_map_entry_t entry;
2202         int result;
2203
2204         result = KERN_INVALID_ARGUMENT;
2205
2206         vm_map_lock(submap);
2207         submap->flags |= MAP_IS_SUB_MAP;
2208         vm_map_unlock(submap);
2209
2210         vm_map_lock(map);
2211
2212         VM_MAP_RANGE_CHECK(map, start, end);
2213
2214         if (vm_map_lookup_entry(map, start, &entry)) {
2215                 vm_map_clip_start(map, entry, start);
2216         } else
2217                 entry = entry->next;
2218
2219         vm_map_clip_end(map, entry, end);
2220
2221         if ((entry->start == start) && (entry->end == end) &&
2222             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2223             (entry->object.vm_object == NULL)) {
2224                 entry->object.sub_map = submap;
2225                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2226                 result = KERN_SUCCESS;
2227         }
2228         vm_map_unlock(map);
2229
2230         if (result != KERN_SUCCESS) {
2231                 vm_map_lock(submap);
2232                 submap->flags &= ~MAP_IS_SUB_MAP;
2233                 vm_map_unlock(submap);
2234         }
2235         return (result);
2236 }
2237
2238 /*
2239  * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2240  */
2241 #define MAX_INIT_PT     96
2242
2243 /*
2244  *      vm_map_pmap_enter:
2245  *
2246  *      Preload the specified map's pmap with mappings to the specified
2247  *      object's memory-resident pages.  No further physical pages are
2248  *      allocated, and no further virtual pages are retrieved from secondary
2249  *      storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
2250  *      limited number of page mappings are created at the low-end of the
2251  *      specified address range.  (For this purpose, a superpage mapping
2252  *      counts as one page mapping.)  Otherwise, all resident pages within
2253  *      the specified address range are mapped.
2254  */
2255 static void
2256 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2257     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2258 {
2259         vm_offset_t start;
2260         vm_page_t p, p_start;
2261         vm_pindex_t mask, psize, threshold, tmpidx;
2262
2263         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2264                 return;
2265         VM_OBJECT_RLOCK(object);
2266         if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2267                 VM_OBJECT_RUNLOCK(object);
2268                 VM_OBJECT_WLOCK(object);
2269                 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2270                         pmap_object_init_pt(map->pmap, addr, object, pindex,
2271                             size);
2272                         VM_OBJECT_WUNLOCK(object);
2273                         return;
2274                 }
2275                 VM_OBJECT_LOCK_DOWNGRADE(object);
2276         }
2277
2278         psize = atop(size);
2279         if (psize + pindex > object->size) {
2280                 if (object->size < pindex) {
2281                         VM_OBJECT_RUNLOCK(object);
2282                         return;
2283                 }
2284                 psize = object->size - pindex;
2285         }
2286
2287         start = 0;
2288         p_start = NULL;
2289         threshold = MAX_INIT_PT;
2290
2291         p = vm_page_find_least(object, pindex);
2292         /*
2293          * Assert: the variable p is either (1) the page with the
2294          * least pindex greater than or equal to the parameter pindex
2295          * or (2) NULL.
2296          */
2297         for (;
2298              p != NULL && (tmpidx = p->pindex - pindex) < psize;
2299              p = TAILQ_NEXT(p, listq)) {
2300                 /*
2301                  * Don't allow a madvise to blow away our really
2302                  * free pages by allocating pv entries.
2303                  */
2304                 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2305                     vm_page_count_severe()) ||
2306                     ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2307                     tmpidx >= threshold)) {
2308                         psize = tmpidx;
2309                         break;
2310                 }
2311                 if (p->valid == VM_PAGE_BITS_ALL) {
2312                         if (p_start == NULL) {
2313                                 start = addr + ptoa(tmpidx);
2314                                 p_start = p;
2315                         }
2316                         /* Jump ahead if a superpage mapping is possible. */
2317                         if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2318                             (pagesizes[p->psind] - 1)) == 0) {
2319                                 mask = atop(pagesizes[p->psind]) - 1;
2320                                 if (tmpidx + mask < psize &&
2321                                     vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2322                                         p += mask;
2323                                         threshold += mask;
2324                                 }
2325                         }
2326                 } else if (p_start != NULL) {
2327                         pmap_enter_object(map->pmap, start, addr +
2328                             ptoa(tmpidx), p_start, prot);
2329                         p_start = NULL;
2330                 }
2331         }
2332         if (p_start != NULL)
2333                 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2334                     p_start, prot);
2335         VM_OBJECT_RUNLOCK(object);
2336 }
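
/*
 * With MAP_PREFAULT_PARTIAL and assuming a 4 KB base page, the
 * MAX_INIT_PT threshold above limits preloading to roughly the first
 * 96 resident pages (384 KB) of the range; a superpage run counts as a
 * single mapping and raises the threshold, so superpage-backed objects
 * may prefault a larger span.
 */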
2337
2338 /*
2339  *      vm_map_protect:
2340  *
2341  *      Sets the protection of the specified address
2342  *      region in the target map.  If "set_max" is
2343  *      specified, the maximum protection is to be set;
2344  *      otherwise, only the current protection is affected.
2345  */
2346 int
2347 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2348                vm_prot_t new_prot, boolean_t set_max)
2349 {
2350         vm_map_entry_t current, entry, in_tran;
2351         vm_object_t obj;
2352         struct ucred *cred;
2353         vm_prot_t old_prot;
2354
2355         if (start == end)
2356                 return (KERN_SUCCESS);
2357
2358 again:
2359         in_tran = NULL;
2360         vm_map_lock(map);
2361
2362         /*
2363          * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
2364          * need to fault pages into the map and will drop the map lock while
2365          * doing so, and the VM object may end up in an inconsistent state if we
2366          * update the protection on the map entry in between faults.
2367          */
2368         vm_map_wait_busy(map);
2369
2370         VM_MAP_RANGE_CHECK(map, start, end);
2371
2372         if (vm_map_lookup_entry(map, start, &entry)) {
2373                 vm_map_clip_start(map, entry, start);
2374         } else {
2375                 entry = entry->next;
2376         }
2377
2378         /*
2379          * Make a first pass to check for protection violations.
2380          */
2381         for (current = entry; current->start < end; current = current->next) {
2382                 if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2383                         continue;
2384                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2385                         vm_map_unlock(map);
2386                         return (KERN_INVALID_ARGUMENT);
2387                 }
2388                 if ((new_prot & current->max_protection) != new_prot) {
2389                         vm_map_unlock(map);
2390                         return (KERN_PROTECTION_FAILURE);
2391                 }
2392                 if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2393                         in_tran = current;
2394         }
2395
2396         /*
2397          * Postpone the operation until all in-transition map entries
2398          * have stabilized.  An in-transition entry might already have
2399          * its pages wired and its wired_count incremented, but not yet
2400          * have the MAP_ENTRY_USER_WIRED flag set, and it may be visible
2401          * to other threads because the map lock is dropped.  In this
2402          * case we would miss our call to vm_fault_copy_entry().
2403          */
2404         if (in_tran != NULL) {
2405                 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2406                 vm_map_unlock_and_wait(map, 0);
2407                 goto again;
2408         }
2409
2410         /*
2411          * Do an accounting pass for private read-only mappings that
2412          * now will do copy-on-write due to the allowed write (e.g., a
2413          * debugger sets a breakpoint on a text segment).
2414          */
2415         for (current = entry; current->start < end; current = current->next) {
2417                 vm_map_clip_end(map, current, end);
2418
2419                 if (set_max ||
2420                     ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2421                     ENTRY_CHARGED(current) ||
2422                     (current->eflags & MAP_ENTRY_GUARD) != 0) {
2423                         continue;
2424                 }
2425
2426                 cred = curthread->td_ucred;
2427                 obj = current->object.vm_object;
2428
2429                 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2430                         if (!swap_reserve(current->end - current->start)) {
2431                                 vm_map_unlock(map);
2432                                 return (KERN_RESOURCE_SHORTAGE);
2433                         }
2434                         crhold(cred);
2435                         current->cred = cred;
2436                         continue;
2437                 }
2438
2439                 VM_OBJECT_WLOCK(obj);
2440                 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2441                         VM_OBJECT_WUNLOCK(obj);
2442                         continue;
2443                 }
2444
2445                 /*
2446                  * Charge for the whole object allocation now, since
2447                  * we cannot distinguish between non-charged and
2448                  * charged clipped mapping of the same object later.
2449                  */
2450                 KASSERT(obj->charge == 0,
2451                     ("vm_map_protect: object %p overcharged (entry %p)",
2452                     obj, current));
2453                 if (!swap_reserve(ptoa(obj->size))) {
2454                         VM_OBJECT_WUNLOCK(obj);
2455                         vm_map_unlock(map);
2456                         return (KERN_RESOURCE_SHORTAGE);
2457                 }
2458
2459                 crhold(cred);
2460                 obj->cred = cred;
2461                 obj->charge = ptoa(obj->size);
2462                 VM_OBJECT_WUNLOCK(obj);
2463         }
2464
2465         /*
2466          * Go back and fix up protections. [Note that clipping is not
2467          * necessary the second time.]
2468          */
2469         for (current = entry; current->start < end; current = current->next) {
2470                 if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2471                         continue;
2472
2473                 old_prot = current->protection;
2474
2475                 if (set_max)
2476                         current->protection =
2477                             (current->max_protection = new_prot) &
2478                             old_prot;
2479                 else
2480                         current->protection = new_prot;
2481
2482                 /*
2483                  * For user wired map entries, the normal lazy evaluation of
2484                  * write access upgrades through soft page faults is
2485                  * undesirable.  Instead, immediately copy any pages that are
2486                  * copy-on-write and enable write access in the physical map.
2487                  */
2488                 if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2489                     (current->protection & VM_PROT_WRITE) != 0 &&
2490                     (old_prot & VM_PROT_WRITE) == 0)
2491                         vm_fault_copy_entry(map, map, current, current, NULL);
2492
2493                 /*
2494                  * When restricting access, update the physical map.  Worry
2495                  * about copy-on-write here.
2496                  */
2497                 if ((old_prot & ~current->protection) != 0) {
2498 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2499                                                         VM_PROT_ALL)
2500                         pmap_protect(map->pmap, current->start,
2501                             current->end,
2502                             current->protection & MASK(current));
2503 #undef  MASK
2504                 }
2505                 vm_map_simplify_entry(map, current);
2506         }
2507         vm_map_unlock(map);
2508         return (KERN_SUCCESS);
2509 }
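
/*
 * Illustrative call, with flag values chosen for the example only:
 *
 *	rv = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *
 * downgrades the current protection of the range to read-only.  Passing
 * TRUE for set_max instead sets the maximum protection and clips the
 * current protection to the intersection of the new maximum and the old
 * current protection, as the final pass above does.
 */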
2510
2511 /*
2512  *      vm_map_madvise:
2513  *
2514  *      This routine traverses a process's map handling the madvise
2515  *      system call.  Advisories are classified as either those affecting
2516  *      the vm_map_entry structure, or those affecting the underlying
2517  *      objects.
2518  */
2519 int
2520 vm_map_madvise(
2521         vm_map_t map,
2522         vm_offset_t start,
2523         vm_offset_t end,
2524         int behav)
2525 {
2526         vm_map_entry_t current, entry;
2527         bool modify_map;
2528
2529         /*
2530          * Some madvise calls directly modify the vm_map_entry, in which case
2531          * we need to use an exclusive lock on the map and we need to perform
2532          * various clipping operations.  Otherwise we only need a read-lock
2533          * on the map.
2534          */
2535         switch(behav) {
2536         case MADV_NORMAL:
2537         case MADV_SEQUENTIAL:
2538         case MADV_RANDOM:
2539         case MADV_NOSYNC:
2540         case MADV_AUTOSYNC:
2541         case MADV_NOCORE:
2542         case MADV_CORE:
2543                 if (start == end)
2544                         return (0);
2545                 modify_map = true;
2546                 vm_map_lock(map);
2547                 break;
2548         case MADV_WILLNEED:
2549         case MADV_DONTNEED:
2550         case MADV_FREE:
2551                 if (start == end)
2552                         return (0);
2553                 modify_map = false;
2554                 vm_map_lock_read(map);
2555                 break;
2556         default:
2557                 return (EINVAL);
2558         }
2559
2560         /*
2561          * Locate starting entry and clip if necessary.
2562          */
2563         VM_MAP_RANGE_CHECK(map, start, end);
2564
2565         if (vm_map_lookup_entry(map, start, &entry)) {
2566                 if (modify_map)
2567                         vm_map_clip_start(map, entry, start);
2568         } else {
2569                 entry = entry->next;
2570         }
2571
2572         if (modify_map) {
2573                 /*
2574                  * madvise behaviors that are implemented in the vm_map_entry.
2575                  *
2576                  * We clip the vm_map_entry so that behavioral changes are
2577                  * limited to the specified address range.
2578                  */
2579                 for (current = entry; current->start < end;
2580                     current = current->next) {
2581                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2582                                 continue;
2583
2584                         vm_map_clip_end(map, current, end);
2585
2586                         switch (behav) {
2587                         case MADV_NORMAL:
2588                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2589                                 break;
2590                         case MADV_SEQUENTIAL:
2591                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2592                                 break;
2593                         case MADV_RANDOM:
2594                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2595                                 break;
2596                         case MADV_NOSYNC:
2597                                 current->eflags |= MAP_ENTRY_NOSYNC;
2598                                 break;
2599                         case MADV_AUTOSYNC:
2600                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2601                                 break;
2602                         case MADV_NOCORE:
2603                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2604                                 break;
2605                         case MADV_CORE:
2606                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2607                                 break;
2608                         default:
2609                                 break;
2610                         }
2611                         vm_map_simplify_entry(map, current);
2612                 }
2613                 vm_map_unlock(map);
2614         } else {
2615                 vm_pindex_t pstart, pend;
2616
2617                 /*
2618                  * madvise behaviors that are implemented in the underlying
2619                  * vm_object.
2620                  *
2621                  * Since we don't clip the vm_map_entry, we have to clip
2622                  * the vm_object pindex and count.
2623                  */
2624                 for (current = entry; current->start < end;
2625                     current = current->next) {
2626                         vm_offset_t useEnd, useStart;
2627
2628                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2629                                 continue;
2630
2631                         pstart = OFF_TO_IDX(current->offset);
2632                         pend = pstart + atop(current->end - current->start);
2633                         useStart = current->start;
2634                         useEnd = current->end;
2635
2636                         if (current->start < start) {
2637                                 pstart += atop(start - current->start);
2638                                 useStart = start;
2639                         }
2640                         if (current->end > end) {
2641                                 pend -= atop(current->end - end);
2642                                 useEnd = end;
2643                         }
2644
2645                         if (pstart >= pend)
2646                                 continue;
2647
2648                         /*
2649                          * Perform the pmap_advise() before clearing
2650                          * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2651                          * concurrent pmap operation, such as pmap_remove(),
2652                          * could clear a reference in the pmap and set
2653                          * PGA_REFERENCED on the page before the pmap_advise()
2654                          * had completed.  Consequently, the page would appear
2655                          * referenced based upon an old reference that
2656                          * occurred before this pmap_advise() ran.
2657                          */
2658                         if (behav == MADV_DONTNEED || behav == MADV_FREE)
2659                                 pmap_advise(map->pmap, useStart, useEnd,
2660                                     behav);
2661
2662                         vm_object_madvise(current->object.vm_object, pstart,
2663                             pend, behav);
2664
2665                         /*
2666                          * Pre-populate paging structures in the
2667                          * WILLNEED case.  For wired entries, the
2668                          * paging structures are already populated.
2669                          */
2670                         if (behav == MADV_WILLNEED &&
2671                             current->wired_count == 0) {
2672                                 vm_map_pmap_enter(map,
2673                                     useStart,
2674                                     current->protection,
2675                                     current->object.vm_object,
2676                                     pstart,
2677                                     ptoa(pend - pstart),
2678                                     MAP_PREFAULT_MADVISE
2679                                 );
2680                         }
2681                 }
2682                 vm_map_unlock_read(map);
2683         }
2684         return (0);
2685 }
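
/*
 * Unlike most routines in this file, vm_map_madvise() returns errno
 * values rather than KERN_* codes.  For example, the illustrative call
 *
 *	error = vm_map_madvise(map, start, end, MADV_WILLNEED);
 *
 * only read-locks the map, since MADV_WILLNEED is applied to the
 * underlying objects rather than to the map entries themselves.
 */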
2686
2687
2688 /*
2689  *      vm_map_inherit:
2690  *
2691  *      Sets the inheritance of the specified address
2692  *      range in the target map.  Inheritance
2693  *      affects how the map will be shared with
2694  *      child maps at the time of vmspace_fork.
2695  */
2696 int
2697 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2698                vm_inherit_t new_inheritance)
2699 {
2700         vm_map_entry_t entry;
2701         vm_map_entry_t temp_entry;
2702
2703         switch (new_inheritance) {
2704         case VM_INHERIT_NONE:
2705         case VM_INHERIT_COPY:
2706         case VM_INHERIT_SHARE:
2707         case VM_INHERIT_ZERO:
2708                 break;
2709         default:
2710                 return (KERN_INVALID_ARGUMENT);
2711         }
2712         if (start == end)
2713                 return (KERN_SUCCESS);
2714         vm_map_lock(map);
2715         VM_MAP_RANGE_CHECK(map, start, end);
2716         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2717                 entry = temp_entry;
2718                 vm_map_clip_start(map, entry, start);
2719         } else
2720                 entry = temp_entry->next;
2721         while (entry->start < end) {
2722                 vm_map_clip_end(map, entry, end);
2723                 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2724                     new_inheritance != VM_INHERIT_ZERO)
2725                         entry->inheritance = new_inheritance;
2726                 vm_map_simplify_entry(map, entry);
2727                 entry = entry->next;
2728         }
2729         vm_map_unlock(map);
2730         return (KERN_SUCCESS);
2731 }
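
/*
 * For example, marking a range so that a child shares it across fork:
 *
 *	rv = vm_map_inherit(map, start, end, VM_INHERIT_SHARE);
 *
 * Guard entries are skipped only for VM_INHERIT_ZERO, as the loop above
 * shows; the other inheritance values apply to guard entries as well.
 */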
2732
2733 /*
2734  *      vm_map_unwire:
2735  *
2736  *      Implements both kernel and user unwiring.
2737  */
2738 int
2739 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2740     int flags)
2741 {
2742         vm_map_entry_t entry, first_entry, tmp_entry;
2743         vm_offset_t saved_start;
2744         unsigned int last_timestamp;
2745         int rv;
2746         boolean_t need_wakeup, result, user_unwire;
2747
2748         if (start == end)
2749                 return (KERN_SUCCESS);
2750         user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2751         vm_map_lock(map);
2752         VM_MAP_RANGE_CHECK(map, start, end);
2753         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2754                 if (flags & VM_MAP_WIRE_HOLESOK)
2755                         first_entry = first_entry->next;
2756                 else {
2757                         vm_map_unlock(map);
2758                         return (KERN_INVALID_ADDRESS);
2759                 }
2760         }
2761         last_timestamp = map->timestamp;
2762         entry = first_entry;
2763         while (entry->start < end) {
2764                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2765                         /*
2766                          * We have not yet clipped the entry.
2767                          */
2768                         saved_start = (start >= entry->start) ? start :
2769                             entry->start;
2770                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2771                         if (vm_map_unlock_and_wait(map, 0)) {
2772                                 /*
2773                                  * Allow interruption of user unwiring?
2774                                  */
2775                         }
2776                         vm_map_lock(map);
2777                         if (last_timestamp+1 != map->timestamp) {
2778                                 /*
2779                                  * Look again for the entry because the map was
2780                                  * modified while it was unlocked.
2781                                  * Specifically, the entry may have been
2782                                  * clipped, merged, or deleted.
2783                                  */
2784                                 if (!vm_map_lookup_entry(map, saved_start,
2785                                     &tmp_entry)) {
2786                                         if (flags & VM_MAP_WIRE_HOLESOK)
2787                                                 tmp_entry = tmp_entry->next;
2788                                         else {
2789                                                 if (saved_start == start) {
2790                                                         /*
2791                                                          * First_entry has been deleted.
2792                                                          */
2793                                                         vm_map_unlock(map);
2794                                                         return (KERN_INVALID_ADDRESS);
2795                                                 }
2796                                                 end = saved_start;
2797                                                 rv = KERN_INVALID_ADDRESS;
2798                                                 goto done;
2799                                         }
2800                                 }
2801                                 if (entry == first_entry)
2802                                         first_entry = tmp_entry;
2803                                 else
2804                                         first_entry = NULL;
2805                                 entry = tmp_entry;
2806                         }
2807                         last_timestamp = map->timestamp;
2808                         continue;
2809                 }
2810                 vm_map_clip_start(map, entry, start);
2811                 vm_map_clip_end(map, entry, end);
2812                 /*
2813                  * Mark the entry in case the map lock is released.  (See
2814                  * above.)
2815                  */
2816                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2817                     entry->wiring_thread == NULL,
2818                     ("owned map entry %p", entry));
2819                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2820                 entry->wiring_thread = curthread;
2821                 /*
2822                  * Check the map for holes in the specified region.
2823                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2824                  */
2825                 if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
2826                     entry->end < end && entry->next->start > entry->end) {
2827                         end = entry->end;
2828                         rv = KERN_INVALID_ADDRESS;
2829                         goto done;
2830                 }
2831                 /*
2832                  * If system unwiring, require that the entry is system wired.
2833                  */
2834                 if (!user_unwire &&
2835                     vm_map_entry_system_wired_count(entry) == 0) {
2836                         end = entry->end;
2837                         rv = KERN_INVALID_ARGUMENT;
2838                         goto done;
2839                 }
2840                 entry = entry->next;
2841         }
2842         rv = KERN_SUCCESS;
2843 done:
2844         need_wakeup = FALSE;
2845         if (first_entry == NULL) {
2846                 result = vm_map_lookup_entry(map, start, &first_entry);
2847                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2848                         first_entry = first_entry->next;
2849                 else
2850                         KASSERT(result, ("vm_map_unwire: lookup failed"));
2851         }
2852         for (entry = first_entry; entry->start < end; entry = entry->next) {
2853                 /*
2854                  * If VM_MAP_WIRE_HOLESOK was specified, an empty
2855                  * space in the unwired region could have been mapped
2856                  * while the map lock was dropped for draining
2857                  * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2858                  * could be simultaneously wiring this new mapping
2859                  * entry.  Detect these cases and skip any entries
2860                  * that we did not mark as in transition.
2861                  */
2862                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2863                     entry->wiring_thread != curthread) {
2864                         KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2865                             ("vm_map_unwire: !HOLESOK and new/changed entry"));
2866                         continue;
2867                 }
2868
2869                 if (rv == KERN_SUCCESS && (!user_unwire ||
2870                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2871                         if (user_unwire)
2872                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2873                         if (entry->wired_count == 1)
2874                                 vm_map_entry_unwire(map, entry);
2875                         else
2876                                 entry->wired_count--;
2877                 }
2878                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2879                     ("vm_map_unwire: in-transition flag missing %p", entry));
2880                 KASSERT(entry->wiring_thread == curthread,
2881                     ("vm_map_unwire: alien wire %p", entry));
2882                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2883                 entry->wiring_thread = NULL;
2884                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2885                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2886                         need_wakeup = TRUE;
2887                 }
2888                 vm_map_simplify_entry(map, entry);
2889         }
2890         vm_map_unlock(map);
2891         if (need_wakeup)
2892                 vm_map_wakeup(map);
2893         return (rv);
2894 }
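/*
 * Usage sketch: a munlock(2)-style path might undo a user wiring of
 * [addr, addr + size) with a call along these lines, assuming the usual
 * VM_MAP_WIRE_* flag names from vm_map.h:
 *
 *      rv = vm_map_unwire(&curproc->p_vmspace->vm_map, addr, addr + size,
 *          VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 *
 * System unwirings omit VM_MAP_WIRE_USER, and callers that can tolerate
 * unmapped gaps in the range pass VM_MAP_WIRE_HOLESOK instead of NOHOLES.
 */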
2895
2896 /*
2897  *      vm_map_wire_entry_failure:
2898  *
2899  *      Handle a wiring failure on the given entry.
2900  *
2901  *      The map should be locked.
2902  */
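/*
 * The out-of-range wired_count of -1 assigned at the end of this function
 * is consumed by the cleanup pass in vm_map_wire(): an entry found there
 * with wired_count == -1 is simply reset to 0, since none of its pages
 * remain wired.
 */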
2903 static void
2904 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
2905     vm_offset_t failed_addr)
2906 {
2907
2908         VM_MAP_ASSERT_LOCKED(map);
2909         KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
2910             entry->wired_count == 1,
2911             ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
2912         KASSERT(failed_addr < entry->end,
2913             ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
2914
2915         /*
2916          * If any pages at the start of this entry were successfully wired,
2917          * then unwire them.
2918          */
2919         if (failed_addr > entry->start) {
2920                 pmap_unwire(map->pmap, entry->start, failed_addr);
2921                 vm_object_unwire(entry->object.vm_object, entry->offset,
2922                     failed_addr - entry->start, PQ_ACTIVE);
2923         }
2924
2925         /*
2926          * Assign an out-of-range value to represent the failure to wire this
2927          * entry.
2928          */
2929         entry->wired_count = -1;
2930 }
2931
2932 /*
2933  *      vm_map_wire:
2934  *
2935  *      Implements both kernel and user wiring.
2936  */
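/*
 * Usage sketch: an mlock(2)-style caller might wire the pages backing
 * [addr, addr + size) with something like the following, assuming the
 * usual VM_MAP_WIRE_* flag names from vm_map.h:
 *
 *      rv = vm_map_wire(&curproc->p_vmspace->vm_map, addr, addr + size,
 *          VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 *
 * Adding VM_MAP_WIRE_WRITE additionally requires VM_PROT_WRITE on the
 * wired entries; see the protection check below.
 */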
2937 int
2938 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2939     int flags)
2940 {
2941         vm_map_entry_t entry, first_entry, tmp_entry;
2942         vm_offset_t faddr, saved_end, saved_start;
2943         unsigned int last_timestamp;
2944         int rv;
2945         boolean_t need_wakeup, result, user_wire;
2946         vm_prot_t prot;
2947
2948         if (start == end)
2949                 return (KERN_SUCCESS);
2950         prot = 0;
2951         if (flags & VM_MAP_WIRE_WRITE)
2952                 prot |= VM_PROT_WRITE;
2953         user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2954         vm_map_lock(map);
2955         VM_MAP_RANGE_CHECK(map, start, end);
2956         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2957                 if (flags & VM_MAP_WIRE_HOLESOK)
2958                         first_entry = first_entry->next;
2959                 else {
2960                         vm_map_unlock(map);
2961                         return (KERN_INVALID_ADDRESS);
2962                 }
2963         }
2964         last_timestamp = map->timestamp;
2965         entry = first_entry;
2966         while (entry->start < end) {
2967                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2968                         /*
2969                          * We have not yet clipped the entry.
2970                          */
2971                         saved_start = (start >= entry->start) ? start :
2972                             entry->start;
2973                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2974                         if (vm_map_unlock_and_wait(map, 0)) {
2975                                 /*
2976                                  * Allow interruption of user wiring?
2977                                  */
2978                         }
2979                         vm_map_lock(map);
2980                         if (last_timestamp + 1 != map->timestamp) {
2981                                 /*
2982                                  * Look again for the entry because the map was
2983                                  * modified while it was unlocked.
2984                                  * Specifically, the entry may have been
2985                                  * clipped, merged, or deleted.
2986                                  */
2987                                 if (!vm_map_lookup_entry(map, saved_start,
2988                                     &tmp_entry)) {
2989                                         if (flags & VM_MAP_WIRE_HOLESOK)
2990                                                 tmp_entry = tmp_entry->next;
2991                                         else {
2992                                                 if (saved_start == start) {
2993                                                         /*
2994                                                          * first_entry has been deleted.
2995                                                          */
2996                                                         vm_map_unlock(map);
2997                                                         return (KERN_INVALID_ADDRESS);
2998                                                 }
2999                                                 end = saved_start;
3000                                                 rv = KERN_INVALID_ADDRESS;
3001                                                 goto done;
3002                                         }
3003                                 }
3004                                 if (entry == first_entry)
3005                                         first_entry = tmp_entry;
3006                                 else
3007                                         first_entry = NULL;
3008                                 entry = tmp_entry;
3009                         }
3010                         last_timestamp = map->timestamp;
3011                         continue;
3012                 }
3013                 vm_map_clip_start(map, entry, start);
3014                 vm_map_clip_end(map, entry, end);
3015                 /*
3016                  * Mark the entry in case the map lock is released.  (See
3017                  * above.)
3018                  */
3019                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3020                     entry->wiring_thread == NULL,
3021                     ("owned map entry %p", entry));
3022                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3023                 entry->wiring_thread = curthread;
3024                 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3025                     || (entry->protection & prot) != prot) {
3026                         entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3027                         if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
3028                                 end = entry->end;
3029                                 rv = KERN_INVALID_ADDRESS;
3030                                 goto done;
3031                         }
3032                         goto next_entry;
3033                 }
3034                 if (entry->wired_count == 0) {
3035                         entry->wired_count++;
3036                         saved_start = entry->start;
3037                         saved_end = entry->end;
3038
3039                         /*
3040                          * Release the map lock, relying on the in-transition
3041                          * mark.  Mark the map busy for fork.
3042                          */
3043                         vm_map_busy(map);
3044                         vm_map_unlock(map);
3045
3046                         faddr = saved_start;
3047                         do {
3048                                 /*
3049                                  * Simulate a fault to get the page and enter
3050                                  * it into the physical map.
3051                                  */
3052                                 if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
3053                                     VM_FAULT_WIRE)) != KERN_SUCCESS)
3054                                         break;
3055                         } while ((faddr += PAGE_SIZE) < saved_end);
3056                         vm_map_lock(map);
3057                         vm_map_unbusy(map);
3058                         if (last_timestamp + 1 != map->timestamp) {
3059                                 /*
3060                                  * Look again for the entry because the map was
3061                                  * modified while it was unlocked.  The entry
3062                                  * may have been clipped, but NOT merged or
3063                                  * deleted.
3064                                  */
3065                                 result = vm_map_lookup_entry(map, saved_start,
3066                                     &tmp_entry);
3067                                 KASSERT(result, ("vm_map_wire: lookup failed"));
3068                                 if (entry == first_entry)
3069                                         first_entry = tmp_entry;
3070                                 else
3071                                         first_entry = NULL;
3072                                 entry = tmp_entry;
3073                                 while (entry->end < saved_end) {
3074                                         /*
3075                                          * In case of failure, handle entries
3076                                          * that were not fully wired here;
3077                                          * fully wired entries are handled
3078                                          * later.
3079                                          */
3080                                         if (rv != KERN_SUCCESS &&
3081                                             faddr < entry->end)
3082                                                 vm_map_wire_entry_failure(map,
3083                                                     entry, faddr);
3084                                         entry = entry->next;
3085                                 }
3086                         }
3087                         last_timestamp = map->timestamp;
3088                         if (rv != KERN_SUCCESS) {
3089                                 vm_map_wire_entry_failure(map, entry, faddr);
3090                                 end = entry->end;
3091                                 goto done;
3092                         }
3093                 } else if (!user_wire ||
3094                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3095                         entry->wired_count++;
3096                 }
3097                 /*
3098                  * Check the map for holes in the specified region.
3099                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
3100                  */
3101         next_entry:
3102                 if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
3103                     entry->end < end && entry->next->start > entry->end) {
3104                         end = entry->end;
3105                         rv = KERN_INVALID_ADDRESS;
3106                         goto done;
3107                 }
3108                 entry = entry->next;
3109         }
3110         rv = KERN_SUCCESS;
3111 done:
3112         need_wakeup = FALSE;
3113         if (first_entry == NULL) {
3114                 result = vm_map_lookup_entry(map, start, &first_entry);
3115                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
3116                         first_entry = first_entry->next;
3117                 else
3118                         KASSERT(result, ("vm_map_wire: lookup failed"));
3119         }
3120         for (entry = first_entry; entry->start < end; entry = entry->next) {
3121                 /*
3122                  * If VM_MAP_WIRE_HOLESOK was specified, an empty
3123                  * space in the unwired region could have been mapped
3124                  * while the map lock was dropped for faulting in the
3125                  * pages or draining MAP_ENTRY_IN_TRANSITION.
3126                  * Moreover, another thread could be simultaneously
3127                  * wiring this new mapping entry.  Detect these cases
3128                  * and skip any entries that we did not mark as in transition.
3129                  */
3130                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3131                     entry->wiring_thread != curthread) {
3132                         KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
3133                             ("vm_map_wire: !HOLESOK and new/changed entry"));
3134                         continue;
3135                 }
3136
3137                 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
3138                         goto next_entry_done;
3139
3140                 if (rv == KERN_SUCCESS) {
3141                         if (user_wire)
3142                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
3143                 } else if (entry->wired_count == -1) {
3144                         /*
3145                          * Wiring failed on this entry.  Thus, unwiring is
3146                          * unnecessary.
3147                          */
3148                         entry->wired_count = 0;
3149                 } else if (!user_wire ||
3150                     (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3151                         /*
3152                          * Undo the wiring.  Wiring succeeded on this entry
3153                          * but failed on a later entry.  
3154                          */
3155                         if (entry->wired_count == 1)
3156                                 vm_map_entry_unwire(map, entry);
3157                         else
3158                                 entry->wired_count--;
3159                 }
3160         next_entry_done:
3161                 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3162                     ("vm_map_wire: in-transition flag missing %p", entry));
3163                 KASSERT(entry->wiring_thread == curthread,
3164                     ("vm_map_wire: alien wire %p", entry));
3165                 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3166                     MAP_ENTRY_WIRE_SKIPPED);
3167                 entry->wiring_thread = NULL;
3168                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3169                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3170                         need_wakeup = TRUE;
3171                 }
3172                 vm_map_simplify_entry(map, entry);
3173         }
3174         vm_map_unlock(map);
3175         if (need_wakeup)
3176                 vm_map_wakeup(map);
3177         return (rv);
3178 }
3179
3180 /*
3181  * vm_map_sync
3182  *
3183  * Push any dirty cached pages in the address range to their pager.
3184  * If syncio is TRUE, dirty pages are written synchronously.
3185  * If invalidate is TRUE, any cached pages are freed as well.
3186  *
3187  * If the size of the region from start to end is zero, we are
3188  * supposed to flush all modified pages within the region containing
3189  * start.  Unfortunately, a region can be split or coalesced with
3190  * neighboring regions, making it difficult to determine what the
3191  * original region was.  Therefore, we approximate this requirement by
3192  * flushing the current region containing start.
3193  *
3194  * Returns an error if any part of the specified range is not mapped.
3195  */
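/*
 * Usage sketch: an msync(2)-style caller might flush dirty pages in
 * [addr, addr + size) synchronously, without invalidating cached pages:
 *
 *      rv = vm_map_sync(map, addr, addr + size, TRUE, FALSE);
 *
 * Passing invalidate == TRUE additionally removes the range from the pmap
 * and frees cached pages, and is rejected for ranges that contain
 * user-wired entries.
 */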
3196 int
3197 vm_map_sync(
3198         vm_map_t map,
3199         vm_offset_t start,
3200         vm_offset_t end,
3201         boolean_t syncio,
3202         boolean_t invalidate)
3203 {
3204         vm_map_entry_t current;
3205         vm_map_entry_t entry;
3206         vm_size_t size;
3207         vm_object_t object;
3208         vm_ooffset_t offset;
3209         unsigned int last_timestamp;
3210         boolean_t failed;
3211
3212         vm_map_lock_read(map);
3213         VM_MAP_RANGE_CHECK(map, start, end);
3214         if (!vm_map_lookup_entry(map, start, &entry)) {
3215                 vm_map_unlock_read(map);
3216                 return (KERN_INVALID_ADDRESS);
3217         } else if (start == end) {
3218                 start = entry->start;
3219                 end = entry->end;
3220         }
3221         /*
3222          * Make a first pass to check for user-wired memory and holes.
3223          */
3224         for (current = entry; current->start < end; current = current->next) {
3225                 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
3226                         vm_map_unlock_read(map);
3227                         return (KERN_INVALID_ARGUMENT);
3228                 }
3229                 if (end > current->end &&
3230                     current->end != current->next->start) {
3231                         vm_map_unlock_read(map);
3232                         return (KERN_INVALID_ADDRESS);
3233                 }
3234         }
3235
3236         if (invalidate)
3237                 pmap_remove(map->pmap, start, end);
3238         failed = FALSE;
3239
3240         /*
3241          * Make a second pass, cleaning/uncaching pages from the indicated
3242          * objects as we go.
3243          */
3244         for (current = entry; current->start < end;) {
3245                 offset = current->offset + (start - current->start);
3246                 size = (end <= current->end ? end : current->end) - start;
3247                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
3248                         vm_map_t smap;
3249                         vm_map_entry_t tentry;
3250                         vm_size_t tsize;
3251
3252                         smap = current->object.sub_map;
3253                         vm_map_lock_read(smap);
3254                         (void) vm_map_lookup_entry(smap, offset, &tentry);
3255                         tsize = tentry->end - offset;
3256                         if (tsize < size)
3257                                 size = tsize;
3258                         object = tentry->object.vm_object;
3259                         offset = tentry->offset + (offset - tentry->start);
3260                         vm_map_unlock_read(smap);
3261                 } else {
3262                         object = current->object.vm_object;
3263                 }
3264                 vm_object_reference(object);
3265                 last_timestamp = map->timestamp;
3266                 vm_map_unlock_read(map);
3267                 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3268                         failed = TRUE;
3269                 start += size;
3270                 vm_object_deallocate(object);
3271                 vm_map_lock_read(map);
3272                 if (last_timestamp == map->timestamp ||
3273                     !vm_map_lookup_entry(map, start, &current))
3274                         current = current->next;
3275         }
3276
3277         vm_map_unlock_read(map);
3278         return (failed ? KERN_FAILURE : KERN_SUCCESS);
3279 }
3280
3281 /*
3282  *      vm_map_entry_unwire:    [ internal use only ]
3283  *
3284  *      Make the region specified by this entry pageable.
3285  *
3286  *      The map in question should be locked.
3287  *      [This is the reason for this routine's existence.]
3288  */
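/*
 * Within this file, vm_map_entry_unwire() is reached from vm_map_unwire(),
 * from the failure cleanup in vm_map_wire(), and from vm_map_delete(),
 * in each case with the map lock held.
 */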
3289 static void
3290 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3291 {
3292
3293         VM_MAP_ASSERT_LOCKED(map);
3294         KASSERT(entry->wired_count > 0,
3295             ("vm_map_entry_unwire: entry %p isn't wired", entry));
3296         pmap_unwire(map->pmap, entry->start, entry->end);
3297         vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
3298             entry->start, PQ_ACTIVE);
3299         entry->wired_count = 0;
3300 }
3301
3302 static void
3303 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3304 {
3305
3306         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3307                 vm_object_deallocate(entry->object.vm_object);
3308         uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3309 }
3310
3311 /*
3312  *      vm_map_entry_delete:    [ internal use only ]
3313  *
3314  *      Deallocate the given entry from the target map.
3315  */
3316 static void
3317 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3318 {
3319         vm_object_t object;
3320         vm_pindex_t offidxstart, offidxend, count, size1;
3321         vm_size_t size;
3322
3323         vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3324         object = entry->object.vm_object;
3325
3326         if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3327                 MPASS(entry->cred == NULL);
3328                 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3329                 MPASS(object == NULL);
3330                 vm_map_entry_deallocate(entry, map->system_map);
3331                 return;
3332         }
3333
3334         size = entry->end - entry->start;
3335         map->size -= size;
3336
3337         if (entry->cred != NULL) {
3338                 swap_release_by_cred(size, entry->cred);
3339                 crfree(entry->cred);
3340         }
3341
3342         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
3343             (object != NULL)) {
3344                 KASSERT(entry->cred == NULL || object->cred == NULL ||
3345                     (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3346                     ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3347                 count = atop(size);
3348                 offidxstart = OFF_TO_IDX(entry->offset);
3349                 offidxend = offidxstart + count;
3350                 VM_OBJECT_WLOCK(object);
3351                 if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
3352                     OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
3353                     object == kernel_object)) {
3354                         vm_object_collapse(object);
3355
3356                         /*
3357                          * The option OBJPR_NOTMAPPED can be passed here
3358                          * because vm_map_delete() already performed
3359                          * pmap_remove() on the only mapping to this range
3360                          * of pages. 
3361                          */
3362                         vm_object_page_remove(object, offidxstart, offidxend,
3363                             OBJPR_NOTMAPPED);
3364                         if (object->type == OBJT_SWAP)
3365                                 swap_pager_freespace(object, offidxstart,
3366                                     count);
3367                         if (offidxend >= object->size &&
3368                             offidxstart < object->size) {
3369                                 size1 = object->size;
3370                                 object->size = offidxstart;
3371                                 if (object->cred != NULL) {
3372                                         size1 -= object->size;
3373                                         KASSERT(object->charge >= ptoa(size1),
3374                                             ("object %p charge < 0", object));
3375                                         swap_release_by_cred(ptoa(size1),
3376                                             object->cred);
3377                                         object->charge -= ptoa(size1);
3378                                 }
3379                         }
3380                 }
3381                 VM_OBJECT_WUNLOCK(object);
3382         } else
3383                 entry->object.vm_object = NULL;
3384         if (map->system_map)
3385                 vm_map_entry_deallocate(entry, TRUE);
3386         else {
3387                 entry->next = curthread->td_map_def_user;
3388                 curthread->td_map_def_user = entry;
3389         }
3390 }
3391
3392 /*
3393  *      vm_map_delete:  [ internal use only ]
3394  *
3395  *      Deallocates the given address range from the target
3396  *      map.
3397  */
3398 int
3399 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3400 {
3401         vm_map_entry_t entry;
3402         vm_map_entry_t first_entry;
3403
3404         VM_MAP_ASSERT_LOCKED(map);
3405         if (start == end)
3406                 return (KERN_SUCCESS);
3407
3408         /*
3409          * Find the start of the region, and clip it
3410          */
3411         if (!vm_map_lookup_entry(map, start, &first_entry))
3412                 entry = first_entry->next;
3413         else {
3414                 entry = first_entry;
3415                 vm_map_clip_start(map, entry, start);
3416         }
3417
3418         /*
3419          * Step through all entries in this region
3420          */
3421         while (entry->start < end) {
3422                 vm_map_entry_t next;
3423
3424                 /*
3425                  * Wait for wiring or unwiring of an entry to complete.
3426                  * Also wait for any system wirings to disappear on
3427                  * user maps.
3428                  */
3429                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3430                     (vm_map_pmap(map) != kernel_pmap &&
3431                     vm_map_entry_system_wired_count(entry) != 0)) {
3432                         unsigned int last_timestamp;
3433                         vm_offset_t saved_start;
3434                         vm_map_entry_t tmp_entry;
3435
3436                         saved_start = entry->start;
3437                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3438                         last_timestamp = map->timestamp;
3439                         (void) vm_map_unlock_and_wait(map, 0);
3440                         vm_map_lock(map);
3441                         if (last_timestamp + 1 != map->timestamp) {
3442                                 /*
3443                                  * Look again for the entry because the map was
3444                                  * modified while it was unlocked.
3445                                  * Specifically, the entry may have been
3446                                  * clipped, merged, or deleted.
3447                                  */
3448                                 if (!vm_map_lookup_entry(map, saved_start,
3449                                                          &tmp_entry))
3450                                         entry = tmp_entry->next;
3451                                 else {
3452                                         entry = tmp_entry;
3453                                         vm_map_clip_start(map, entry,
3454                                                           saved_start);
3455                                 }
3456                         }
3457                         continue;
3458                 }
3459                 vm_map_clip_end(map, entry, end);
3460
3461                 next = entry->next;
3462
3463                 /*
3464                  * Unwire before removing addresses from the pmap; otherwise,
3465                  * unwiring will put the entries back in the pmap.
3466                  */
3467                 if (entry->wired_count != 0)
3468                         vm_map_entry_unwire(map, entry);
3469
3470                 /*
3471                  * Remove mappings for the pages, but only if the
3472                  * mappings could exist.  For instance, it does not
3473                  * make sense to call pmap_remove() for guard entries.
3474                  */
3475                 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
3476                     entry->object.vm_object != NULL)
3477                         pmap_remove(map->pmap, entry->start, entry->end);
3478
3479                 if (entry->end == map->anon_loc)
3480                         map->anon_loc = entry->start;
3481
3482                 /*
3483                  * Delete the entry only after removing all pmap
3484                  * entries pointing to its pages.  (Otherwise, its
3485                  * page frames may be reallocated, and any modify bits
3486                  * will be set in the wrong object!)
3487                  */
3488                 vm_map_entry_delete(map, entry);
3489                 entry = next;
3490         }
3491         return (KERN_SUCCESS);
3492 }
3493
3494 /*
3495  *      vm_map_remove:
3496  *
3497  *      Remove the given address range from the target map.
3498  *      This is the exported form of vm_map_delete.
3499  */
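/*
 * Usage sketch: a munmap(2)-style caller can drop the mappings covering
 * [addr, addr + size) in the current process with:
 *
 *      rv = vm_map_remove(&curproc->p_vmspace->vm_map, addr, addr + size);
 *
 * Unlike vm_map_delete(), no map lock is held on entry; vm_map_remove()
 * takes and releases the lock itself.
 */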
3500 int
3501 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3502 {
3503         int result;
3504
3505         vm_map_lock(map);
3506         VM_MAP_RANGE_CHECK(map, start, end);
3507         result = vm_map_delete(map, start, end);
3508         vm_map_unlock(map);
3509         return (result);
3510 }
3511
3512 /*
3513  *      vm_map_check_protection:
3514  *
3515  *      Assert that the target map allows the specified privilege on the
3516  *      entire address region given.  The entire region must be allocated.
3517  *
3518  *      WARNING!  This code does not and should not check whether the
3519  *      contents of the region are accessible.  For example, a smaller file
3520  *      might be mapped into a larger address space.
3521  *
3522  *      NOTE!  This code is also called by munmap().
3523  *
3524  *      The map must be locked.  A read lock is sufficient.
3525  */
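/*
 * Usage sketch: checking that an entire range is mapped readable, under a
 * read lock as the comment above allows:
 *
 *      vm_map_lock_read(map);
 *      ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
 *      vm_map_unlock_read(map);
 */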
3526 boolean_t
3527 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3528                         vm_prot_t protection)
3529 {
3530         vm_map_entry_t entry;
3531         vm_map_entry_t tmp_entry;
3532
3533         if (!vm_map_lookup_entry(map, start, &tmp_entry))
3534                 return (FALSE);
3535         entry = tmp_entry;
3536
3537         while (start < end) {
3538                 /*
3539                  * No holes allowed!
3540                  */
3541                 if (start < entry->start)
3542                         return (FALSE);
3543                 /*
3544                  * Check protection associated with entry.
3545                  */
3546                 if ((entry->protection & protection) != protection)
3547                         return (FALSE);
3548                 /* go to next entry */
3549                 start = entry->end;
3550                 entry = entry->next;
3551         }
3552         return (TRUE);
3553 }
3554
3555 /*
3556  *      vm_map_copy_entry:
3557  *
3558  *      Copies the contents of the source entry to the destination
3559  *      entry.  The entries *must* be aligned properly.
3560  */
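/*
 * Within this file, vm_map_copy_entry() is invoked from vmspace_fork() for
 * VM_INHERIT_COPY entries, with the new (destination) map locked.
 */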
3561 static void
3562 vm_map_copy_entry(
3563         vm_map_t src_map,
3564         vm_map_t dst_map,
3565         vm_map_entry_t src_entry,
3566         vm_map_entry_t dst_entry,
3567         vm_ooffset_t *fork_charge)
3568 {
3569         vm_object_t src_object;
3570         vm_map_entry_t fake_entry;
3571         vm_offset_t size;
3572         struct ucred *cred;
3573         int charged;
3574
3575         VM_MAP_ASSERT_LOCKED(dst_map);
3576
3577         if ((dst_entry->eflags | src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3578                 return;
3579
3580         if (src_entry->wired_count == 0 ||
3581             (src_entry->protection & VM_PROT_WRITE) == 0) {
3582                 /*
3583                  * If the source entry is marked needs_copy, it is already
3584                  * write-protected.
3585                  */
3586                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3587                     (src_entry->protection & VM_PROT_WRITE) != 0) {
3588                         pmap_protect(src_map->pmap,
3589                             src_entry->start,
3590                             src_entry->end,
3591                             src_entry->protection & ~VM_PROT_WRITE);
3592                 }
3593
3594                 /*
3595                  * Make a copy of the object.
3596                  */
3597                 size = src_entry->end - src_entry->start;
3598                 if ((src_object = src_entry->object.vm_object) != NULL) {
3599                         VM_OBJECT_WLOCK(src_object);
3600                         charged = ENTRY_CHARGED(src_entry);
3601                         if (src_object->handle == NULL &&
3602                             (src_object->type == OBJT_DEFAULT ||
3603                             src_object->type == OBJT_SWAP)) {
3604                                 vm_object_collapse(src_object);
3605                                 if ((src_object->flags & (OBJ_NOSPLIT |
3606                                     OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3607                                         vm_object_split(src_entry);
3608                                         src_object =
3609                                             src_entry->object.vm_object;
3610                                 }
3611                         }
3612                         vm_object_reference_locked(src_object);
3613                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3614                         if (src_entry->cred != NULL &&
3615                             !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3616                                 KASSERT(src_object->cred == NULL,
3617                                     ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3618                                      src_object));
3619                                 src_object->cred = src_entry->cred;
3620                                 src_object->charge = size;
3621                         }
3622                         VM_OBJECT_WUNLOCK(src_object);
3623                         dst_entry->object.vm_object = src_object;
3624                         if (charged) {
3625                                 cred = curthread->td_ucred;
3626                                 crhold(cred);
3627                                 dst_entry->cred = cred;
3628                                 *fork_charge += size;
3629                                 if (!(src_entry->eflags &
3630                                       MAP_ENTRY_NEEDS_COPY)) {
3631                                         crhold(cred);
3632                                         src_entry->cred = cred;
3633                                         *fork_charge += size;
3634                                 }
3635                         }
3636                         src_entry->eflags |= MAP_ENTRY_COW |
3637                             MAP_ENTRY_NEEDS_COPY;
3638                         dst_entry->eflags |= MAP_ENTRY_COW |
3639                             MAP_ENTRY_NEEDS_COPY;
3640                         dst_entry->offset = src_entry->offset;
3641                         if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3642                                 /*
3643                                  * MAP_ENTRY_VN_WRITECNT cannot
3644                                  * indicate write reference from
3645                                  * src_entry, since the entry is
3646                                  * marked as needs copy.  Allocate a
3647                                  * fake entry that is used to
3648                                  * decrement object->un_pager.vnp.writecount
3649                                  * at the appropriate time.  Attach
3650                                  * fake_entry to the deferred list.
3651                                  */
3652                                 fake_entry = vm_map_entry_create(dst_map);
3653                                 fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3654                                 src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3655                                 vm_object_reference(src_object);
3656                                 fake_entry->object.vm_object = src_object;
3657                                 fake_entry->start = src_entry->start;
3658                                 fake_entry->end = src_entry->end;
3659                                 fake_entry->next = curthread->td_map_def_user;
3660                                 curthread->td_map_def_user = fake_entry;
3661                         }
3662
3663                         pmap_copy(dst_map->pmap, src_map->pmap,
3664                             dst_entry->start, dst_entry->end - dst_entry->start,
3665                             src_entry->start);
3666                 } else {
3667                         dst_entry->object.vm_object = NULL;
3668                         dst_entry->offset = 0;
3669                         if (src_entry->cred != NULL) {
3670                                 dst_entry->cred = curthread->td_ucred;
3671                                 crhold(dst_entry->cred);
3672                                 *fork_charge += size;
3673                         }
3674                 }
3675         } else {
3676                 /*
3677                  * We don't want to make writeable wired pages copy-on-write.
3678                  * Immediately copy these pages into the new map by simulating
3679                  * page faults.  The new pages are pageable.
3680                  */
3681                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3682                     fork_charge);
3683         }
3684 }
3685
3686 /*
3687  * vmspace_map_entry_forked:
3688  * Update the newly-forked vmspace each time a map entry is inherited
3689  * or copied.  The values for vm_dsize and vm_tsize are approximate
3690  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3691  */
3692 static void
3693 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3694     vm_map_entry_t entry)
3695 {
3696         vm_size_t entrysize;
3697         vm_offset_t newend;
3698
3699         if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3700                 return;
3701         entrysize = entry->end - entry->start;
3702         vm2->vm_map.size += entrysize;
3703         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3704                 vm2->vm_ssize += btoc(entrysize);
3705         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3706             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3707                 newend = MIN(entry->end,
3708                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3709                 vm2->vm_dsize += btoc(newend - entry->start);
3710         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3711             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3712                 newend = MIN(entry->end,
3713                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3714                 vm2->vm_tsize += btoc(newend - entry->start);
3715         }
3716 }
3717
3718 /*
3719  * vmspace_fork:
3720  * Create a new process vmspace structure and vm_map
3721  * based on those of an existing process.  The new map
3722  * is based on the old map, according to the inheritance
3723  * values on the regions in that map.
3724  *
3725  * XXX It might be worth coalescing the entries added to the new vmspace.
3726  *
3727  * The source map must not be locked.
3728  */
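/*
 * Usage sketch: a fork path (p1 being the parent process) would typically
 * create the child vmspace and collect the copy-on-write charge roughly as
 * follows, with the caller expected to reserve fork_charge bytes of swap
 * against the child's credential afterwards (error handling and the swap
 * reservation itself omitted):
 *
 *      vm_ooffset_t fork_charge = 0;
 *      struct vmspace *vm2;
 *
 *      vm2 = vmspace_fork(p1->p_vmspace, &fork_charge);
 *      if (vm2 == NULL)
 *              return (ENOMEM);
 */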
3729 struct vmspace *
3730 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3731 {
3732         struct vmspace *vm2;
3733         vm_map_t new_map, old_map;
3734         vm_map_entry_t new_entry, old_entry;
3735         vm_object_t object;
3736         int error, locked;
3737         vm_inherit_t inh;
3738
3739         old_map = &vm1->vm_map;
3740         /* Copy immutable fields of vm1 to vm2. */
3741         vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
3742             pmap_pinit);
3743         if (vm2 == NULL)
3744                 return (NULL);
3745
3746         vm2->vm_taddr = vm1->vm_taddr;
3747         vm2->vm_daddr = vm1->vm_daddr;
3748         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3749         vm_map_lock(old_map);
3750         if (old_map->busy)
3751                 vm_map_wait_busy(old_map);
3752         new_map = &vm2->vm_map;
3753         locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3754         KASSERT(locked, ("vmspace_fork: lock failed"));
3755
3756         error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
3757         if (error != 0) {
3758                 sx_xunlock(&old_map->lock);
3759                 sx_xunlock(&new_map->lock);
3760                 vm_map_process_deferred();
3761                 vmspace_free(vm2);
3762                 return (NULL);
3763         }
3764
3765         new_map->anon_loc = old_map->anon_loc;
3766
3767         old_entry = old_map->header.next;
3768
3769         while (old_entry != &old_map->header) {
3770                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3771                         panic("vm_map_fork: encountered a submap");
3772
3773                 inh = old_entry->inheritance;
3774                 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3775                     inh != VM_INHERIT_NONE)
3776                         inh = VM_INHERIT_COPY;
3777
3778                 switch (inh) {
3779                 case VM_INHERIT_NONE:
3780                         break;
3781
3782                 case VM_INHERIT_SHARE:
3783                         /*
3784                          * Clone the entry, creating the shared object if necessary.
3785                          */
3786                         object = old_entry->object.vm_object;
3787                         if (object == NULL) {
3788                                 object = vm_object_allocate(OBJT_DEFAULT,
3789                                         atop(old_entry->end - old_entry->start));
3790                                 old_entry->object.vm_object = object;
3791                                 old_entry->offset = 0;
3792                                 if (old_entry->cred != NULL) {
3793                                         object->cred = old_entry->cred;
3794                                         object->charge = old_entry->end -
3795                                             old_entry->start;
3796                                         old_entry->cred = NULL;
3797                                 }
3798                         }
3799
3800                         /*
3801                          * Add the reference before calling vm_object_shadow
3802                          * to ensure that a shadow object is created.
3803                          */
3804                         vm_object_reference(object);
3805                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3806                                 vm_object_shadow(&old_entry->object.vm_object,
3807                                     &old_entry->offset,
3808                                     old_entry->end - old_entry->start);
3809                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3810                                 /* Transfer the second reference too. */
3811                                 vm_object_reference(
3812                                     old_entry->object.vm_object);
3813
3814                                 /*
3815                                  * As in vm_map_simplify_entry(), the
3816                                  * vnode lock will not be acquired in
3817                                  * this call to vm_object_deallocate().
3818                                  */
3819                                 vm_object_deallocate(object);
3820                                 object = old_entry->object.vm_object;
3821                         }
3822                         VM_OBJECT_WLOCK(object);
3823                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3824                         if (old_entry->cred != NULL) {
3825                                 KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3826                                 object->cred = old_entry->cred;
3827                                 object->charge = old_entry->end - old_entry->start;
3828                                 old_entry->cred = NULL;
3829                         }
3830
3831                         /*
3832                          * Assert the correct state of the vnode
3833                          * v_writecount while the object is locked, so
3834                          * that it does not have to be relocked later
3835                          * just for this assertion.
3836                          */
3837                         if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3838                             object->type == OBJT_VNODE) {
3839                                 KASSERT(((struct vnode *)object->handle)->
3840                                     v_writecount > 0,
3841                                     ("vmspace_fork: v_writecount %p", object));
3842                                 KASSERT(object->un_pager.vnp.writemappings > 0,
3843                                     ("vmspace_fork: vnp.writecount %p",
3844                                     object));
3845                         }
3846                         VM_OBJECT_WUNLOCK(object);
3847
3848                         /*
3849                          * Clone the entry, referencing the shared object.
3850                          */
3851                         new_entry = vm_map_entry_create(new_map);
3852                         *new_entry = *old_entry;
3853                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3854                             MAP_ENTRY_IN_TRANSITION);
3855                         new_entry->wiring_thread = NULL;
3856                         new_entry->wired_count = 0;
3857                         if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3858                                 vnode_pager_update_writecount(object,
3859                                     new_entry->start, new_entry->end);
3860                         }
3861
3862                         /*
3863                          * Insert the entry into the new map -- we know we're
3864                          * inserting at the end of the new map.
3865                          */
3866                         vm_map_entry_link(new_map, new_entry);
3867                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3868
3869                         /*
3870                          * Update the physical map
3871                          */
3872                         pmap_copy(new_map->pmap, old_map->pmap,
3873                             new_entry->start,
3874                             (old_entry->end - old_entry->start),
3875                             old_entry->start);
3876                         break;
3877
3878                 case VM_INHERIT_COPY:
3879                         /*
3880                          * Clone the entry and link into the map.
3881                          */
3882                         new_entry = vm_map_entry_create(new_map);
3883                         *new_entry = *old_entry;
3884                         /*
3885                          * Copied entry is COW over the old object.
3886                          */
3887                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3888                             MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3889                         new_entry->wiring_thread = NULL;
3890                         new_entry->wired_count = 0;
3891                         new_entry->object.vm_object = NULL;
3892                         new_entry->cred = NULL;
3893                         vm_map_entry_link(new_map, new_entry);
3894                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3895                         vm_map_copy_entry(old_map, new_map, old_entry,
3896                             new_entry, fork_charge);
3897                         break;
3898
3899                 case VM_INHERIT_ZERO:
3900                         /*
3901                          * Create a new anonymous mapping entry modelled from
3902                          * the old one.
3903                          */
3904                         new_entry = vm_map_entry_create(new_map);
3905                         memset(new_entry, 0, sizeof(*new_entry));
3906
3907                         new_entry->start = old_entry->start;
3908                         new_entry->end = old_entry->end;
3909                         new_entry->eflags = old_entry->eflags &
3910                             ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
3911                             MAP_ENTRY_VN_WRITECNT);
3912                         new_entry->protection = old_entry->protection;
3913                         new_entry->max_protection = old_entry->max_protection;
3914                         new_entry->inheritance = VM_INHERIT_ZERO;
3915
3916                         vm_map_entry_link(new_map, new_entry);
3917                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3918
3919                         new_entry->cred = curthread->td_ucred;
3920                         crhold(new_entry->cred);
3921                         *fork_charge += (new_entry->end - new_entry->start);
3922
3923                         break;
3924                 }
3925                 old_entry = old_entry->next;
3926         }
3927         /*
3928          * Use inlined vm_map_unlock() to postpone handling the deferred
3929          * map entries, which cannot be done until both old_map and
3930          * new_map locks are released.
3931          */
3932         sx_xunlock(&old_map->lock);
3933         sx_xunlock(&new_map->lock);
3934         vm_map_process_deferred();
3935
3936         return (vm2);
3937 }
3938
3939 /*
3940  * Create a process's stack for exec_new_vmspace().  This function is never
3941  * asked to wire the newly created stack.
3942  */
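/*
 * Usage sketch: exec-time setup of a downward-growing user stack of at most
 * maxssize bytes whose top sits at stack_top (both names illustrative)
 * might look like:
 *
 *      rv = vm_map_stack(&vmspace->vm_map, stack_top - maxssize, maxssize,
 *          VM_PROT_ALL, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 *
 * The grow direction is passed in through the cow argument, as
 * vm_map_stack_locked() below expects.
 */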
3943 int
3944 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3945     vm_prot_t prot, vm_prot_t max, int cow)
3946 {
3947         vm_size_t growsize, init_ssize;
3948         rlim_t vmemlim;
3949         int rv;
3950
3951         MPASS((map->flags & MAP_WIREFUTURE) == 0);
3952         growsize = sgrowsiz;
3953         init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3954         vm_map_lock(map);
3955         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
3956         /* If we would blow our VMEM resource limit, no go */
3957         if (map->size + init_ssize > vmemlim) {
3958                 rv = KERN_NO_SPACE;
3959                 goto out;
3960         }
3961         rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3962             max, cow);
3963 out:
3964         vm_map_unlock(map);
3965         return (rv);
3966 }
3967
3968 static int stack_guard_page = 1;
3969 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3970     &stack_guard_page, 0,
3971     "Specifies the number of guard pages for a stack that grows");
3972
3973 static int
3974 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3975     vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3976 {
3977         vm_map_entry_t new_entry, prev_entry;
3978         vm_offset_t bot, gap_bot, gap_top, top;
3979         vm_size_t init_ssize, sgp;
3980         int orient, rv;
3981
3982         /*
3983          * The stack orientation is piggybacked with the cow argument.
3984          * Extract it into orient and mask the cow argument so that we
3985          * don't pass it around further.
3986          */
3987         orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
3988         KASSERT(orient != 0, ("No stack grow direction"));
3989         KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
3990             ("bi-dir stack"));
3991
3992         if (addrbos < vm_map_min(map) ||
3993             addrbos + max_ssize > vm_map_max(map) ||
3994             addrbos + max_ssize <= addrbos)
3995                 return (KERN_INVALID_ADDRESS);
3996         sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
3997         if (sgp >= max_ssize)
3998                 return (KERN_INVALID_ARGUMENT);
3999
4000         init_ssize = growsize;
4001         if (max_ssize < init_ssize + sgp)
4002                 init_ssize = max_ssize - sgp;
4003
4004         /* If addr is already mapped, no go */
4005         if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4006                 return (KERN_NO_SPACE);
4007
4008         /*
4009          * If we can't accommodate max_ssize in the current mapping, no go.
4010          */
4011         if (prev_entry->next->start < addrbos + max_ssize)
4012                 return (KERN_NO_SPACE);
4013
4014         /*
4015          * We initially map a stack of only init_ssize.  We will grow as
4016          * needed later.  Depending on the orientation of the stack (i.e.
4017          * the grow direction) we either map at the top of the range, the
4018          * bottom of the range or in the middle.
4019          *
4020          * Note: we would normally expect prot and max to be VM_PROT_ALL,
4021          * and cow to be 0.  Possibly we should eliminate these as input
4022          * parameters, and just pass these values here in the insert call.
4023          */
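        /*
         * For example, in the grows-down case the resulting layout is:
         *
         *   addrbos              bot                           top
         *      |<----- gap ------>|<-------- init_ssize ------->|
         *      | stack gap entry  |        stack entry          |
         *      |<---------------- max_ssize ------------------->|
         *
         * with top == addrbos + max_ssize.  The gap entry is consumed
         * later by vm_map_growstack() as the stack grows toward addrbos.
         */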
4024         if (orient == MAP_STACK_GROWS_DOWN) {
4025                 bot = addrbos + max_ssize - init_ssize;
4026                 top = bot + init_ssize;
4027                 gap_bot = addrbos;
4028                 gap_top = bot;
4029         } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4030                 bot = addrbos;
4031                 top = bot + init_ssize;
4032                 gap_bot = top;
4033                 gap_top = addrbos + max_ssize;
4034         }
4035         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4036         if (rv != KERN_SUCCESS)
4037                 return (rv);
4038         new_entry = prev_entry->next;
4039         KASSERT(new_entry->end == top || new_entry->start == bot,
4040             ("Bad entry start/end for new stack entry"));
4041         KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4042             (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4043             ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4044         KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4045             (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4046             ("new entry lacks MAP_ENTRY_GROWS_UP"));
4047         rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4048             VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4049             MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4050         if (rv != KERN_SUCCESS)
4051                 (void)vm_map_delete(map, bot, top);
4052         return (rv);
4053 }
4054
4055 /*
4056  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
4057  * successfully grow the stack.
4058  */
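/*
 * The growth request normally originates in vm_map_lookup(): a fault that
 * lands in a MAP_ENTRY_GUARD entry carrying MAP_ENTRY_STACK_GAP_DN or
 * MAP_ENTRY_STACK_GAP_UP is forwarded here (see the VM_PROT_FAULT_LOOKUP
 * handling below), with gap_entry identifying the gap being consumed.
 */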
4059 static int
4060 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4061 {
4062         vm_map_entry_t stack_entry;
4063         struct proc *p;
4064         struct vmspace *vm;
4065         struct ucred *cred;
4066         vm_offset_t gap_end, gap_start, grow_start;
4067         size_t grow_amount, guard, max_grow;
4068         rlim_t lmemlim, stacklim, vmemlim;
4069         int rv, rv1;
4070         bool gap_deleted, grow_down, is_procstack;
4071 #ifdef notyet
4072         uint64_t limit;
4073 #endif
4074 #ifdef RACCT
4075         int error;
4076 #endif
4077
4078         p = curproc;
4079         vm = p->p_vmspace;
4080
4081         /*
4082          * Disallow stack growth when the access is performed by a
4083          * debugger or AIO daemon.  The reason is that the wrong
4084          * process's resource limits would be applied.
4085          */
4086         if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
4087                 return (KERN_FAILURE);
4088
4089         MPASS(!map->system_map);
4090
4091         guard = stack_guard_page * PAGE_SIZE;
4092         lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4093         stacklim = lim_cur(curthread, RLIMIT_STACK);
4094         vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4095 retry:
4096         /* If addr is not in a hole for a stack grow area, no need to grow. */
4097         if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4098                 return (KERN_FAILURE);
4099         if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4100                 return (KERN_SUCCESS);
4101         if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4102                 stack_entry = gap_entry->next;
4103                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4104                     stack_entry->start != gap_entry->end)
4105                         return (KERN_FAILURE);
4106                 grow_amount = round_page(stack_entry->start - addr);
4107                 grow_down = true;
4108         } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4109                 stack_entry = gap_entry->prev;
4110                 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4111                     stack_entry->end != gap_entry->start)
4112                         return (KERN_FAILURE);
4113                 grow_amount = round_page(addr + 1 - stack_entry->end);
4114                 grow_down = false;
4115         } else {
4116                 return (KERN_FAILURE);
4117         }
4118         max_grow = gap_entry->end - gap_entry->start;
4119         if (guard > max_grow)
4120                 return (KERN_NO_SPACE);
4121         max_grow -= guard;
4122         if (grow_amount > max_grow)
4123                 return (KERN_NO_SPACE);
4124
4125         /*
4126          * If this is the main process stack, see if we're over the stack
4127          * limit.
4128          */
4129         is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4130             addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4131         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4132                 return (KERN_NO_SPACE);
4133
4134 #ifdef RACCT
4135         if (racct_enable) {
4136                 PROC_LOCK(p);
4137                 if (is_procstack && racct_set(p, RACCT_STACK,
4138                     ctob(vm->vm_ssize) + grow_amount)) {
4139                         PROC_UNLOCK(p);
4140                         return (KERN_NO_SPACE);
4141                 }
4142                 PROC_UNLOCK(p);
4143         }
4144 #endif
4145
4146         grow_amount = roundup(grow_amount, sgrowsiz);
4147         if (grow_amount > max_grow)
4148                 grow_amount = max_grow;
4149         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4150                 grow_amount = trunc_page((vm_size_t)stacklim) -
4151                     ctob(vm->vm_ssize);
4152         }
4153
4154 #ifdef notyet
4155         PROC_LOCK(p);
4156         limit = racct_get_available(p, RACCT_STACK);
4157         PROC_UNLOCK(p);
4158         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4159                 grow_amount = limit - ctob(vm->vm_ssize);
4160 #endif
4161
4162         if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4163                 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4164                         rv = KERN_NO_SPACE;
4165                         goto out;
4166                 }
4167 #ifdef RACCT
4168                 if (racct_enable) {
4169                         PROC_LOCK(p);
4170                         if (racct_set(p, RACCT_MEMLOCK,
4171                             ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4172                                 PROC_UNLOCK(p);
4173                                 rv = KERN_NO_SPACE;
4174                                 goto out;
4175                         }
4176                         PROC_UNLOCK(p);
4177                 }
4178 #endif
4179         }
4180
4181         /* If we would blow our VMEM resource limit, no go */
4182         if (map->size + grow_amount > vmemlim) {
4183                 rv = KERN_NO_SPACE;
4184                 goto out;
4185         }
4186 #ifdef RACCT
4187         if (racct_enable) {
4188                 PROC_LOCK(p);
4189                 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4190                         PROC_UNLOCK(p);
4191                         rv = KERN_NO_SPACE;
4192                         goto out;
4193                 }
4194                 PROC_UNLOCK(p);
4195         }
4196 #endif
4197
4198         if (vm_map_lock_upgrade(map)) {
4199                 gap_entry = NULL;
4200                 vm_map_lock_read(map);
4201                 goto retry;
4202         }
4203
4204         if (grow_down) {
4205                 grow_start = gap_entry->end - grow_amount;
4206                 if (gap_entry->start + grow_amount == gap_entry->end) {
4207                         gap_start = gap_entry->start;
4208                         gap_end = gap_entry->end;
4209                         vm_map_entry_delete(map, gap_entry);
4210                         gap_deleted = true;
4211                 } else {
4212                         MPASS(gap_entry->start < gap_entry->end - grow_amount);
4213                         gap_entry->end -= grow_amount;
4214                         vm_map_entry_resize_free(map, gap_entry);
4215                         gap_deleted = false;
4216                 }
4217                 rv = vm_map_insert(map, NULL, 0, grow_start,
4218                     grow_start + grow_amount,
4219                     stack_entry->protection, stack_entry->max_protection,
4220                     MAP_STACK_GROWS_DOWN);
4221                 if (rv != KERN_SUCCESS) {
4222                         if (gap_deleted) {
4223                                 rv1 = vm_map_insert(map, NULL, 0, gap_start,
4224                                     gap_end, VM_PROT_NONE, VM_PROT_NONE,
4225                                     MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4226                                 MPASS(rv1 == KERN_SUCCESS);
4227                         } else {
4228                                 gap_entry->end += grow_amount;
4229                                 vm_map_entry_resize_free(map, gap_entry);
4230                         }
4231                 }
4232         } else {
4233                 grow_start = stack_entry->end;
4234                 cred = stack_entry->cred;
4235                 if (cred == NULL && stack_entry->object.vm_object != NULL)
4236                         cred = stack_entry->object.vm_object->cred;
4237                 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4238                         rv = KERN_NO_SPACE;
4239                 /* Grow the underlying object if applicable. */
4240                 else if (stack_entry->object.vm_object == NULL ||
4241                     vm_object_coalesce(stack_entry->object.vm_object,
4242                     stack_entry->offset,
4243                     (vm_size_t)(stack_entry->end - stack_entry->start),
4244                     (vm_size_t)grow_amount, cred != NULL)) {
4245                         if (gap_entry->start + grow_amount == gap_entry->end)
4246                                 vm_map_entry_delete(map, gap_entry);
4247                         else
4248                                 gap_entry->start += grow_amount;
4249                         stack_entry->end += grow_amount;
4250                         map->size += grow_amount;
4251                         vm_map_entry_resize_free(map, stack_entry);
4252                         rv = KERN_SUCCESS;
4253                 } else
4254                         rv = KERN_FAILURE;
4255         }
4256         if (rv == KERN_SUCCESS && is_procstack)
4257                 vm->vm_ssize += btoc(grow_amount);
4258
4259         /*
4260          * Heed the MAP_WIREFUTURE flag if it was set for this process.
4261          */
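        /*
         * vm_map_wire() acquires the map lock itself, so the exclusive
         * lock held here must be dropped around the call; the map is
         * then re-locked for read, which is what the caller expects.
         */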
4262         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4263                 vm_map_unlock(map);
4264                 vm_map_wire(map, grow_start, grow_start + grow_amount,
4265                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4266                 vm_map_lock_read(map);
4267         } else
4268                 vm_map_lock_downgrade(map);
4269
4270 out:
4271 #ifdef RACCT
4272         if (racct_enable && rv != KERN_SUCCESS) {
4273                 PROC_LOCK(p);
4274                 error = racct_set(p, RACCT_VMEM, map->size);
4275                 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4276                 if (!old_mlock) {
4277                         error = racct_set(p, RACCT_MEMLOCK,
4278                             ptoa(pmap_wired_count(map->pmap)));
4279                         KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4280                 }
4281                 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4282                 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4283                 PROC_UNLOCK(p);
4284         }
4285 #endif
4286
4287         return (rv);
4288 }
4289
4290 /*
4291  * Unshare the specified VM space for exec.  If other processes still
4292  * share it, then create a new one.  The new vmspace is empty.
4293  */
4294 int
4295 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4296 {
4297         struct vmspace *oldvmspace = p->p_vmspace;
4298         struct vmspace *newvmspace;
4299
4300         KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4301             ("vmspace_exec recursed"));
4302         newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4303         if (newvmspace == NULL)
4304                 return (ENOMEM);
4305         newvmspace->vm_swrss = oldvmspace->vm_swrss;
4306         /*
4307          * This code is written like this for prototype purposes.  The
4308          * goal is to avoid running down the vmspace here, but to let the
4309          * other processes that are still using the vmspace finally
4310          * run it down.  Even though there is little or no chance of blocking
4311          * here, it is a good idea to keep this form for future mods.
4312          */
4313         PROC_VMSPACE_LOCK(p);
4314         p->p_vmspace = newvmspace;
4315         PROC_VMSPACE_UNLOCK(p);
4316         if (p == curthread->td_proc)
4317                 pmap_activate(curthread);
4318         curthread->td_pflags |= TDP_EXECVMSPC;
4319         return (0);
4320 }
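/*
 * Illustrative only: the exec code that called vmspace_exec() is expected
 * to release the old vmspace once the new image is committed, clearing
 * the flag set above.  The names below are a sketch and may not match the
 * actual call site exactly.
 *
 *      if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
 *              vmspace_free(oldvmspace);
 *              td->td_pflags &= ~TDP_EXECVMSPC;
 *      }
 */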
4321
4322 /*
4323  * Unshare the specified VM space for forcing COW.  This
4324  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4325  */
4326 int
4327 vmspace_unshare(struct proc *p)
4328 {
4329         struct vmspace *oldvmspace = p->p_vmspace;
4330         struct vmspace *newvmspace;
4331         vm_ooffset_t fork_charge;
4332
4333         if (oldvmspace->vm_refcnt == 1)
4334                 return (0);
4335         fork_charge = 0;
4336         newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4337         if (newvmspace == NULL)
4338                 return (ENOMEM);
4339         if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4340                 vmspace_free(newvmspace);
4341                 return (ENOMEM);
4342         }
4343         PROC_VMSPACE_LOCK(p);
4344         p->p_vmspace = newvmspace;
4345         PROC_VMSPACE_UNLOCK(p);
4346         if (p == curthread->td_proc)
4347                 pmap_activate(curthread);
4348         vmspace_free(oldvmspace);
4349         return (0);
4350 }
4351
4352 /*
4353  *      vm_map_lookup:
4354  *
4355  *      Finds the VM object, offset, and
4356  *      protection for a given virtual address in the
4357  *      specified map, assuming a page fault of the
4358  *      type specified.
4359  *
4360  *      Leaves the map in question locked for read; return
4361  *      values are guaranteed until a vm_map_lookup_done
4362  *      call is performed.  Note that the map argument
4363  *      is in/out; the returned map must be used in
4364  *      the call to vm_map_lookup_done.
4365  *
4366  *      A handle (out_entry) is returned for use in
4367  *      vm_map_lookup_done, to make that fast.
4368  *
4369  *      If a lookup is requested with "write protection"
4370  *      specified, the map may be changed to perform virtual
4371  *      copying operations, although the data referenced will
4372  *      remain the same.
4373  */
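/*
 * Typical usage, loosely modeled on the page fault handler (variable
 * names here are illustrative, not a definitive call site):
 *
 *      result = vm_map_lookup(&map, vaddr, fault_type, &entry,
 *          &object, &pindex, &prot, &wired);
 *      if (result != KERN_SUCCESS)
 *              return (result);
 *      ... fault in the page at pindex from object, honoring prot ...
 *      vm_map_lookup_done(map, entry);
 *
 * Note that the lookup may change "map" (e.g. for a submap), which is
 * why the possibly updated map must be passed to vm_map_lookup_done().
 */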
4374 int
4375 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
4376               vm_offset_t vaddr,
4377               vm_prot_t fault_typea,
4378               vm_map_entry_t *out_entry,        /* OUT */
4379               vm_object_t *object,              /* OUT */
4380               vm_pindex_t *pindex,              /* OUT */
4381               vm_prot_t *out_prot,              /* OUT */
4382               boolean_t *wired)                 /* OUT */
4383 {
4384         vm_map_entry_t entry;
4385         vm_map_t map = *var_map;
4386         vm_prot_t prot;
4387         vm_prot_t fault_type = fault_typea;
4388         vm_object_t eobject;
4389         vm_size_t size;
4390         struct ucred *cred;
4391
4392 RetryLookup:
4393
4394         vm_map_lock_read(map);
4395
4396 RetryLookupLocked:
4397         /*
4398          * Lookup the faulting address.
4399          */
4400         if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4401                 vm_map_unlock_read(map);
4402                 return (KERN_INVALID_ADDRESS);
4403         }
4404
4405         entry = *out_entry;
4406
4407         /*
4408          * Handle submaps.
4409          */
4410         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4411                 vm_map_t old_map = map;
4412
4413                 *var_map = map = entry->object.sub_map;
4414                 vm_map_unlock_read(old_map);
4415                 goto RetryLookup;
4416         }
4417
4418         /*
4419          * Check whether this task is allowed to have this page.
4420          */
4421         prot = entry->protection;
4422         if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4423                 fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4424                 if (prot == VM_PROT_NONE && map != kernel_map &&
4425                     (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4426                     (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4427                     MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4428                     vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4429                         goto RetryLookupLocked;
4430         }
4431         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4432         if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4433                 vm_map_unlock_read(map);
4434                 return (KERN_PROTECTION_FAILURE);
4435         }
4436         KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4437             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4438             (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4439             ("entry %p flags %x", entry, entry->eflags));
4440         if ((fault_typea & VM_PROT_COPY) != 0 &&
4441             (entry->max_protection & VM_PROT_WRITE) == 0 &&
4442             (entry->eflags & MAP_ENTRY_COW) == 0) {
4443                 vm_map_unlock_read(map);
4444                 return (KERN_PROTECTION_FAILURE);
4445         }
4446
4447         /*
4448          * If this page is not pageable, we have to get it for all possible
4449          * accesses.
4450          */
4451         *wired = (entry->wired_count != 0);
4452         if (*wired)
4453                 fault_type = entry->protection;
4454         size = entry->end - entry->start;
4455         /*
4456          * If the entry was copy-on-write, we either shadow it now or demote the permissions.
4457          */
4458         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4459                 /*
4460                  * If we want to write the page, we may as well handle that
4461                  * now since we've got the map locked.
4462                  *
4463                  * If we don't need to write the page, we just demote the
4464                  * permissions allowed.
4465                  */
4466                 if ((fault_type & VM_PROT_WRITE) != 0 ||
4467                     (fault_typea & VM_PROT_COPY) != 0) {
4468                         /*
4469                          * Make a new object, and place it in the object
4470                          * chain.  Note that no new references have appeared
4471                          * -- one just moved from the map to the new
4472                          * object.
4473                          */
4474                         if (vm_map_lock_upgrade(map))
4475                                 goto RetryLookup;
4476
4477                         if (entry->cred == NULL) {
4478                                 /*
4479                                  * The debugger owner is charged for
4480                                  * the memory.
4481                                  */
4482                                 cred = curthread->td_ucred;
4483                                 crhold(cred);
4484                                 if (!swap_reserve_by_cred(size, cred)) {
4485                                         crfree(cred);
4486                                         vm_map_unlock(map);
4487                                         return (KERN_RESOURCE_SHORTAGE);
4488                                 }
4489                                 entry->cred = cred;
4490                         }
4491                         vm_object_shadow(&entry->object.vm_object,
4492                             &entry->offset, size);
4493                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4494                         eobject = entry->object.vm_object;
4495                         if (eobject->cred != NULL) {
4496                                 /*
4497                                  * The object was not shadowed.
4498                                  */
4499                                 swap_release_by_cred(size, entry->cred);
4500                                 crfree(entry->cred);
4501                                 entry->cred = NULL;
4502                         } else if (entry->cred != NULL) {
4503                                 VM_OBJECT_WLOCK(eobject);
4504                                 eobject->cred = entry->cred;
4505                                 eobject->charge = size;
4506                                 VM_OBJECT_WUNLOCK(eobject);
4507                                 entry->cred = NULL;
4508                         }
4509
4510                         vm_map_lock_downgrade(map);
4511                 } else {
4512                         /*
4513                          * We're attempting to read a copy-on-write page --
4514                          * don't allow writes.
4515                          */
4516                         prot &= ~VM_PROT_WRITE;
4517                 }
4518         }
4519
4520         /*
4521          * Create an object if necessary.
4522          */
4523         if (entry->object.vm_object == NULL &&
4524             !map->system_map) {
4525                 if (vm_map_lock_upgrade(map))
4526                         goto RetryLookup;
4527                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4528                     atop(size));
4529                 entry->offset = 0;
4530                 if (entry->cred != NULL) {
4531                         VM_OBJECT_WLOCK(entry->object.vm_object);
4532                         entry->object.vm_object->cred = entry->cred;
4533                         entry->object.vm_object->charge = size;
4534                         VM_OBJECT_WUNLOCK(entry->object.vm_object);
4535                         entry->cred = NULL;
4536                 }
4537                 vm_map_lock_downgrade(map);
4538         }
4539
4540         /*
4541          * Return the object/offset from this entry.  If the entry was
4542          * copy-on-write or empty, it has been fixed up.
4543          */
4544         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4545         *object = entry->object.vm_object;
4546
4547         *out_prot = prot;
4548         return (KERN_SUCCESS);
4549 }
4550
4551 /*
4552  *      vm_map_lookup_locked:
4553  *
4554  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
4555  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
4556  */
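/*
 * Unlike vm_map_lookup(), the caller must already hold the map lock and
 * retains it across the call; this variant also never allocates shadow
 * or backing objects, failing instead.
 */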
4557 int
4558 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
4559                      vm_offset_t vaddr,
4560                      vm_prot_t fault_typea,
4561                      vm_map_entry_t *out_entry, /* OUT */
4562                      vm_object_t *object,       /* OUT */
4563                      vm_pindex_t *pindex,       /* OUT */
4564                      vm_prot_t *out_prot,       /* OUT */
4565                      boolean_t *wired)          /* OUT */
4566 {
4567         vm_map_entry_t entry;
4568         vm_map_t map = *var_map;
4569         vm_prot_t prot;
4570         vm_prot_t fault_type = fault_typea;
4571
4572         /*
4573          * Lookup the faulting address.
4574          */
4575         if (!vm_map_lookup_entry(map, vaddr, out_entry))
4576                 return (KERN_INVALID_ADDRESS);
4577
4578         entry = *out_entry;
4579
4580         /*
4581          * Fail if the entry refers to a submap.
4582          */
4583         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4584                 return (KERN_FAILURE);
4585
4586         /*
4587          * Check whether this task is allowed to have this page.
4588          */
4589         prot = entry->protection;
4590         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4591         if ((fault_type & prot) != fault_type)
4592                 return (KERN_PROTECTION_FAILURE);
4593
4594         /*
4595          * If this page is not pageable, we have to get it for all possible
4596          * accesses.
4597          */
4598         *wired = (entry->wired_count != 0);
4599         if (*wired)
4600                 fault_type = entry->protection;
4601
4602         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4603                 /*
4604                  * Fail if the entry was copy-on-write for a write fault.
4605                  */
4606                 if (fault_type & VM_PROT_WRITE)
4607                         return (KERN_FAILURE);
4608                 /*
4609                  * We're attempting to read a copy-on-write page --
4610                  * don't allow writes.
4611                  */
4612                 prot &= ~VM_PROT_WRITE;
4613         }
4614
4615         /*
4616          * Fail if an object should be created.
4617          */
4618         if (entry->object.vm_object == NULL && !map->system_map)
4619                 return (KERN_FAILURE);
4620
4621         /*
4622          * Return the object/offset from this entry.  If the entry was
4623          * copy-on-write or empty, it has been fixed up.
4624          */
4625         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4626         *object = entry->object.vm_object;
4627
4628         *out_prot = prot;
4629         return (KERN_SUCCESS);
4630 }
4631
4632 /*
4633  *      vm_map_lookup_done:
4634  *
4635  *      Releases locks acquired by a vm_map_lookup
4636  *      (according to the handle returned by that lookup).
4637  */
4638 void
4639 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4640 {
4641         /*
4642          * Unlock the main-level map
4643          */
4644         vm_map_unlock_read(map);
4645 }
4646
4647 vm_offset_t
4648 vm_map_max_KBI(const struct vm_map *map)
4649 {
4650
4651         return (vm_map_max(map));
4652 }
4653
4654 vm_offset_t
4655 vm_map_min_KBI(const struct vm_map *map)
4656 {
4657
4658         return (vm_map_min(map));
4659 }
4660
4661 pmap_t
4662 vm_map_pmap_KBI(vm_map_t map)
4663 {
4664
4665         return (map->pmap);
4666 }
4667
4668 #include "opt_ddb.h"
4669 #ifdef DDB
4670 #include <sys/kernel.h>
4671
4672 #include <ddb/ddb.h>
4673
4674 static void
4675 vm_map_print(vm_map_t map)
4676 {
4677         vm_map_entry_t entry;
4678
4679         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4680             (void *)map,
4681             (void *)map->pmap, map->nentries, map->timestamp);
4682
4683         db_indent += 2;
4684         for (entry = map->header.next; entry != &map->header;
4685             entry = entry->next) {
4686                 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
4687                     (void *)entry, (void *)entry->start, (void *)entry->end,
4688                     entry->eflags);
4689                 {
4690                         static char *inheritance_name[4] =
4691                         {"share", "copy", "none", "donate_copy"};
4692
4693                         db_iprintf(" prot=%x/%x/%s",
4694                             entry->protection,
4695                             entry->max_protection,
4696                             inheritance_name[(int)(unsigned char)entry->inheritance]);
4697                         if (entry->wired_count != 0)
4698                                 db_printf(", wired");
4699                 }
4700                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4701                         db_printf(", share=%p, offset=0x%jx\n",
4702                             (void *)entry->object.sub_map,
4703                             (uintmax_t)entry->offset);
4704                         if ((entry->prev == &map->header) ||
4705                             (entry->prev->object.sub_map !=
4706                                 entry->object.sub_map)) {
4707                                 db_indent += 2;
4708                                 vm_map_print((vm_map_t)entry->object.sub_map);
4709                                 db_indent -= 2;
4710                         }
4711                 } else {
4712                         if (entry->cred != NULL)
4713                                 db_printf(", ruid %d", entry->cred->cr_ruid);
4714                         db_printf(", object=%p, offset=0x%jx",
4715                             (void *)entry->object.vm_object,
4716                             (uintmax_t)entry->offset);
4717                         if (entry->object.vm_object && entry->object.vm_object->cred)
4718                                 db_printf(", obj ruid %d charge %jx",
4719                                     entry->object.vm_object->cred->cr_ruid,
4720                                     (uintmax_t)entry->object.vm_object->charge);
4721                         if (entry->eflags & MAP_ENTRY_COW)
4722                                 db_printf(", copy (%s)",
4723                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4724                         db_printf("\n");
4725
4726                         if ((entry->prev == &map->header) ||
4727                             (entry->prev->object.vm_object !=
4728                                 entry->object.vm_object)) {
4729                                 db_indent += 2;
4730                                 vm_object_print((db_expr_t)(intptr_t)
4731                                                 entry->object.vm_object,
4732                                                 0, 0, (char *)0);
4733                                 db_indent -= 2;
4734                         }
4735                 }
4736         }
4737         db_indent -= 2;
4738 }
4739
4740 DB_SHOW_COMMAND(map, map)
4741 {
4742
4743         if (!have_addr) {
4744                 db_printf("usage: show map <addr>\n");
4745                 return;
4746         }
4747         vm_map_print((vm_map_t)addr);
4748 }
4749
4750 DB_SHOW_COMMAND(procvm, procvm)
4751 {
4752         struct proc *p;
4753
4754         if (have_addr) {
4755                 p = db_lookup_proc(addr);
4756         } else {
4757                 p = curproc;
4758         }
4759
4760         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4761             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4762             (void *)vmspace_pmap(p->p_vmspace));
4763
4764         vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4765 }
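/*
 * Example usage from the ddb prompt (the address below is illustrative;
 * "show procvm" also accepts no argument and defaults to curproc):
 *
 *      db> show map 0xfffff80003bd9000
 *      db> show procvm
 *
 * Both commands dump the map entries via vm_map_print() above.
 */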
4766
4767 #endif /* DDB */