1 /*-
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  */
60
61 /*
62  *      Virtual memory mapping module.
63  */
64
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/ktr.h>
71 #include <sys/lock.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/vmmeter.h>
75 #include <sys/mman.h>
76 #include <sys/vnode.h>
77 #include <sys/resourcevar.h>
78 #include <sys/file.h>
79 #include <sys/sysent.h>
80 #include <sys/shm.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_page.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/swap_pager.h>
92 #include <vm/uma.h>
93
94 /*
95  *      Virtual memory maps provide for the mapping, protection,
96  *      and sharing of virtual memory objects.  In addition,
97  *      this module provides for an efficient virtual copy of
98  *      memory from one map to another.
99  *
100  *      Synchronization is required prior to most operations.
101  *
102  *      Maps consist of an ordered doubly-linked list of simple
103  *      entries; a self-adjusting binary search tree of these
104  *      entries is used to speed up lookups.
105  *
106  *      Since portions of maps are specified by start/end addresses,
107  *      which may not align with existing map entries, all
108  *      routines merely "clip" entries to these start/end values.
109  *      [That is, an entry is split into two, bordering at a
110  *      start or end value.]  Note that these clippings may not
111  *      always be necessary (as the two resulting entries are then
112  *      not changed); however, the clipping is done for convenience.
113  *
114  *      As mentioned above, virtual copy operations are performed
115  *      by copying VM object references from one map to
116  *      another, and then marking both regions as copy-on-write.
117  */
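/*
 *      Clipping example (illustrative, not part of the code below): an
 *      operation on [start, end) that falls strictly inside an entry
 *      covering [A, B) first splits the entry at "start" and then at
 *      "end", leaving three entries [A, start), [start, end) and
 *      [end, B); only the middle entry is then modified.
 */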
118
119 static struct mtx map_sleep_mtx;
120 static uma_zone_t mapentzone;
121 static uma_zone_t kmapentzone;
122 static uma_zone_t mapzone;
123 static uma_zone_t vmspace_zone;
124 static struct vm_object kmapentobj;
125 static int vmspace_zinit(void *mem, int size, int flags);
126 static void vmspace_zfini(void *mem, int size);
127 static int vm_map_zinit(void *mem, int size, int flags);
128 static void vm_map_zfini(void *mem, int size);
129 static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
130 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
131 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
132 #ifdef INVARIANTS
133 static void vm_map_zdtor(void *mem, int size, void *arg);
134 static void vmspace_zdtor(void *mem, int size, void *arg);
135 #endif
136
137 #define ENTRY_CHARGED(e) ((e)->uip != NULL || \
138     ((e)->object.vm_object != NULL && (e)->object.vm_object->uip != NULL && \
139      !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
140
141 /* 
142  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
143  * stable.
144  */
145 #define PROC_VMSPACE_LOCK(p) do { } while (0)
146 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
147
148 /*
149  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
150  *
151  *      Asserts that the starting and ending region
152  *      addresses fall within the valid range of the map.
153  */
154 #define VM_MAP_RANGE_CHECK(map, start, end)             \
155                 {                                       \
156                 if (start < vm_map_min(map))            \
157                         start = vm_map_min(map);        \
158                 if (end > vm_map_max(map))              \
159                         end = vm_map_max(map);          \
160                 if (start > end)                        \
161                         start = end;                    \
162                 }
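/*
 *      Usage sketch (illustrative only): callers clamp a caller-supplied
 *      range to the map's bounds while holding the map lock, as
 *      vm_map_fixed() does later in this file.
 */
#if 0
        vm_map_lock(map);
        VM_MAP_RANGE_CHECK(map, start, end);
        (void) vm_map_delete(map, start, end);
        vm_map_unlock(map);
#endif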
163
164 /*
165  *      vm_map_startup:
166  *
167  *      Initialize the vm_map module.  Must be called before
168  *      any other vm_map routines.
169  *
170  *      Map and entry structures are allocated from the general
171  *      purpose memory pool with some exceptions:
172  *
173  *      - The kernel map and kmem submap are allocated statically.
174  *      - Kernel map entries are allocated out of a static pool.
175  *
176  *      These restrictions are necessary since malloc() uses the
177  *      maps and requires map entries.
178  */
179
180 void
181 vm_map_startup(void)
182 {
183         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
184         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
185 #ifdef INVARIANTS
186             vm_map_zdtor,
187 #else
188             NULL,
189 #endif
190             vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
191         uma_prealloc(mapzone, MAX_KMAP);
192         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
193             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
194             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
195         uma_prealloc(kmapentzone, MAX_KMAPENT);
196         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
197             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
198 }
199
200 static void
201 vmspace_zfini(void *mem, int size)
202 {
203         struct vmspace *vm;
204
205         vm = (struct vmspace *)mem;
206         vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
207 }
208
209 static int
210 vmspace_zinit(void *mem, int size, int flags)
211 {
212         struct vmspace *vm;
213
214         vm = (struct vmspace *)mem;
215
216         vm->vm_map.pmap = NULL;
217         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
218         return (0);
219 }
220
221 static void
222 vm_map_zfini(void *mem, int size)
223 {
224         vm_map_t map;
225
226         map = (vm_map_t)mem;
227         mtx_destroy(&map->system_mtx);
228         sx_destroy(&map->lock);
229 }
230
231 static int
232 vm_map_zinit(void *mem, int size, int flags)
233 {
234         vm_map_t map;
235
236         map = (vm_map_t)mem;
237         map->nentries = 0;
238         map->size = 0;
239         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
240         sx_init(&map->lock, "user map");
241         return (0);
242 }
243
244 #ifdef INVARIANTS
245 static void
246 vmspace_zdtor(void *mem, int size, void *arg)
247 {
248         struct vmspace *vm;
249
250         vm = (struct vmspace *)mem;
251
252         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
253 }
254 static void
255 vm_map_zdtor(void *mem, int size, void *arg)
256 {
257         vm_map_t map;
258
259         map = (vm_map_t)mem;
260         KASSERT(map->nentries == 0,
261             ("map %p nentries == %d on free.",
262             map, map->nentries));
263         KASSERT(map->size == 0,
264             ("map %p size == %lu on free.",
265             map, (unsigned long)map->size));
266 }
267 #endif  /* INVARIANTS */
268
269 /*
270  * Allocate a vmspace structure, including a vm_map and pmap,
271  * and initialize those structures.  The refcnt is set to 1.
272  */
273 struct vmspace *
274 vmspace_alloc(min, max)
275         vm_offset_t min, max;
276 {
277         struct vmspace *vm;
278
279         vm = uma_zalloc(vmspace_zone, M_WAITOK);
280         if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
281                 uma_zfree(vmspace_zone, vm);
282                 return (NULL);
283         }
284         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
285         _vm_map_init(&vm->vm_map, min, max);
286         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
287         vm->vm_refcnt = 1;
288         vm->vm_shm = NULL;
289         vm->vm_swrss = 0;
290         vm->vm_tsize = 0;
291         vm->vm_dsize = 0;
292         vm->vm_ssize = 0;
293         vm->vm_taddr = 0;
294         vm->vm_daddr = 0;
295         vm->vm_maxsaddr = 0;
296         return (vm);
297 }
298
299 void
300 vm_init2(void)
301 {
302         uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
303             (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE) / 8 +
304              maxproc * 2 + maxfiles);
305         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
306 #ifdef INVARIANTS
307             vmspace_zdtor,
308 #else
309             NULL,
310 #endif
311             vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
312 }
313
314 static inline void
315 vmspace_dofree(struct vmspace *vm)
316 {
317         CTR1(KTR_VM, "vmspace_free: %p", vm);
318
319         /*
320          * Make sure any SysV shm is freed, it might not have been in
321          * exit1().
322          */
323         shmexit(vm);
324
325         /*
326          * Lock the map, to wait out all other references to it.
327          * Delete all of the mappings and pages they hold, then call
328          * the pmap module to reclaim anything left.
329          */
330         (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
331             vm->vm_map.max_offset);
332
333         /*
334          * XXX Comment out the pmap_release call for now. The
335          * vmspace_zone is marked as UMA_ZONE_NOFREE, and bugs cause
336          * pmap.resident_count to be != 0 on exit sometimes.
337          */
338 /*      pmap_release(vmspace_pmap(vm)); */
339         uma_zfree(vmspace_zone, vm);
340 }
341
342 void
343 vmspace_free(struct vmspace *vm)
344 {
345         int refcnt;
346
347         if (vm->vm_refcnt == 0)
348                 panic("vmspace_free: attempt to free already freed vmspace");
349
350         do
351                 refcnt = vm->vm_refcnt;
352         while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
353         if (refcnt == 1)
354                 vmspace_dofree(vm);
355 }
356
357 void
358 vmspace_exitfree(struct proc *p)
359 {
360         struct vmspace *vm;
361
362         PROC_VMSPACE_LOCK(p);
363         vm = p->p_vmspace;
364         p->p_vmspace = NULL;
365         PROC_VMSPACE_UNLOCK(p);
366         KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
367         vmspace_free(vm);
368 }
369
370 void
371 vmspace_exit(struct thread *td)
372 {
373         int refcnt;
374         struct vmspace *vm;
375         struct proc *p;
376
377         /*
378          * Release user portion of address space.
379          * This releases references to vnodes,
380          * which could cause I/O if the file has been unlinked.
381          * Need to do this early enough that we can still sleep.
382          *
383          * The last exiting process to reach this point releases as
384          * much of the environment as it can. vmspace_dofree() is the
385          * slower fallback in case another process had a temporary
386          * reference to the vmspace.
387          */
388
389         p = td->td_proc;
390         vm = p->p_vmspace;
391         atomic_add_int(&vmspace0.vm_refcnt, 1);
392         do {
393                 refcnt = vm->vm_refcnt;
394                 if (refcnt > 1 && p->p_vmspace != &vmspace0) {
395                         /* Switch now since other proc might free vmspace */
396                         PROC_VMSPACE_LOCK(p);
397                         p->p_vmspace = &vmspace0;
398                         PROC_VMSPACE_UNLOCK(p);
399                         pmap_activate(td);
400                 }
401         } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
402         if (refcnt == 1) {
403                 if (p->p_vmspace != vm) {
404                         /* vmspace not yet freed, switch back */
405                         PROC_VMSPACE_LOCK(p);
406                         p->p_vmspace = vm;
407                         PROC_VMSPACE_UNLOCK(p);
408                         pmap_activate(td);
409                 }
410                 pmap_remove_pages(vmspace_pmap(vm));
411                 /* Switch now since this proc will free vmspace */
412                 PROC_VMSPACE_LOCK(p);
413                 p->p_vmspace = &vmspace0;
414                 PROC_VMSPACE_UNLOCK(p);
415                 pmap_activate(td);
416                 vmspace_dofree(vm);
417         }
418 }
419
420 /* Acquire reference to vmspace owned by another process. */
421
422 struct vmspace *
423 vmspace_acquire_ref(struct proc *p)
424 {
425         struct vmspace *vm;
426         int refcnt;
427
428         PROC_VMSPACE_LOCK(p);
429         vm = p->p_vmspace;
430         if (vm == NULL) {
431                 PROC_VMSPACE_UNLOCK(p);
432                 return (NULL);
433         }
434         do {
435                 refcnt = vm->vm_refcnt;
436                 if (refcnt <= 0) {      /* Avoid 0->1 transition */
437                         PROC_VMSPACE_UNLOCK(p);
438                         return (NULL);
439                 }
440         } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
441         if (vm != p->p_vmspace) {
442                 PROC_VMSPACE_UNLOCK(p);
443                 vmspace_free(vm);
444                 return (NULL);
445         }
446         PROC_VMSPACE_UNLOCK(p);
447         return (vm);
448 }
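/*
 *      Usage sketch (illustrative only; the error handling is an
 *      assumption, not taken from this file): a thread examining another
 *      process's address space takes a transient reference, works on the
 *      map under its own locking, and drops the reference when done.
 */
#if 0
        vm = vmspace_acquire_ref(p);
        if (vm == NULL)
                return (ESRCH);
        map = &vm->vm_map;
        vm_map_lock_read(map);
        /* ... walk the entry list starting at map->header.next ... */
        vm_map_unlock_read(map);
        vmspace_free(vm);
#endif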
449
450 void
451 _vm_map_lock(vm_map_t map, const char *file, int line)
452 {
453
454         if (map->system_map)
455                 _mtx_lock_flags(&map->system_mtx, 0, file, line);
456         else
457                 (void)_sx_xlock(&map->lock, 0, file, line);
458         map->timestamp++;
459 }
460
461 static void
462 vm_map_process_deferred(void)
463 {
464         struct thread *td;
465         vm_map_entry_t entry;
466
467         td = curthread;
468
469         while ((entry = td->td_map_def_user) != NULL) {
470                 td->td_map_def_user = entry->next;
471                 vm_map_entry_deallocate(entry, FALSE);
472         }
473 }
474
475 void
476 _vm_map_unlock(vm_map_t map, const char *file, int line)
477 {
478
479         if (map->system_map)
480                 _mtx_unlock_flags(&map->system_mtx, 0, file, line);
481         else {
482                 _sx_xunlock(&map->lock, file, line);
483                 vm_map_process_deferred();
484         }
485 }
486
487 void
488 _vm_map_lock_read(vm_map_t map, const char *file, int line)
489 {
490
491         if (map->system_map)
492                 _mtx_lock_flags(&map->system_mtx, 0, file, line);
493         else
494                 (void)_sx_slock(&map->lock, 0, file, line);
495 }
496
497 void
498 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
499 {
500
501         if (map->system_map)
502                 _mtx_unlock_flags(&map->system_mtx, 0, file, line);
503         else {
504                 _sx_sunlock(&map->lock, file, line);
505                 vm_map_process_deferred();
506         }
507 }
508
509 int
510 _vm_map_trylock(vm_map_t map, const char *file, int line)
511 {
512         int error;
513
514         error = map->system_map ?
515             !_mtx_trylock(&map->system_mtx, 0, file, line) :
516             !_sx_try_xlock(&map->lock, file, line);
517         if (error == 0)
518                 map->timestamp++;
519         return (error == 0);
520 }
521
522 int
523 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
524 {
525         int error;
526
527         error = map->system_map ?
528             !_mtx_trylock(&map->system_mtx, 0, file, line) :
529             !_sx_try_slock(&map->lock, file, line);
530         return (error == 0);
531 }
532
533 /*
534  *      _vm_map_lock_upgrade:   [ internal use only ]
535  *
536  *      Tries to upgrade a read (shared) lock on the specified map to a write
537  *      (exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
538  *      non-zero value if the upgrade fails.  If the upgrade fails, the map is
539  *      returned without a read or write lock held.
540  *
541  *      Requires that the map be read locked.
542  */
543 int
544 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
545 {
546         unsigned int last_timestamp;
547
548         if (map->system_map) {
549 #ifdef INVARIANTS
550                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
551 #endif
552         } else {
553                 if (!_sx_try_upgrade(&map->lock, file, line)) {
554                         last_timestamp = map->timestamp;
555                         _sx_sunlock(&map->lock, file, line);
556                         vm_map_process_deferred();
557                         /*
558                          * If the map's timestamp does not change while the
559                          * map is unlocked, then the upgrade succeeds.
560                          */
561                         (void)_sx_xlock(&map->lock, 0, file, line);
562                         if (last_timestamp != map->timestamp) {
563                                 _sx_xunlock(&map->lock, file, line);
564                                 return (1);
565                         }
566                 }
567         }
568         map->timestamp++;
569         return (0);
570 }
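/*
 *      Typical caller pattern (illustrative sketch): a failed upgrade
 *      leaves the map unlocked, so the caller must reacquire the lock
 *      and restart its lookup, since the map may have changed.
 */
#if 0
        if (vm_map_lock_upgrade(map) != 0) {
                /* The lock was lost; relock and restart the lookup. */
                vm_map_lock_read(map);
                goto retry_lookup;      /* hypothetical label in the caller */
        }
        /* ... the map is now write-locked ... */
#endif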
571
572 void
573 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
574 {
575
576         if (map->system_map) {
577 #ifdef INVARIANTS
578                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
579 #endif
580         } else
581                 _sx_downgrade(&map->lock, file, line);
582 }
583
584 /*
585  *      vm_map_locked:
586  *
587  *      Returns a non-zero value if the caller holds a write (exclusive) lock
588  *      on the specified map and the value "0" otherwise.
589  */
590 int
591 vm_map_locked(vm_map_t map)
592 {
593
594         if (map->system_map)
595                 return (mtx_owned(&map->system_mtx));
596         else
597                 return (sx_xlocked(&map->lock));
598 }
599
600 #ifdef INVARIANTS
601 static void
602 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
603 {
604
605         if (map->system_map)
606                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
607         else
608                 _sx_assert(&map->lock, SA_XLOCKED, file, line);
609 }
610
611 #if 0
612 static void
613 _vm_map_assert_locked_read(vm_map_t map, const char *file, int line)
614 {
615
616         if (map->system_map)
617                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
618         else
619                 _sx_assert(&map->lock, SA_SLOCKED, file, line);
620 }
621 #endif
622
623 #define VM_MAP_ASSERT_LOCKED(map) \
624     _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
625 #define VM_MAP_ASSERT_LOCKED_READ(map) \
626     _vm_map_assert_locked_read(map, LOCK_FILE, LOCK_LINE)
627 #else
628 #define VM_MAP_ASSERT_LOCKED(map)
629 #define VM_MAP_ASSERT_LOCKED_READ(map)
630 #endif
631
632 /*
633  *      _vm_map_unlock_and_wait:
634  *
635  *      Atomically releases the lock on the specified map and puts the calling
636  *      thread to sleep.  The calling thread will remain asleep until either
637  *      vm_map_wakeup() is performed on the map or the specified timeout is
638  *      exceeded.
639  *
640  *      WARNING!  This function does not perform deferred deallocations of
641  *      objects and map entries.  Therefore, the calling thread is expected to
642  *      reacquire the map lock after reawakening and later perform an ordinary
643  *      unlock operation, such as vm_map_unlock(), before completing its
644  *      operation on the map.
645  */
646 int
647 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
648 {
649
650         mtx_lock(&map_sleep_mtx);
651         if (map->system_map)
652                 _mtx_unlock_flags(&map->system_mtx, 0, file, line);
653         else
654                 _sx_xunlock(&map->lock, file, line);
655         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
656             timo));
657 }
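/*
 *      Typical wait loop (illustrative sketch): flag the entry so that
 *      the releasing thread performs a wakeup, sleep, then relock and
 *      re-evaluate, because the map may have changed while unlocked.
 */
#if 0
        while (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
                entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
                (void) vm_map_unlock_and_wait(map, 0);
                vm_map_lock(map);
                /*
                 * Real callers look the entry up again here before
                 * retesting the flag.
                 */
        }
#endif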
658
659 /*
660  *      vm_map_wakeup:
661  *
662  *      Awaken any threads that have slept on the map using
663  *      vm_map_unlock_and_wait().
664  */
665 void
666 vm_map_wakeup(vm_map_t map)
667 {
668
669         /*
670          * Acquire and release map_sleep_mtx to prevent a wakeup()
671          * from being performed (and lost) between the map unlock
672          * and the msleep() in _vm_map_unlock_and_wait().
673          */
674         mtx_lock(&map_sleep_mtx);
675         mtx_unlock(&map_sleep_mtx);
676         wakeup(&map->root);
677 }
678
679 long
680 vmspace_resident_count(struct vmspace *vmspace)
681 {
682         return pmap_resident_count(vmspace_pmap(vmspace));
683 }
684
685 long
686 vmspace_wired_count(struct vmspace *vmspace)
687 {
688         return pmap_wired_count(vmspace_pmap(vmspace));
689 }
690
691 /*
692  *      vm_map_create:
693  *
694  *      Creates and returns a new empty VM map with
695  *      the given physical map structure, and having
696  *      the given lower and upper address bounds.
697  */
698 vm_map_t
699 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
700 {
701         vm_map_t result;
702
703         result = uma_zalloc(mapzone, M_WAITOK);
704         CTR1(KTR_VM, "vm_map_create: %p", result);
705         _vm_map_init(result, min, max);
706         result->pmap = pmap;
707         return (result);
708 }
709
710 /*
711  * Initialize an existing vm_map structure
712  * such as that in the vmspace structure.
713  * The pmap is set elsewhere.
714  */
715 static void
716 _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
717 {
718
719         map->header.next = map->header.prev = &map->header;
720         map->needs_wakeup = FALSE;
721         map->system_map = 0;
722         map->min_offset = min;
723         map->max_offset = max;
724         map->flags = 0;
725         map->root = NULL;
726         map->timestamp = 0;
727 }
728
729 void
730 vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
731 {
732         _vm_map_init(map, min, max);
733         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
734         sx_init(&map->lock, "user map");
735 }
736
737 /*
738  *      vm_map_entry_dispose:   [ internal use only ]
739  *
740  *      Inverse of vm_map_entry_create.
741  */
742 static void
743 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
744 {
745         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
746 }
747
748 /*
749  *      vm_map_entry_create:    [ internal use only ]
750  *
751  *      Allocates a VM map entry for insertion.
752  *      No entry fields are filled in.
753  */
754 static vm_map_entry_t
755 vm_map_entry_create(vm_map_t map)
756 {
757         vm_map_entry_t new_entry;
758
759         if (map->system_map)
760                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
761         else
762                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
763         if (new_entry == NULL)
764                 panic("vm_map_entry_create: kernel resources exhausted");
765         return (new_entry);
766 }
767
768 /*
769  *      vm_map_entry_set_behavior:
770  *
771  *      Set the expected access behavior, either normal, random, or
772  *      sequential.
773  */
774 static inline void
775 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
776 {
777         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
778             (behavior & MAP_ENTRY_BEHAV_MASK);
779 }
780
781 /*
782  *      vm_map_entry_set_max_free:
783  *
784  *      Set the max_free field in a vm_map_entry.
785  */
786 static inline void
787 vm_map_entry_set_max_free(vm_map_entry_t entry)
788 {
789
790         entry->max_free = entry->adj_free;
791         if (entry->left != NULL && entry->left->max_free > entry->max_free)
792                 entry->max_free = entry->left->max_free;
793         if (entry->right != NULL && entry->right->max_free > entry->max_free)
794                 entry->max_free = entry->right->max_free;
795 }
796
797 /*
798  *      vm_map_entry_splay:
799  *
800  *      The Sleator and Tarjan top-down splay algorithm with the
801  *      following variation.  Max_free must be computed bottom-up, so
802  *      on the downward pass, maintain the left and right spines in
803  *      reverse order.  Then, make a second pass up each side to fix
804  *      the pointers and compute max_free.  The time bound is O(log n)
805  *      amortized.
806  *
807  *      The new root is the vm_map_entry containing "addr", or else an
808  *      adjacent entry (lower or higher) if addr is not in the tree.
809  *
810  *      The map must be locked, and leaves it so.
811  *
812  *      Returns: the new root.
813  */
814 static vm_map_entry_t
815 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
816 {
817         vm_map_entry_t llist, rlist;
818         vm_map_entry_t ltree, rtree;
819         vm_map_entry_t y;
820
821         /* Special case of empty tree. */
822         if (root == NULL)
823                 return (root);
824
825         /*
826          * Pass One: Splay down the tree until we find addr or a NULL
827          * pointer where addr would go.  llist and rlist are the two
828          * sides in reverse order (bottom-up), with llist linked by
829          * the right pointer and rlist linked by the left pointer in
830          * the vm_map_entry.  Wait until Pass Two to set max_free on
831          * the two spines.
832          */
833         llist = NULL;
834         rlist = NULL;
835         for (;;) {
836                 /* root is never NULL in here. */
837                 if (addr < root->start) {
838                         y = root->left;
839                         if (y == NULL)
840                                 break;
841                         if (addr < y->start && y->left != NULL) {
842                                 /* Rotate right and put y on rlist. */
843                                 root->left = y->right;
844                                 y->right = root;
845                                 vm_map_entry_set_max_free(root);
846                                 root = y->left;
847                                 y->left = rlist;
848                                 rlist = y;
849                         } else {
850                                 /* Put root on rlist. */
851                                 root->left = rlist;
852                                 rlist = root;
853                                 root = y;
854                         }
855                 } else if (addr >= root->end) {
856                         y = root->right;
857                         if (y == NULL)
858                                 break;
859                         if (addr >= y->end && y->right != NULL) {
860                                 /* Rotate left and put y on llist. */
861                                 root->right = y->left;
862                                 y->left = root;
863                                 vm_map_entry_set_max_free(root);
864                                 root = y->right;
865                                 y->right = llist;
866                                 llist = y;
867                         } else {
868                                 /* Put root on llist. */
869                                 root->right = llist;
870                                 llist = root;
871                                 root = y;
872                         }
873                 } else
874                         break;
875         }
876
877         /*
878          * Pass Two: Walk back up the two spines, flip the pointers
879          * and set max_free.  The subtrees of the root go at the
880          * bottom of llist and rlist.
881          */
882         ltree = root->left;
883         while (llist != NULL) {
884                 y = llist->right;
885                 llist->right = ltree;
886                 vm_map_entry_set_max_free(llist);
887                 ltree = llist;
888                 llist = y;
889         }
890         rtree = root->right;
891         while (rlist != NULL) {
892                 y = rlist->left;
893                 rlist->left = rtree;
894                 vm_map_entry_set_max_free(rlist);
895                 rtree = rlist;
896                 rlist = y;
897         }
898
899         /*
900          * Final assembly: add ltree and rtree as subtrees of root.
901          */
902         root->left = ltree;
903         root->right = rtree;
904         vm_map_entry_set_max_free(root);
905
906         return (root);
907 }
908
909 /*
910  *      vm_map_entry_{un,}link:
911  *
912  *      Insert/remove entries from maps.
913  */
914 static void
915 vm_map_entry_link(vm_map_t map,
916                   vm_map_entry_t after_where,
917                   vm_map_entry_t entry)
918 {
919
920         CTR4(KTR_VM,
921             "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
922             map->nentries, entry, after_where);
923         VM_MAP_ASSERT_LOCKED(map);
924         map->nentries++;
925         entry->prev = after_where;
926         entry->next = after_where->next;
927         entry->next->prev = entry;
928         after_where->next = entry;
929
930         if (after_where != &map->header) {
931                 if (after_where != map->root)
932                         vm_map_entry_splay(after_where->start, map->root);
933                 entry->right = after_where->right;
934                 entry->left = after_where;
935                 after_where->right = NULL;
936                 after_where->adj_free = entry->start - after_where->end;
937                 vm_map_entry_set_max_free(after_where);
938         } else {
939                 entry->right = map->root;
940                 entry->left = NULL;
941         }
942         entry->adj_free = (entry->next == &map->header ? map->max_offset :
943             entry->next->start) - entry->end;
944         vm_map_entry_set_max_free(entry);
945         map->root = entry;
946 }
947
948 static void
949 vm_map_entry_unlink(vm_map_t map,
950                     vm_map_entry_t entry)
951 {
952         vm_map_entry_t next, prev, root;
953
954         VM_MAP_ASSERT_LOCKED(map);
955         if (entry != map->root)
956                 vm_map_entry_splay(entry->start, map->root);
957         if (entry->left == NULL)
958                 root = entry->right;
959         else {
960                 root = vm_map_entry_splay(entry->start, entry->left);
961                 root->right = entry->right;
962                 root->adj_free = (entry->next == &map->header ? map->max_offset :
963                     entry->next->start) - root->end;
964                 vm_map_entry_set_max_free(root);
965         }
966         map->root = root;
967
968         prev = entry->prev;
969         next = entry->next;
970         next->prev = prev;
971         prev->next = next;
972         map->nentries--;
973         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
974             map->nentries, entry);
975 }
976
977 /*
978  *      vm_map_entry_resize_free:
979  *
980  *      Recompute the amount of free space following a vm_map_entry
981  *      and propagate that value up the tree.  Call this function after
982  *      resizing a map entry in-place, that is, without a call to
983  *      vm_map_entry_link() or _unlink().
984  *
985  *      The map must be locked, and leaves it so.
986  */
987 static void
988 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
989 {
990
991         /*
992          * Using splay trees without parent pointers, propagating
993          * max_free up the tree is done by moving the entry to the
994          * root and making the change there.
995          */
996         if (entry != map->root)
997                 map->root = vm_map_entry_splay(entry->start, map->root);
998
999         entry->adj_free = (entry->next == &map->header ? map->max_offset :
1000             entry->next->start) - entry->end;
1001         vm_map_entry_set_max_free(entry);
1002 }
1003
1004 /*
1005  *      vm_map_lookup_entry:    [ internal use only ]
1006  *
1007  *      Finds the map entry containing (or
1008  *      immediately preceding) the specified address
1009  *      in the given map; the entry is returned
1010  *      in the "entry" parameter.  The boolean
1011  *      result indicates whether the address is
1012  *      actually contained in the map.
1013  */
1014 boolean_t
1015 vm_map_lookup_entry(
1016         vm_map_t map,
1017         vm_offset_t address,
1018         vm_map_entry_t *entry)  /* OUT */
1019 {
1020         vm_map_entry_t cur;
1021         boolean_t locked;
1022
1023         /*
1024          * If the map is empty, then the map entry immediately preceding
1025          * "address" is the map's header.
1026          */
1027         cur = map->root;
1028         if (cur == NULL)
1029                 *entry = &map->header;
1030         else if (address >= cur->start && cur->end > address) {
1031                 *entry = cur;
1032                 return (TRUE);
1033         } else if ((locked = vm_map_locked(map)) ||
1034             sx_try_upgrade(&map->lock)) {
1035                 /*
1036                  * Splay requires a write lock on the map.  However, it only
1037                  * restructures the binary search tree; it does not otherwise
1038                  * change the map.  Thus, the map's timestamp need not change
1039                  * on a temporary upgrade.
1040                  */
1041                 map->root = cur = vm_map_entry_splay(address, cur);
1042                 if (!locked)
1043                         sx_downgrade(&map->lock);
1044
1045                 /*
1046                  * If "address" is contained within a map entry, the new root
1047                  * is that map entry.  Otherwise, the new root is a map entry
1048                  * immediately before or after "address".
1049                  */
1050                 if (address >= cur->start) {
1051                         *entry = cur;
1052                         if (cur->end > address)
1053                                 return (TRUE);
1054                 } else
1055                         *entry = cur->prev;
1056         } else
1057                 /*
1058                  * Since the map is only locked for read access, perform a
1059                  * standard binary search tree lookup for "address".
1060                  */
1061                 for (;;) {
1062                         if (address < cur->start) {
1063                                 if (cur->left == NULL) {
1064                                         *entry = cur->prev;
1065                                         break;
1066                                 }
1067                                 cur = cur->left;
1068                         } else if (cur->end > address) {
1069                                 *entry = cur;
1070                                 return (TRUE);
1071                         } else {
1072                                 if (cur->right == NULL) {
1073                                         *entry = cur;
1074                                         break;
1075                                 }
1076                                 cur = cur->right;
1077                         }
1078                 }
1079         return (FALSE);
1080 }
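/*
 *      Usage sketch (illustrative only): the "find or clip" pattern used
 *      by the range operations in this file.
 */
#if 0
        VM_MAP_RANGE_CHECK(map, start, end);
        if (vm_map_lookup_entry(map, start, &entry)) {
                /* "start" lies within an existing entry; split it there. */
                vm_map_clip_start(map, entry, start);
        } else {
                /* "entry" precedes "start"; processing begins at its successor. */
                entry = entry->next;
        }
#endif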
1081
1082 /*
1083  *      vm_map_insert:
1084  *
1085  *      Inserts the given whole VM object into the target
1086  *      map at the specified address range.  The object's
1087  *      size should match that of the address range.
1088  *
1089  *      Requires that the map be locked, and leaves it so.
1090  *
1091  *      If object is non-NULL, ref count must be bumped by caller
1092  *      prior to making call to account for the new entry.
1093  */
1094 int
1095 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1096               vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
1097               int cow)
1098 {
1099         vm_map_entry_t new_entry;
1100         vm_map_entry_t prev_entry;
1101         vm_map_entry_t temp_entry;
1102         vm_eflags_t protoeflags;
1103         struct uidinfo *uip;
1104         boolean_t charge_prev_obj;
1105
1106         VM_MAP_ASSERT_LOCKED(map);
1107
1108         /*
1109          * Check that the start and end points are not bogus.
1110          */
1111         if ((start < map->min_offset) || (end > map->max_offset) ||
1112             (start >= end))
1113                 return (KERN_INVALID_ADDRESS);
1114
1115         /*
1116          * Find the entry prior to the proposed starting address; if it's part
1117          * of an existing entry, this range is bogus.
1118          */
1119         if (vm_map_lookup_entry(map, start, &temp_entry))
1120                 return (KERN_NO_SPACE);
1121
1122         prev_entry = temp_entry;
1123
1124         /*
1125          * Assert that the next entry doesn't overlap the end point.
1126          */
1127         if ((prev_entry->next != &map->header) &&
1128             (prev_entry->next->start < end))
1129                 return (KERN_NO_SPACE);
1130
1131         protoeflags = 0;
1132         charge_prev_obj = FALSE;
1133
1134         if (cow & MAP_COPY_ON_WRITE)
1135                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1136
1137         if (cow & MAP_NOFAULT) {
1138                 protoeflags |= MAP_ENTRY_NOFAULT;
1139
1140                 KASSERT(object == NULL,
1141                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1142         }
1143         if (cow & MAP_DISABLE_SYNCER)
1144                 protoeflags |= MAP_ENTRY_NOSYNC;
1145         if (cow & MAP_DISABLE_COREDUMP)
1146                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1147
1148         uip = NULL;
1149         KASSERT((object != kmem_object && object != kernel_object) ||
1150             ((object == kmem_object || object == kernel_object) &&
1151                 !(protoeflags & MAP_ENTRY_NEEDS_COPY)),
1152             ("kmem or kernel object and cow"));
1153         if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
1154                 goto charged;
1155         if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1156             ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1157                 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1158                         return (KERN_RESOURCE_SHORTAGE);
1159                 KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) ||
1160                     object->uip == NULL,
1161                     ("OVERCOMMIT: vm_map_insert o %p", object));
1162                 uip = curthread->td_ucred->cr_ruidinfo;
1163                 uihold(uip);
1164                 if (object == NULL && !(protoeflags & MAP_ENTRY_NEEDS_COPY))
1165                         charge_prev_obj = TRUE;
1166         }
1167
1168 charged:
1169         if (object != NULL) {
1170                 /*
1171                  * OBJ_ONEMAPPING must be cleared unless this mapping
1172                  * is trivially proven to be the only mapping for any
1173                  * of the object's pages.  (Object granularity
1174                  * reference counting is insufficient to recognize
1175                  * aliases with precision.)
1176                  */
1177                 VM_OBJECT_LOCK(object);
1178                 if (object->ref_count > 1 || object->shadow_count != 0)
1179                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
1180                 VM_OBJECT_UNLOCK(object);
1181         }
1182         else if ((prev_entry != &map->header) &&
1183                  (prev_entry->eflags == protoeflags) &&
1184                  (prev_entry->end == start) &&
1185                  (prev_entry->wired_count == 0) &&
1186                  (prev_entry->uip == uip ||
1187                   (prev_entry->object.vm_object != NULL &&
1188                    (prev_entry->object.vm_object->uip == uip))) &&
1189                    vm_object_coalesce(prev_entry->object.vm_object,
1190                        prev_entry->offset,
1191                        (vm_size_t)(prev_entry->end - prev_entry->start),
1192                        (vm_size_t)(end - prev_entry->end), charge_prev_obj)) {
1193                 /*
1194                  * We were able to extend the object.  Determine if we
1195                  * can extend the previous map entry to include the
1196                  * new range as well.
1197                  */
1198                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1199                     (prev_entry->protection == prot) &&
1200                     (prev_entry->max_protection == max)) {
1201                         map->size += (end - prev_entry->end);
1202                         prev_entry->end = end;
1203                         vm_map_entry_resize_free(map, prev_entry);
1204                         vm_map_simplify_entry(map, prev_entry);
1205                         if (uip != NULL)
1206                                 uifree(uip);
1207                         return (KERN_SUCCESS);
1208                 }
1209
1210                 /*
1211                  * If we can extend the object but cannot extend the
1212                  * map entry, we have to create a new map entry.  We
1213                  * must bump the ref count on the extended object to
1214                  * account for it.  object may be NULL.
1215                  */
1216                 object = prev_entry->object.vm_object;
1217                 offset = prev_entry->offset +
1218                         (prev_entry->end - prev_entry->start);
1219                 vm_object_reference(object);
1220                 if (uip != NULL && object != NULL && object->uip != NULL &&
1221                     !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1222                         /* Object already accounts for this uid. */
1223                         uifree(uip);
1224                         uip = NULL;
1225                 }
1226         }
1227
1228         /*
1229          * NOTE: if conditionals fail, object can be NULL here.  This occurs
1230          * in things like the buffer map where we manage kva but do not manage
1231          * backing objects.
1232          */
1233
1234         /*
1235          * Create a new entry
1236          */
1237         new_entry = vm_map_entry_create(map);
1238         new_entry->start = start;
1239         new_entry->end = end;
1240         new_entry->uip = NULL;
1241
1242         new_entry->eflags = protoeflags;
1243         new_entry->object.vm_object = object;
1244         new_entry->offset = offset;
1245         new_entry->avail_ssize = 0;
1246
1247         new_entry->inheritance = VM_INHERIT_DEFAULT;
1248         new_entry->protection = prot;
1249         new_entry->max_protection = max;
1250         new_entry->wired_count = 0;
1251
1252         KASSERT(uip == NULL || !ENTRY_CHARGED(new_entry),
1253             ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
1254         new_entry->uip = uip;
1255
1256         /*
1257          * Insert the new entry into the list
1258          */
1259         vm_map_entry_link(map, prev_entry, new_entry);
1260         map->size += new_entry->end - new_entry->start;
1261
1262 #if 0
1263         /*
1264          * Temporarily removed to avoid MAP_STACK panic, due to
1265          * MAP_STACK being a huge hack.  Will be added back in
1266          * when MAP_STACK (and the user stack mapping) is fixed.
1267          */
1268         /*
1269          * It may be possible to simplify the entry
1270          */
1271         vm_map_simplify_entry(map, new_entry);
1272 #endif
1273
1274         if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
1275                 vm_map_pmap_enter(map, start, prot,
1276                                     object, OFF_TO_IDX(offset), end - start,
1277                                     cow & MAP_PREFAULT_PARTIAL);
1278         }
1279
1280         return (KERN_SUCCESS);
1281 }
1282
1283 /*
1284  *      vm_map_findspace:
1285  *
1286  *      Find the first fit (lowest VM address) for "length" free bytes
1287  *      beginning at address >= start in the given map.
1288  *
1289  *      In a vm_map_entry, "adj_free" is the amount of free space
1290  *      adjacent (higher address) to this entry, and "max_free" is the
1291  *      maximum amount of contiguous free space in its subtree.  This
1292  *      allows finding a free region in one path down the tree, so
1293  *      O(log n) amortized with splay trees.
1294  *
1295  *      The map must be locked, and leaves it so.
1296  *
1297  *      Returns: 0 on success, and starting address in *addr,
1298  *               1 if insufficient space.
1299  */
1300 int
1301 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1302     vm_offset_t *addr)  /* OUT */
1303 {
1304         vm_map_entry_t entry;
1305         vm_offset_t end, st;
1306
1307         /*
1308          * Request must fit within min/max VM address and must avoid
1309          * address wrap.
1310          */
1311         if (start < map->min_offset)
1312                 start = map->min_offset;
1313         if (start + length > map->max_offset || start + length < start)
1314                 return (1);
1315
1316         /* Empty tree means wide open address space. */
1317         if (map->root == NULL) {
1318                 *addr = start;
1319                 goto found;
1320         }
1321
1322         /*
1323          * After splay, if start comes before root node, then there
1324          * must be a gap from start to the root.
1325          */
1326         map->root = vm_map_entry_splay(start, map->root);
1327         if (start + length <= map->root->start) {
1328                 *addr = start;
1329                 goto found;
1330         }
1331
1332         /*
1333          * Root is the last node that might begin its gap before
1334          * start, and this is the last comparison where address
1335          * wrap might be a problem.
1336          */
1337         st = (start > map->root->end) ? start : map->root->end;
1338         if (length <= map->root->end + map->root->adj_free - st) {
1339                 *addr = st;
1340                 goto found;
1341         }
1342
1343         /* With max_free, can immediately tell if no solution. */
1344         entry = map->root->right;
1345         if (entry == NULL || length > entry->max_free)
1346                 return (1);
1347
1348         /*
1349          * Search the right subtree in the order: left subtree, root,
1350          * right subtree (first fit).  The previous splay implies that
1351          * all regions in the right subtree have addresses > start.
1352          */
1353         while (entry != NULL) {
1354                 if (entry->left != NULL && entry->left->max_free >= length)
1355                         entry = entry->left;
1356                 else if (entry->adj_free >= length) {
1357                         *addr = entry->end;
1358                         goto found;
1359                 } else
1360                         entry = entry->right;
1361         }
1362
1363         /* Can't get here, so panic if we do. */
1364         panic("vm_map_findspace: max_free corrupt");
1365
1366 found:
1367         /* Expand the kernel pmap, if necessary. */
1368         if (map == kernel_map) {
1369                 end = round_page(*addr + length);
1370                 if (end > kernel_vm_end)
1371                         pmap_growkernel(end);
1372         }
1373         return (0);
1374 }
1375
1376 int
1377 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1378     vm_offset_t start, vm_size_t length, vm_prot_t prot,
1379     vm_prot_t max, int cow)
1380 {
1381         vm_offset_t end;
1382         int result;
1383
1384         end = start + length;
1385         vm_map_lock(map);
1386         VM_MAP_RANGE_CHECK(map, start, end);
1387         (void) vm_map_delete(map, start, end);
1388         result = vm_map_insert(map, object, offset, start, end, prot,
1389             max, cow);
1390         vm_map_unlock(map);
1391         return (result);
1392 }
1393
1394 /*
1395  *      vm_map_find finds an unallocated region in the target address
1396  *      map with the given length.  The search is defined to be
1397  *      first-fit from the specified address; the region found is
1398  *      returned in the same parameter.
1399  *
1400  *      If object is non-NULL, ref count must be bumped by caller
1401  *      prior to making call to account for the new entry.
1402  */
1403 int
1404 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1405             vm_offset_t *addr,  /* IN/OUT */
1406             vm_size_t length, int find_space, vm_prot_t prot,
1407             vm_prot_t max, int cow)
1408 {
1409         vm_offset_t start;
1410         int result;
1411
1412         start = *addr;
1413         vm_map_lock(map);
1414         do {
1415                 if (find_space != VMFS_NO_SPACE) {
1416                         if (vm_map_findspace(map, start, length, addr)) {
1417                                 vm_map_unlock(map);
1418                                 return (KERN_NO_SPACE);
1419                         }
1420                         if (find_space == VMFS_ALIGNED_SPACE)
1421                                 pmap_align_superpage(object, offset, addr,
1422                                     length);
1423                         start = *addr;
1424                 }
1425                 result = vm_map_insert(map, object, offset, start, start +
1426                     length, prot, max, cow);
1427         } while (result == KERN_NO_SPACE && find_space == VMFS_ALIGNED_SPACE);
1428         vm_map_unlock(map);
1429         return (result);
1430 }
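/*
 *      Usage sketch (illustrative only; VMFS_ANY_SPACE is assumed to be
 *      defined alongside VMFS_NO_SPACE and VMFS_ALIGNED_SPACE in
 *      vm_map.h): ask the map to place "size" bytes at or above "addr".
 */
#if 0
        rv = vm_map_find(map, object, 0, &addr, size, VMFS_ANY_SPACE,
            VM_PROT_ALL, VM_PROT_ALL, 0);
        if (rv != KERN_SUCCESS)
                return (ENOMEM);        /* callers map KERN_* codes to errnos */
#endif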
1431
1432 /*
1433  *      vm_map_simplify_entry:
1434  *
1435  *      Simplify the given map entry by merging with either neighbor.  This
1436  *      routine also has the ability to merge with both neighbors.
1437  *
1438  *      The map must be locked.
1439  *
1440  *      This routine guarantees that the passed entry remains valid (though
1441  *      possibly extended).  When merging, this routine may delete one or
1442  *      both neighbors.
1443  */
1444 void
1445 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1446 {
1447         vm_map_entry_t next, prev;
1448         vm_size_t prevsize, esize;
1449
1450         if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
1451                 return;
1452
1453         prev = entry->prev;
1454         if (prev != &map->header) {
1455                 prevsize = prev->end - prev->start;
1456                 if ( (prev->end == entry->start) &&
1457                      (prev->object.vm_object == entry->object.vm_object) &&
1458                      (!prev->object.vm_object ||
1459                         (prev->offset + prevsize == entry->offset)) &&
1460                      (prev->eflags == entry->eflags) &&
1461                      (prev->protection == entry->protection) &&
1462                      (prev->max_protection == entry->max_protection) &&
1463                      (prev->inheritance == entry->inheritance) &&
1464                      (prev->wired_count == entry->wired_count) &&
1465                      (prev->uip == entry->uip)) {
1466                         vm_map_entry_unlink(map, prev);
1467                         entry->start = prev->start;
1468                         entry->offset = prev->offset;
1469                         if (entry->prev != &map->header)
1470                                 vm_map_entry_resize_free(map, entry->prev);
1471
1472                         /*
1473                          * If the backing object is a vnode object,
1474                          * vm_object_deallocate() calls vrele().
1475                          * However, vrele() does not lock the vnode
1476                          * because the vnode has additional
1477                          * references.  Thus, the map lock can be kept
1478                          * without causing a lock-order reversal with
1479                          * the vnode lock.
1480                          */
1481                         if (prev->object.vm_object)
1482                                 vm_object_deallocate(prev->object.vm_object);
1483                         if (prev->uip != NULL)
1484                                 uifree(prev->uip);
1485                         vm_map_entry_dispose(map, prev);
1486                 }
1487         }
1488
1489         next = entry->next;
1490         if (next != &map->header) {
1491                 esize = entry->end - entry->start;
1492                 if ((entry->end == next->start) &&
1493                     (next->object.vm_object == entry->object.vm_object) &&
1494                      (!entry->object.vm_object ||
1495                         (entry->offset + esize == next->offset)) &&
1496                     (next->eflags == entry->eflags) &&
1497                     (next->protection == entry->protection) &&
1498                     (next->max_protection == entry->max_protection) &&
1499                     (next->inheritance == entry->inheritance) &&
1500                     (next->wired_count == entry->wired_count) &&
1501                     (next->uip == entry->uip)) {
1502                         vm_map_entry_unlink(map, next);
1503                         entry->end = next->end;
1504                         vm_map_entry_resize_free(map, entry);
1505
1506                         /*
1507                          * See comment above.
1508                          */
1509                         if (next->object.vm_object)
1510                                 vm_object_deallocate(next->object.vm_object);
1511                         if (next->uip != NULL)
1512                                 uifree(next->uip);
1513                         vm_map_entry_dispose(map, next);
1514                 }
1515         }
1516 }
1517 /*
1518  *      vm_map_clip_start:      [ internal use only ]
1519  *
1520  *      Asserts that the given entry begins at or after
1521  *      the specified address; if necessary,
1522  *      it splits the entry into two.
1523  */
1524 #define vm_map_clip_start(map, entry, startaddr) \
1525 { \
1526         if (startaddr > entry->start) \
1527                 _vm_map_clip_start(map, entry, startaddr); \
1528 }
1529
1530 /*
1531  *      This routine is called only when it is known that
1532  *      the entry must be split.
1533  */
1534 static void
1535 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1536 {
1537         vm_map_entry_t new_entry;
1538
1539         VM_MAP_ASSERT_LOCKED(map);
1540
1541         /*
1542          * Split off the front portion -- note that we must insert the new
1543          * entry BEFORE this one, so that this entry has the specified
1544          * starting address.
1545          */
1546         vm_map_simplify_entry(map, entry);
1547
1548         /*
1549          * If there is no object backing this entry, we might as well create
1550          * one now.  If we defer it, an object can get created after the map
1551          * is clipped, and individual objects will be created for the split-up
1552          * map.  This is a bit of a hack, but is also about the best place to
1553          * put this improvement.
1554          */
1555         if (entry->object.vm_object == NULL && !map->system_map) {
1556                 vm_object_t object;
1557                 object = vm_object_allocate(OBJT_DEFAULT,
1558                                 atop(entry->end - entry->start));
1559                 entry->object.vm_object = object;
1560                 entry->offset = 0;
1561                 if (entry->uip != NULL) {
1562                         object->uip = entry->uip;
1563                         object->charge = entry->end - entry->start;
1564                         entry->uip = NULL;
1565                 }
1566         } else if (entry->object.vm_object != NULL &&
1567                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1568                    entry->uip != NULL) {
1569                 VM_OBJECT_LOCK(entry->object.vm_object);
1570                 KASSERT(entry->object.vm_object->uip == NULL,
1571                     ("OVERCOMMIT: vm_entry_clip_start: both uip e %p", entry));
1572                 entry->object.vm_object->uip = entry->uip;
1573                 entry->object.vm_object->charge = entry->end - entry->start;
1574                 VM_OBJECT_UNLOCK(entry->object.vm_object);
1575                 entry->uip = NULL;
1576         }
1577
1578         new_entry = vm_map_entry_create(map);
1579         *new_entry = *entry;
1580
1581         new_entry->end = start;
1582         entry->offset += (start - entry->start);
1583         entry->start = start;
1584         if (new_entry->uip != NULL)
1585                 uihold(entry->uip);
1586
1587         vm_map_entry_link(map, entry->prev, new_entry);
1588
1589         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1590                 vm_object_reference(new_entry->object.vm_object);
1591         }
1592 }
1593
1594 /*
1595  *      vm_map_clip_end:        [ internal use only ]
1596  *
1597  *      Asserts that the given entry ends at or before
1598  *      the specified address; if necessary,
1599  *      it splits the entry into two.
1600  */
1601 #define vm_map_clip_end(map, entry, endaddr) \
1602 { \
1603         if ((endaddr) < (entry->end)) \
1604                 _vm_map_clip_end((map), (entry), (endaddr)); \
1605 }
1606
1607 /*
1608  *      This routine is called only when it is known that
1609  *      the entry must be split.
1610  */
1611 static void
1612 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1613 {
1614         vm_map_entry_t new_entry;
1615
1616         VM_MAP_ASSERT_LOCKED(map);
1617
1618         /*
1619          * If there is no object backing this entry, we might as well create
1620          * one now.  If we defer it, an object can get created after the map
1621          * is clipped, and individual objects will be created for the split-up
1622          * map.  This is a bit of a hack, but is also about the best place to
1623          * put this improvement.
1624          */
1625         if (entry->object.vm_object == NULL && !map->system_map) {
1626                 vm_object_t object;
1627                 object = vm_object_allocate(OBJT_DEFAULT,
1628                                 atop(entry->end - entry->start));
1629                 entry->object.vm_object = object;
1630                 entry->offset = 0;
1631                 if (entry->uip != NULL) {
1632                         object->uip = entry->uip;
1633                         object->charge = entry->end - entry->start;
1634                         entry->uip = NULL;
1635                 }
1636         } else if (entry->object.vm_object != NULL &&
1637                    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1638                    entry->uip != NULL) {
1639                 VM_OBJECT_LOCK(entry->object.vm_object);
1640                 KASSERT(entry->object.vm_object->uip == NULL,
1641                     ("OVERCOMMIT: vm_entry_clip_end: both uip e %p", entry));
1642                 entry->object.vm_object->uip = entry->uip;
1643                 entry->object.vm_object->charge = entry->end - entry->start;
1644                 VM_OBJECT_UNLOCK(entry->object.vm_object);
1645                 entry->uip = NULL;
1646         }
1647
1648         /*
1649          * Create a new entry and insert it AFTER the specified entry
1650          */
1651         new_entry = vm_map_entry_create(map);
1652         *new_entry = *entry;
1653
1654         new_entry->start = entry->end = end;
1655         new_entry->offset += (end - entry->start);
1656         if (new_entry->uip != NULL)
1657                 uihold(entry->uip);
1658
1659         vm_map_entry_link(map, entry, new_entry);
1660
1661         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1662                 vm_object_reference(new_entry->object.vm_object);
1663         }
1664 }
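
/*
 * Illustrative note (not from the original source): the clipping macros
 * above and vm_map_simplify_entry() are used together by every range
 * operation in this file.  The usual pattern, sketched below with an
 * inheritance update as the example change and the map held
 * exclusively, is to clip the boundary entries so they begin and end
 * exactly at [start, end), apply the change, and then re-merge
 * neighbors that have become identical again:
 *
 *	if (vm_map_lookup_entry(map, start, &entry))
 *		vm_map_clip_start(map, entry, start);
 *	else
 *		entry = entry->next;
 *	while ((entry != &map->header) && (entry->start < end)) {
 *		vm_map_clip_end(map, entry, end);
 *		entry->inheritance = new_inheritance;
 *		vm_map_simplify_entry(map, entry);
 *		entry = entry->next;
 *	}
 *
 * vm_map_inherit() below follows exactly this shape.
 */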
1665
1666 /*
1667  *      vm_map_submap:          [ kernel use only ]
1668  *
1669  *      Mark the given range as handled by a subordinate map.
1670  *
1671  *      This range must have been created with vm_map_find,
1672  *      and no other operations may have been performed on this
1673  *      range prior to calling vm_map_submap.
1674  *
1675  *      Only a limited number of operations can be performed
1676  *      within this range after calling vm_map_submap:
1677  *              vm_fault
1678  *      [Don't try vm_map_copy!]
1679  *
1680  *      To remove a submapping, one must first remove the
1681  *      range from the superior map, and then destroy the
1682  *      submap (if desired).  [Better yet, don't try it.]
1683  */
1684 int
1685 vm_map_submap(
1686         vm_map_t map,
1687         vm_offset_t start,
1688         vm_offset_t end,
1689         vm_map_t submap)
1690 {
1691         vm_map_entry_t entry;
1692         int result = KERN_INVALID_ARGUMENT;
1693
1694         vm_map_lock(map);
1695
1696         VM_MAP_RANGE_CHECK(map, start, end);
1697
1698         if (vm_map_lookup_entry(map, start, &entry)) {
1699                 vm_map_clip_start(map, entry, start);
1700         } else
1701                 entry = entry->next;
1702
1703         vm_map_clip_end(map, entry, end);
1704
1705         if ((entry->start == start) && (entry->end == end) &&
1706             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1707             (entry->object.vm_object == NULL)) {
1708                 entry->object.sub_map = submap;
1709                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1710                 result = KERN_SUCCESS;
1711         }
1712         vm_map_unlock(map);
1713
1714         return (result);
1715 }
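
/*
 * Illustrative sketch (not part of this module): a kernel subsystem
 * installs a submap by first reserving the range in the parent map,
 * e.g. with vm_map_find() and a NULL object as required above, and
 * then handing the range over.  The variable names are hypothetical.
 *
 *	if (vm_map_submap(parent_map, start, start + size, submap) !=
 *	    KERN_SUCCESS)
 *		panic("vm_map_submap failed");
 */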
1716
1717 /*
1718  * The maximum number of pages to map
1719  */
1720 #define MAX_INIT_PT     96
1721
1722 /*
1723  *      vm_map_pmap_enter:
1724  *
1725  *      Preload read-only mappings for the given object's resident pages into
1726  *      the given map.  This eliminates the soft faults on process startup and
1727  *      immediately after an mmap(2).  Because these are speculative mappings,
1728  *      cached pages are not reactivated and mapped.
1729  */
1730 void
1731 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1732     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1733 {
1734         vm_offset_t start;
1735         vm_page_t p, p_start;
1736         vm_pindex_t psize, tmpidx;
1737         boolean_t are_queues_locked;
1738
1739         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1740                 return;
1741         VM_OBJECT_LOCK(object);
1742         if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1743                 pmap_object_init_pt(map->pmap, addr, object, pindex, size);
1744                 goto unlock_return;
1745         }
1746
1747         psize = atop(size);
1748
1749         if ((flags & MAP_PREFAULT_PARTIAL) && psize > MAX_INIT_PT &&
1750             object->resident_page_count > MAX_INIT_PT)
1751                 goto unlock_return;
1752
1753         if (psize + pindex > object->size) {
1754                 if (object->size < pindex)
1755                         goto unlock_return;
1756                 psize = object->size - pindex;
1757         }
1758
1759         are_queues_locked = FALSE;
1760         start = 0;
1761         p_start = NULL;
1762
1763         p = vm_page_find_least(object, pindex);
1764         /*
1765          * Assert: the variable p is either (1) the page with the
1766          * least pindex greater than or equal to the parameter pindex
1767          * or (2) NULL.
1768          */
1769         for (;
1770              p != NULL && (tmpidx = p->pindex - pindex) < psize;
1771              p = TAILQ_NEXT(p, listq)) {
1772                 /*
1773                  * Don't allow a madvise prefault to blow away the free
1774                  * page reserve by allocating pv entries.
1775                  */
1776                 if ((flags & MAP_PREFAULT_MADVISE) &&
1777                     cnt.v_free_count < cnt.v_free_reserved) {
1778                         psize = tmpidx;
1779                         break;
1780                 }
1781                 if (p->valid == VM_PAGE_BITS_ALL) {
1782                         if (p_start == NULL) {
1783                                 start = addr + ptoa(tmpidx);
1784                                 p_start = p;
1785                         }
1786                 } else if (p_start != NULL) {
1787                         if (!are_queues_locked) {
1788                                 are_queues_locked = TRUE;
1789                                 vm_page_lock_queues();
1790                         }
1791                         pmap_enter_object(map->pmap, start, addr +
1792                             ptoa(tmpidx), p_start, prot);
1793                         p_start = NULL;
1794                 }
1795         }
1796         if (p_start != NULL) {
1797                 if (!are_queues_locked) {
1798                         are_queues_locked = TRUE;
1799                         vm_page_lock_queues();
1800                 }
1801                 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
1802                     p_start, prot);
1803         }
1804         if (are_queues_locked)
1805                 vm_page_unlock_queues();
1806 unlock_return:
1807         VM_OBJECT_UNLOCK(object);
1808 }
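
/*
 * Illustrative note: the MADV_WILLNEED case of vm_map_madvise() below
 * is the typical caller of vm_map_pmap_enter().  For a sub-range
 * [start, end) of a map entry, the object page index is derived from
 * the entry's offset before the call, roughly:
 *
 *	pindex = OFF_TO_IDX(entry->offset) + atop(start - entry->start);
 *	vm_map_pmap_enter(map, start, entry->protection,
 *	    entry->object.vm_object, pindex, end - start,
 *	    MAP_PREFAULT_MADVISE);
 */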
1809
1810 /*
1811  *      vm_map_protect:
1812  *
1813  *      Sets the protection of the specified address
1814  *      region in the target map.  If "set_max" is
1815  *      specified, the maximum protection is to be set;
1816  *      otherwise, only the current protection is affected.
1817  */
1818 int
1819 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1820                vm_prot_t new_prot, boolean_t set_max)
1821 {
1822         vm_map_entry_t current, entry;
1823         vm_object_t obj;
1824         struct uidinfo *uip;
1825         vm_prot_t old_prot;
1826
1827         vm_map_lock(map);
1828
1829         VM_MAP_RANGE_CHECK(map, start, end);
1830
1831         if (vm_map_lookup_entry(map, start, &entry)) {
1832                 vm_map_clip_start(map, entry, start);
1833         } else {
1834                 entry = entry->next;
1835         }
1836
1837         /*
1838          * Make a first pass to check for protection violations.
1839          */
1840         current = entry;
1841         while ((current != &map->header) && (current->start < end)) {
1842                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1843                         vm_map_unlock(map);
1844                         return (KERN_INVALID_ARGUMENT);
1845                 }
1846                 if ((new_prot & current->max_protection) != new_prot) {
1847                         vm_map_unlock(map);
1848                         return (KERN_PROTECTION_FAILURE);
1849                 }
1850                 current = current->next;
1851         }
1852
1853
1854         /*
1855          * Do an accounting pass for private read-only mappings that
1856          * will now do copy-on-write due to the newly allowed write
1857          * access (e.g. a debugger setting a breakpoint in the text segment).
1858          */
1859         for (current = entry; (current != &map->header) &&
1860              (current->start < end); current = current->next) {
1861
1862                 vm_map_clip_end(map, current, end);
1863
1864                 if (set_max ||
1865                     ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
1866                     ENTRY_CHARGED(current)) {
1867                         continue;
1868                 }
1869
1870                 uip = curthread->td_ucred->cr_ruidinfo;
1871                 obj = current->object.vm_object;
1872
1873                 if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
1874                         if (!swap_reserve(current->end - current->start)) {
1875                                 vm_map_unlock(map);
1876                                 return (KERN_RESOURCE_SHORTAGE);
1877                         }
1878                         uihold(uip);
1879                         current->uip = uip;
1880                         continue;
1881                 }
1882
1883                 VM_OBJECT_LOCK(obj);
1884                 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
1885                         VM_OBJECT_UNLOCK(obj);
1886                         continue;
1887                 }
1888
1889                 /*
1890                  * Charge for the whole object allocation now, since
1891                  * we cannot distinguish between non-charged and
1892                  * charged clipped mapping of the same object later.
1893                  */
1894                 KASSERT(obj->charge == 0,
1895                     ("vm_map_protect: object %p overcharged\n", obj));
1896                 if (!swap_reserve(ptoa(obj->size))) {
1897                         VM_OBJECT_UNLOCK(obj);
1898                         vm_map_unlock(map);
1899                         return (KERN_RESOURCE_SHORTAGE);
1900                 }
1901
1902                 uihold(uip);
1903                 obj->uip = uip;
1904                 obj->charge = ptoa(obj->size);
1905                 VM_OBJECT_UNLOCK(obj);
1906         }
1907
1908         /*
1909          * Go back and fix up protections. [Note that clipping is not
1910          * necessary the second time.]
1911          */
1912         current = entry;
1913         while ((current != &map->header) && (current->start < end)) {
1914                 old_prot = current->protection;
1915
1916                 if (set_max)
1917                         current->protection =
1918                             (current->max_protection = new_prot) &
1919                             old_prot;
1920                 else
1921                         current->protection = new_prot;
1922
1923                 if ((current->eflags & (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED))
1924                      == (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED) &&
1925                     (current->protection & VM_PROT_WRITE) != 0 &&
1926                     (old_prot & VM_PROT_WRITE) == 0) {
1927                         vm_fault_copy_entry(map, map, current, current, NULL);
1928                 }
1929
1930                 /*
1931                  * Update physical map if necessary. Worry about copy-on-write
1932                  * here.
1933                  */
1934                 if (current->protection != old_prot) {
1935 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1936                                                         VM_PROT_ALL)
1937                         pmap_protect(map->pmap, current->start,
1938                             current->end,
1939                             current->protection & MASK(current));
1940 #undef  MASK
1941                 }
1942                 vm_map_simplify_entry(map, current);
1943                 current = current->next;
1944         }
1945         vm_map_unlock(map);
1946         return (KERN_SUCCESS);
1947 }
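
/*
 * Illustrative sketch: mprotect(2) is the main consumer of
 * vm_map_protect().  Changing only the current protection of a
 * page-aligned user range looks roughly like the following; the
 * variable names and the error mapping are simplified.
 *
 *	rv = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
 *	    trunc_page(addr), round_page(addr + len),
 *	    VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *	if (rv == KERN_PROTECTION_FAILURE)
 *		return (EACCES);
 */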
1948
1949 /*
1950  *      vm_map_madvise:
1951  *
1952  *      This routine traverses a process's map handling the madvise
1953  *      system call.  Advisories are classified as either those affecting
1954  *      the vm_map_entry structure, or those affecting the underlying
1955  *      objects.
1956  */
1957 int
1958 vm_map_madvise(
1959         vm_map_t map,
1960         vm_offset_t start,
1961         vm_offset_t end,
1962         int behav)
1963 {
1964         vm_map_entry_t current, entry;
1965         int modify_map = 0;
1966
1967         /*
1968          * Some madvise calls directly modify the vm_map_entry, in which case
1969          * we need to use an exclusive lock on the map and we need to perform
1970          * various clipping operations.  Otherwise we only need a read-lock
1971          * on the map.
1972          */
1973         switch(behav) {
1974         case MADV_NORMAL:
1975         case MADV_SEQUENTIAL:
1976         case MADV_RANDOM:
1977         case MADV_NOSYNC:
1978         case MADV_AUTOSYNC:
1979         case MADV_NOCORE:
1980         case MADV_CORE:
1981                 modify_map = 1;
1982                 vm_map_lock(map);
1983                 break;
1984         case MADV_WILLNEED:
1985         case MADV_DONTNEED:
1986         case MADV_FREE:
1987                 vm_map_lock_read(map);
1988                 break;
1989         default:
1990                 return (KERN_INVALID_ARGUMENT);
1991         }
1992
1993         /*
1994          * Locate starting entry and clip if necessary.
1995          */
1996         VM_MAP_RANGE_CHECK(map, start, end);
1997
1998         if (vm_map_lookup_entry(map, start, &entry)) {
1999                 if (modify_map)
2000                         vm_map_clip_start(map, entry, start);
2001         } else {
2002                 entry = entry->next;
2003         }
2004
2005         if (modify_map) {
2006                 /*
2007                  * madvise behaviors that are implemented in the vm_map_entry.
2008                  *
2009                  * We clip the vm_map_entry so that behavioral changes are
2010                  * limited to the specified address range.
2011                  */
2012                 for (current = entry;
2013                      (current != &map->header) && (current->start < end);
2014                      current = current->next
2015                 ) {
2016                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2017                                 continue;
2018
2019                         vm_map_clip_end(map, current, end);
2020
2021                         switch (behav) {
2022                         case MADV_NORMAL:
2023                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2024                                 break;
2025                         case MADV_SEQUENTIAL:
2026                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2027                                 break;
2028                         case MADV_RANDOM:
2029                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2030                                 break;
2031                         case MADV_NOSYNC:
2032                                 current->eflags |= MAP_ENTRY_NOSYNC;
2033                                 break;
2034                         case MADV_AUTOSYNC:
2035                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
2036                                 break;
2037                         case MADV_NOCORE:
2038                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
2039                                 break;
2040                         case MADV_CORE:
2041                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2042                                 break;
2043                         default:
2044                                 break;
2045                         }
2046                         vm_map_simplify_entry(map, current);
2047                 }
2048                 vm_map_unlock(map);
2049         } else {
2050                 vm_pindex_t pindex;
2051                 int count;
2052
2053                 /*
2054                  * madvise behaviors that are implemented in the underlying
2055                  * vm_object.
2056                  *
2057                  * Since we don't clip the vm_map_entry, we have to clip
2058                  * the vm_object pindex and count.
2059                  */
2060                 for (current = entry;
2061                      (current != &map->header) && (current->start < end);
2062                      current = current->next
2063                 ) {
2064                         vm_offset_t useStart;
2065
2066                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2067                                 continue;
2068
2069                         pindex = OFF_TO_IDX(current->offset);
2070                         count = atop(current->end - current->start);
2071                         useStart = current->start;
2072
2073                         if (current->start < start) {
2074                                 pindex += atop(start - current->start);
2075                                 count -= atop(start - current->start);
2076                                 useStart = start;
2077                         }
2078                         if (current->end > end)
2079                                 count -= atop(current->end - end);
2080
2081                         if (count <= 0)
2082                                 continue;
2083
2084                         vm_object_madvise(current->object.vm_object,
2085                                           pindex, count, behav);
2086                         if (behav == MADV_WILLNEED) {
2087                                 vm_map_pmap_enter(map,
2088                                     useStart,
2089                                     current->protection,
2090                                     current->object.vm_object,
2091                                     pindex,
2092                                     (count << PAGE_SHIFT),
2093                                     MAP_PREFAULT_MADVISE
2094                                 );
2095                         }
2096                 }
2097                 vm_map_unlock_read(map);
2098         }
2099         return (0);
2100 }
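
/*
 * Illustrative sketch: madvise(2) reduces to a single call here once
 * the range is page aligned; map-entry advisories (e.g. MADV_NOSYNC)
 * take the exclusive-lock path above and object advisories (e.g.
 * MADV_WILLNEED) the read-lock path.  Variable names are hypothetical.
 *
 *	if (vm_map_madvise(&td->td_proc->p_vmspace->vm_map,
 *	    trunc_page(addr), round_page(addr + len), MADV_WILLNEED) != 0)
 *		return (EINVAL);
 */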
2101
2102
2103 /*
2104  *      vm_map_inherit:
2105  *
2106  *      Sets the inheritance of the specified address
2107  *      range in the target map.  Inheritance
2108  *      affects how the map will be shared with
2109  *      child maps at the time of vmspace_fork.
2110  */
2111 int
2112 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2113                vm_inherit_t new_inheritance)
2114 {
2115         vm_map_entry_t entry;
2116         vm_map_entry_t temp_entry;
2117
2118         switch (new_inheritance) {
2119         case VM_INHERIT_NONE:
2120         case VM_INHERIT_COPY:
2121         case VM_INHERIT_SHARE:
2122                 break;
2123         default:
2124                 return (KERN_INVALID_ARGUMENT);
2125         }
2126         vm_map_lock(map);
2127         VM_MAP_RANGE_CHECK(map, start, end);
2128         if (vm_map_lookup_entry(map, start, &temp_entry)) {
2129                 entry = temp_entry;
2130                 vm_map_clip_start(map, entry, start);
2131         } else
2132                 entry = temp_entry->next;
2133         while ((entry != &map->header) && (entry->start < end)) {
2134                 vm_map_clip_end(map, entry, end);
2135                 entry->inheritance = new_inheritance;
2136                 vm_map_simplify_entry(map, entry);
2137                 entry = entry->next;
2138         }
2139         vm_map_unlock(map);
2140         return (KERN_SUCCESS);
2141 }
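
/*
 * Illustrative sketch: minherit(2) passes its argument straight
 * through; only VM_INHERIT_NONE, VM_INHERIT_COPY and VM_INHERIT_SHARE
 * are accepted.  Marking a region to be left unmapped in future
 * children would look roughly like:
 *
 *	(void)vm_map_inherit(&p->p_vmspace->vm_map, trunc_page(addr),
 *	    round_page(addr + len), VM_INHERIT_NONE);
 */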
2142
2143 /*
2144  *      vm_map_unwire:
2145  *
2146  *      Implements both kernel and user unwiring.
2147  */
2148 int
2149 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2150     int flags)
2151 {
2152         vm_map_entry_t entry, first_entry, tmp_entry;
2153         vm_offset_t saved_start;
2154         unsigned int last_timestamp;
2155         int rv;
2156         boolean_t need_wakeup, result, user_unwire;
2157
2158         user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2159         vm_map_lock(map);
2160         VM_MAP_RANGE_CHECK(map, start, end);
2161         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2162                 if (flags & VM_MAP_WIRE_HOLESOK)
2163                         first_entry = first_entry->next;
2164                 else {
2165                         vm_map_unlock(map);
2166                         return (KERN_INVALID_ADDRESS);
2167                 }
2168         }
2169         last_timestamp = map->timestamp;
2170         entry = first_entry;
2171         while (entry != &map->header && entry->start < end) {
2172                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2173                         /*
2174                          * We have not yet clipped the entry.
2175                          */
2176                         saved_start = (start >= entry->start) ? start :
2177                             entry->start;
2178                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2179                         if (vm_map_unlock_and_wait(map, 0)) {
2180                                 /*
2181                                  * Allow interruption of user unwiring?
2182                                  */
2183                         }
2184                         vm_map_lock(map);
2185                         if (last_timestamp+1 != map->timestamp) {
2186                                 /*
2187                                  * Look again for the entry because the map was
2188                                  * modified while it was unlocked.
2189                                  * Specifically, the entry may have been
2190                                  * clipped, merged, or deleted.
2191                                  */
2192                                 if (!vm_map_lookup_entry(map, saved_start,
2193                                     &tmp_entry)) {
2194                                         if (flags & VM_MAP_WIRE_HOLESOK)
2195                                                 tmp_entry = tmp_entry->next;
2196                                         else {
2197                                                 if (saved_start == start) {
2198                                                         /*
2199                                                          * first_entry has been deleted.
2200                                                          */
2201                                                         vm_map_unlock(map);
2202                                                         return (KERN_INVALID_ADDRESS);
2203                                                 }
2204                                                 end = saved_start;
2205                                                 rv = KERN_INVALID_ADDRESS;
2206                                                 goto done;
2207                                         }
2208                                 }
2209                                 if (entry == first_entry)
2210                                         first_entry = tmp_entry;
2211                                 else
2212                                         first_entry = NULL;
2213                                 entry = tmp_entry;
2214                         }
2215                         last_timestamp = map->timestamp;
2216                         continue;
2217                 }
2218                 vm_map_clip_start(map, entry, start);
2219                 vm_map_clip_end(map, entry, end);
2220                 /*
2221                  * Mark the entry in case the map lock is released.  (See
2222                  * above.)
2223                  */
2224                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2225                 /*
2226                  * Check the map for holes in the specified region.
2227                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2228                  */
2229                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2230                     (entry->end < end && (entry->next == &map->header ||
2231                     entry->next->start > entry->end))) {
2232                         end = entry->end;
2233                         rv = KERN_INVALID_ADDRESS;
2234                         goto done;
2235                 }
2236                 /*
2237                  * If system unwiring, require that the entry is system wired.
2238                  */
2239                 if (!user_unwire &&
2240                     vm_map_entry_system_wired_count(entry) == 0) {
2241                         end = entry->end;
2242                         rv = KERN_INVALID_ARGUMENT;
2243                         goto done;
2244                 }
2245                 entry = entry->next;
2246         }
2247         rv = KERN_SUCCESS;
2248 done:
2249         need_wakeup = FALSE;
2250         if (first_entry == NULL) {
2251                 result = vm_map_lookup_entry(map, start, &first_entry);
2252                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2253                         first_entry = first_entry->next;
2254                 else
2255                         KASSERT(result, ("vm_map_unwire: lookup failed"));
2256         }
2257         entry = first_entry;
2258         while (entry != &map->header && entry->start < end) {
2259                 if (rv == KERN_SUCCESS && (!user_unwire ||
2260                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2261                         if (user_unwire)
2262                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2263                         entry->wired_count--;
2264                         if (entry->wired_count == 0) {
2265                                 /*
2266                                  * Retain the map lock.
2267                                  */
2268                                 vm_fault_unwire(map, entry->start, entry->end,
2269                                     entry->object.vm_object != NULL &&
2270                                     (entry->object.vm_object->type == OBJT_DEVICE ||
2271                                     entry->object.vm_object->type == OBJT_SG));
2272                         }
2273                 }
2274                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2275                         ("vm_map_unwire: in-transition flag missing"));
2276                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2277                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2278                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2279                         need_wakeup = TRUE;
2280                 }
2281                 vm_map_simplify_entry(map, entry);
2282                 entry = entry->next;
2283         }
2284         vm_map_unlock(map);
2285         if (need_wakeup)
2286                 vm_map_wakeup(map);
2287         return (rv);
2288 }
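
/*
 * Illustrative sketch: munlock(2) performs a user unwire of an exact,
 * hole-free range, whereas kernel callers that wired with
 * VM_MAP_WIRE_SYSTEM omit VM_MAP_WIRE_USER.  A munlock-style call,
 * with hypothetical variable names:
 *
 *	rv = vm_map_unwire(&td->td_proc->p_vmspace->vm_map,
 *	    trunc_page(addr), round_page(addr + len),
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 */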
2289
2290 /*
2291  *      vm_map_wire:
2292  *
2293  *      Implements both kernel and user wiring.
2294  */
2295 int
2296 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2297     int flags)
2298 {
2299         vm_map_entry_t entry, first_entry, tmp_entry;
2300         vm_offset_t saved_end, saved_start;
2301         unsigned int last_timestamp;
2302         int rv;
2303         boolean_t fictitious, need_wakeup, result, user_wire;
2304
2305         user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2306         vm_map_lock(map);
2307         VM_MAP_RANGE_CHECK(map, start, end);
2308         if (!vm_map_lookup_entry(map, start, &first_entry)) {
2309                 if (flags & VM_MAP_WIRE_HOLESOK)
2310                         first_entry = first_entry->next;
2311                 else {
2312                         vm_map_unlock(map);
2313                         return (KERN_INVALID_ADDRESS);
2314                 }
2315         }
2316         last_timestamp = map->timestamp;
2317         entry = first_entry;
2318         while (entry != &map->header && entry->start < end) {
2319                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2320                         /*
2321                          * We have not yet clipped the entry.
2322                          */
2323                         saved_start = (start >= entry->start) ? start :
2324                             entry->start;
2325                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2326                         if (vm_map_unlock_and_wait(map, 0)) {
2327                                 /*
2328                                  * Allow interruption of user wiring?
2329                                  */
2330                         }
2331                         vm_map_lock(map);
2332                         if (last_timestamp + 1 != map->timestamp) {
2333                                 /*
2334                                  * Look again for the entry because the map was
2335                                  * modified while it was unlocked.
2336                                  * Specifically, the entry may have been
2337                                  * clipped, merged, or deleted.
2338                                  */
2339                                 if (!vm_map_lookup_entry(map, saved_start,
2340                                     &tmp_entry)) {
2341                                         if (flags & VM_MAP_WIRE_HOLESOK)
2342                                                 tmp_entry = tmp_entry->next;
2343                                         else {
2344                                                 if (saved_start == start) {
2345                                                         /*
2346                                                          * first_entry has been deleted.
2347                                                          */
2348                                                         vm_map_unlock(map);
2349                                                         return (KERN_INVALID_ADDRESS);
2350                                                 }
2351                                                 end = saved_start;
2352                                                 rv = KERN_INVALID_ADDRESS;
2353                                                 goto done;
2354                                         }
2355                                 }
2356                                 if (entry == first_entry)
2357                                         first_entry = tmp_entry;
2358                                 else
2359                                         first_entry = NULL;
2360                                 entry = tmp_entry;
2361                         }
2362                         last_timestamp = map->timestamp;
2363                         continue;
2364                 }
2365                 vm_map_clip_start(map, entry, start);
2366                 vm_map_clip_end(map, entry, end);
2367                 /*
2368                  * Mark the entry in case the map lock is released.  (See
2369                  * above.)
2370                  */
2371                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2372                 /*
2373                  * Wire the entry if it is not already wired.
2374                  */
2375                 if (entry->wired_count == 0) {
2376                         if ((entry->protection & (VM_PROT_READ|VM_PROT_EXECUTE))
2377                             == 0) {
2378                                 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2379                                 if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2380                                         end = entry->end;
2381                                         rv = KERN_INVALID_ADDRESS;
2382                                         goto done;
2383                                 }
2384                                 goto next_entry;
2385                         }
2386                         entry->wired_count++;
2387                         saved_start = entry->start;
2388                         saved_end = entry->end;
2389                         fictitious = entry->object.vm_object != NULL &&
2390                             (entry->object.vm_object->type == OBJT_DEVICE ||
2391                             entry->object.vm_object->type == OBJT_SG);
2392                         /*
2393                          * Release the map lock, relying on the in-transition
2394                          * mark.
2395                          */
2396                         vm_map_unlock(map);
2397                         rv = vm_fault_wire(map, saved_start, saved_end,
2398                             user_wire, fictitious);
2399                         vm_map_lock(map);
2400                         if (last_timestamp + 1 != map->timestamp) {
2401                                 /*
2402                                  * Look again for the entry because the map was
2403                                  * modified while it was unlocked.  The entry
2404                                  * may have been clipped, but NOT merged or
2405                                  * deleted.
2406                                  */
2407                                 result = vm_map_lookup_entry(map, saved_start,
2408                                     &tmp_entry);
2409                                 KASSERT(result, ("vm_map_wire: lookup failed"));
2410                                 if (entry == first_entry)
2411                                         first_entry = tmp_entry;
2412                                 else
2413                                         first_entry = NULL;
2414                                 entry = tmp_entry;
2415                                 while (entry->end < saved_end) {
2416                                         if (rv != KERN_SUCCESS) {
2417                                                 KASSERT(entry->wired_count == 1,
2418                                                     ("vm_map_wire: bad count"));
2419                                                 entry->wired_count = -1;
2420                                         }
2421                                         entry = entry->next;
2422                                 }
2423                         }
2424                         last_timestamp = map->timestamp;
2425                         if (rv != KERN_SUCCESS) {
2426                                 KASSERT(entry->wired_count == 1,
2427                                     ("vm_map_wire: bad count"));
2428                                 /*
2429                                  * Assign an out-of-range value to represent
2430                                  * the failure to wire this entry.
2431                                  */
2432                                 entry->wired_count = -1;
2433                                 end = entry->end;
2434                                 goto done;
2435                         }
2436                 } else if (!user_wire ||
2437                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2438                         entry->wired_count++;
2439                 }
2440                 /*
2441                  * Check the map for holes in the specified region.
2442                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2443                  */
2444         next_entry:
2445                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2446                     (entry->end < end && (entry->next == &map->header ||
2447                     entry->next->start > entry->end))) {
2448                         end = entry->end;
2449                         rv = KERN_INVALID_ADDRESS;
2450                         goto done;
2451                 }
2452                 entry = entry->next;
2453         }
2454         rv = KERN_SUCCESS;
2455 done:
2456         need_wakeup = FALSE;
2457         if (first_entry == NULL) {
2458                 result = vm_map_lookup_entry(map, start, &first_entry);
2459                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2460                         first_entry = first_entry->next;
2461                 else
2462                         KASSERT(result, ("vm_map_wire: lookup failed"));
2463         }
2464         entry = first_entry;
2465         while (entry != &map->header && entry->start < end) {
2466                 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
2467                         goto next_entry_done;
2468                 if (rv == KERN_SUCCESS) {
2469                         if (user_wire)
2470                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2471                 } else if (entry->wired_count == -1) {
2472                         /*
2473                          * Wiring failed on this entry.  Thus, unwiring is
2474                          * unnecessary.
2475                          */
2476                         entry->wired_count = 0;
2477                 } else {
2478                         if (!user_wire ||
2479                             (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
2480                                 entry->wired_count--;
2481                         if (entry->wired_count == 0) {
2482                                 /*
2483                                  * Retain the map lock.
2484                                  */
2485                                 vm_fault_unwire(map, entry->start, entry->end,
2486                                     entry->object.vm_object != NULL &&
2487                                     (entry->object.vm_object->type == OBJT_DEVICE ||
2488                                     entry->object.vm_object->type == OBJT_SG));
2489                         }
2490                 }
2491         next_entry_done:
2492                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2493                         ("vm_map_wire: in-transition flag missing"));
2494                 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION|MAP_ENTRY_WIRE_SKIPPED);
2495                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2496                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2497                         need_wakeup = TRUE;
2498                 }
2499                 vm_map_simplify_entry(map, entry);
2500                 entry = entry->next;
2501         }
2502         vm_map_unlock(map);
2503         if (need_wakeup)
2504                 vm_map_wakeup(map);
2505         return (rv);
2506 }
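
/*
 * Illustrative sketch: mlock(2) is the classic user-wiring caller;
 * kernel code wires its own ranges with VM_MAP_WIRE_SYSTEM instead.
 * An mlock-style call, with hypothetical variable names:
 *
 *	rv = vm_map_wire(&td->td_proc->p_vmspace->vm_map,
 *	    trunc_page(addr), round_page(addr + len),
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 *	if (rv != KERN_SUCCESS)
 *		return (ENOMEM);
 */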
2507
2508 /*
2509  * vm_map_sync
2510  *
2511  * Push any dirty cached pages in the address range to their pager.
2512  * If syncio is TRUE, dirty pages are written synchronously.
2513  * If invalidate is TRUE, any cached pages are freed as well.
2514  *
2515  * If the size of the region from start to end is zero, we are
2516  * supposed to flush all modified pages within the region containing
2517  * start.  Unfortunately, a region can be split or coalesced with
2518  * neighboring regions, making it difficult to determine what the
2519  * original region was.  Therefore, we approximate this requirement by
2520  * flushing the current region containing start.
2521  *
2522  * Returns an error if any part of the specified range is not mapped.
2523  */
2524 int
2525 vm_map_sync(
2526         vm_map_t map,
2527         vm_offset_t start,
2528         vm_offset_t end,
2529         boolean_t syncio,
2530         boolean_t invalidate)
2531 {
2532         vm_map_entry_t current;
2533         vm_map_entry_t entry;
2534         vm_size_t size;
2535         vm_object_t object;
2536         vm_ooffset_t offset;
2537         unsigned int last_timestamp;
2538
2539         vm_map_lock_read(map);
2540         VM_MAP_RANGE_CHECK(map, start, end);
2541         if (!vm_map_lookup_entry(map, start, &entry)) {
2542                 vm_map_unlock_read(map);
2543                 return (KERN_INVALID_ADDRESS);
2544         } else if (start == end) {
2545                 start = entry->start;
2546                 end = entry->end;
2547         }
2548         /*
2549          * Make a first pass to check for user-wired memory and holes.
2550          */
2551         for (current = entry; current != &map->header && current->start < end;
2552             current = current->next) {
2553                 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2554                         vm_map_unlock_read(map);
2555                         return (KERN_INVALID_ARGUMENT);
2556                 }
2557                 if (end > current->end &&
2558                     (current->next == &map->header ||
2559                         current->end != current->next->start)) {
2560                         vm_map_unlock_read(map);
2561                         return (KERN_INVALID_ADDRESS);
2562                 }
2563         }
2564
2565         if (invalidate)
2566                 pmap_remove(map->pmap, start, end);
2567
2568         /*
2569          * Make a second pass, cleaning/uncaching pages from the indicated
2570          * objects as we go.
2571          */
2572         for (current = entry; current != &map->header && current->start < end;) {
2573                 offset = current->offset + (start - current->start);
2574                 size = (end <= current->end ? end : current->end) - start;
2575                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2576                         vm_map_t smap;
2577                         vm_map_entry_t tentry;
2578                         vm_size_t tsize;
2579
2580                         smap = current->object.sub_map;
2581                         vm_map_lock_read(smap);
2582                         (void) vm_map_lookup_entry(smap, offset, &tentry);
2583                         tsize = tentry->end - offset;
2584                         if (tsize < size)
2585                                 size = tsize;
2586                         object = tentry->object.vm_object;
2587                         offset = tentry->offset + (offset - tentry->start);
2588                         vm_map_unlock_read(smap);
2589                 } else {
2590                         object = current->object.vm_object;
2591                 }
2592                 vm_object_reference(object);
2593                 last_timestamp = map->timestamp;
2594                 vm_map_unlock_read(map);
2595                 vm_object_sync(object, offset, size, syncio, invalidate);
2596                 start += size;
2597                 vm_object_deallocate(object);
2598                 vm_map_lock_read(map);
2599                 if (last_timestamp == map->timestamp ||
2600                     !vm_map_lookup_entry(map, start, &current))
2601                         current = current->next;
2602         }
2603
2604         vm_map_unlock_read(map);
2605         return (KERN_SUCCESS);
2606 }
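
/*
 * Illustrative sketch: msync(2) derives the syncio and invalidate
 * arguments from its MS_* flags.  Flushing a page-aligned range
 * synchronously (syncio = TRUE) while keeping cached pages
 * (invalidate = FALSE) would look roughly like this, with hypothetical
 * variable names; rv is one of the KERN_* codes described above.
 *
 *	rv = vm_map_sync(&td->td_proc->p_vmspace->vm_map,
 *	    trunc_page(addr), round_page(addr + len), TRUE, FALSE);
 */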
2607
2608 /*
2609  *      vm_map_entry_unwire:    [ internal use only ]
2610  *
2611  *      Make the region specified by this entry pageable.
2612  *
2613  *      The map in question should be locked.
2614  *      [This is the reason for this routine's existence.]
2615  */
2616 static void
2617 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2618 {
2619         vm_fault_unwire(map, entry->start, entry->end,
2620             entry->object.vm_object != NULL &&
2621             (entry->object.vm_object->type == OBJT_DEVICE ||
2622             entry->object.vm_object->type == OBJT_SG));
2623         entry->wired_count = 0;
2624 }
2625
2626 static void
2627 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
2628 {
2629
2630         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
2631                 vm_object_deallocate(entry->object.vm_object);
2632         uma_zfree(system_map ? kmapentzone : mapentzone, entry);
2633 }
2634
2635 /*
2636  *      vm_map_entry_delete:    [ internal use only ]
2637  *
2638  *      Deallocate the given entry from the target map.
2639  */
2640 static void
2641 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2642 {
2643         vm_object_t object;
2644         vm_pindex_t offidxstart, offidxend, count, size1;
2645         vm_ooffset_t size;
2646
2647         vm_map_entry_unlink(map, entry);
2648         object = entry->object.vm_object;
2649         size = entry->end - entry->start;
2650         map->size -= size;
2651
2652         if (entry->uip != NULL) {
2653                 swap_release_by_uid(size, entry->uip);
2654                 uifree(entry->uip);
2655         }
2656
2657         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2658             (object != NULL)) {
2659                 KASSERT(entry->uip == NULL || object->uip == NULL ||
2660                     (entry->eflags & MAP_ENTRY_NEEDS_COPY),
2661                     ("OVERCOMMIT vm_map_entry_delete: both uip %p", entry));
2662                 count = OFF_TO_IDX(size);
2663                 offidxstart = OFF_TO_IDX(entry->offset);
2664                 offidxend = offidxstart + count;
2665                 VM_OBJECT_LOCK(object);
2666                 if (object->ref_count != 1 &&
2667                     ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
2668                     object == kernel_object || object == kmem_object)) {
2669                         vm_object_collapse(object);
2670                         vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2671                         if (object->type == OBJT_SWAP)
2672                                 swap_pager_freespace(object, offidxstart, count);
2673                         if (offidxend >= object->size &&
2674                             offidxstart < object->size) {
2675                                 size1 = object->size;
2676                                 object->size = offidxstart;
2677                                 if (object->uip != NULL) {
2678                                         size1 -= object->size;
2679                                         KASSERT(object->charge >= ptoa(size1),
2680                                             ("vm_map_entry_delete: object->charge < 0"));
2681                                         swap_release_by_uid(ptoa(size1), object->uip);
2682                                         object->charge -= ptoa(size1);
2683                                 }
2684                         }
2685                 }
2686                 VM_OBJECT_UNLOCK(object);
2687         } else
2688                 entry->object.vm_object = NULL;
2689         if (map->system_map)
2690                 vm_map_entry_deallocate(entry, TRUE);
2691         else {
2692                 entry->next = curthread->td_map_def_user;
2693                 curthread->td_map_def_user = entry;
2694         }
2695 }
2696
2697 /*
2698  *      vm_map_delete:  [ internal use only ]
2699  *
2700  *      Deallocates the given address range from the target
2701  *      map.
2702  */
2703 int
2704 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2705 {
2706         vm_map_entry_t entry;
2707         vm_map_entry_t first_entry;
2708
2709         VM_MAP_ASSERT_LOCKED(map);
2710
2711         /*
2712          * Find the start of the region, and clip it
2713          */
2714         if (!vm_map_lookup_entry(map, start, &first_entry))
2715                 entry = first_entry->next;
2716         else {
2717                 entry = first_entry;
2718                 vm_map_clip_start(map, entry, start);
2719         }
2720
2721         /*
2722          * Step through all entries in this region
2723          */
2724         while ((entry != &map->header) && (entry->start < end)) {
2725                 vm_map_entry_t next;
2726
2727                 /*
2728                  * Wait for wiring or unwiring of an entry to complete.
2729                  * Also wait for any system wirings to disappear on
2730                  * user maps.
2731                  */
2732                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
2733                     (vm_map_pmap(map) != kernel_pmap &&
2734                     vm_map_entry_system_wired_count(entry) != 0)) {
2735                         unsigned int last_timestamp;
2736                         vm_offset_t saved_start;
2737                         vm_map_entry_t tmp_entry;
2738
2739                         saved_start = entry->start;
2740                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2741                         last_timestamp = map->timestamp;
2742                         (void) vm_map_unlock_and_wait(map, 0);
2743                         vm_map_lock(map);
2744                         if (last_timestamp + 1 != map->timestamp) {
2745                                 /*
2746                                  * Look again for the entry because the map was
2747                                  * modified while it was unlocked.
2748                                  * Specifically, the entry may have been
2749                                  * clipped, merged, or deleted.
2750                                  */
2751                                 if (!vm_map_lookup_entry(map, saved_start,
2752                                                          &tmp_entry))
2753                                         entry = tmp_entry->next;
2754                                 else {
2755                                         entry = tmp_entry;
2756                                         vm_map_clip_start(map, entry,
2757                                                           saved_start);
2758                                 }
2759                         }
2760                         continue;
2761                 }
2762                 vm_map_clip_end(map, entry, end);
2763
2764                 next = entry->next;
2765
2766                 /*
2767                  * Unwire before removing addresses from the pmap; otherwise,
2768                  * unwiring will put the entries back in the pmap.
2769                  */
2770                 if (entry->wired_count != 0) {
2771                         vm_map_entry_unwire(map, entry);
2772                 }
2773
2774                 pmap_remove(map->pmap, entry->start, entry->end);
2775
2776                 /*
2777                  * Delete the entry only after removing all pmap
2778                  * entries pointing to its pages.  (Otherwise, its
2779                  * page frames may be reallocated, and any modify bits
2780                  * will be set in the wrong object!)
2781                  */
2782                 vm_map_entry_delete(map, entry);
2783                 entry = next;
2784         }
2785         return (KERN_SUCCESS);
2786 }
2787
2788 /*
2789  *      vm_map_remove:
2790  *
2791  *      Remove the given address range from the target map.
2792  *      This is the exported form of vm_map_delete.
2793  */
2794 int
2795 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2796 {
2797         int result;
2798
2799         vm_map_lock(map);
2800         VM_MAP_RANGE_CHECK(map, start, end);
2801         result = vm_map_delete(map, start, end);
2802         vm_map_unlock(map);
2803         return (result);
2804 }
2805
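/*
 * Editor's sketch (not part of the original file): minimal usage of
 * vm_map_remove().  The helper name and the range are hypothetical; the
 * function takes the map lock itself, so the caller must not hold it.
 */
static void
example_remove_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        int rv;

        /* Tear down every mapping in [start, end); rv is a KERN_* code. */
        rv = vm_map_remove(map, start, end);
        if (rv != KERN_SUCCESS)
                printf("vm_map_remove: failed with %d\n", rv);
}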
2806 /*
2807  *      vm_map_check_protection:
2808  *
2809  *      Assert that the target map allows the specified privilege on the
2810  *      entire address region given.  The entire region must be allocated.
2811  *
2812  *      WARNING!  This code does not and should not check whether the
2813  *      contents of the region are accessible.  For example, a smaller file
2814  *      might be mapped into a larger address space.
2815  *
2816  *      NOTE!  This code is also called by munmap().
2817  *
2818  *      The map must be locked.  A read lock is sufficient.
2819  */
2820 boolean_t
2821 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2822                         vm_prot_t protection)
2823 {
2824         vm_map_entry_t entry;
2825         vm_map_entry_t tmp_entry;
2826
2827         if (!vm_map_lookup_entry(map, start, &tmp_entry))
2828                 return (FALSE);
2829         entry = tmp_entry;
2830
2831         while (start < end) {
2832                 if (entry == &map->header)
2833                         return (FALSE);
2834                 /*
2835                  * No holes allowed!
2836                  */
2837                 if (start < entry->start)
2838                         return (FALSE);
2839                 /*
2840                  * Check protection associated with entry.
2841                  */
2842                 if ((entry->protection & protection) != protection)
2843                         return (FALSE);
2844                 /* go to next entry */
2845                 start = entry->end;
2846                 entry = entry->next;
2847         }
2848         return (TRUE);
2849 }
2850
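/*
 * Editor's sketch (not part of the original file): how a caller might use
 * vm_map_check_protection().  The helper is hypothetical; as noted above,
 * the map must already be locked (a read lock suffices), and the check says
 * nothing about whether the backing contents are resident.
 */
static boolean_t
example_range_is_writable(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
        boolean_t ok;

        vm_map_lock_read(map);
        ok = vm_map_check_protection(map, start, end, VM_PROT_WRITE);
        vm_map_unlock_read(map);
        return (ok);
}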
2851 /*
2852  *      vm_map_copy_entry:
2853  *
2854  *      Copies the contents of the source entry to the destination
2855  *      entry.  The entries *must* be aligned properly.
2856  */
2857 static void
2858 vm_map_copy_entry(
2859         vm_map_t src_map,
2860         vm_map_t dst_map,
2861         vm_map_entry_t src_entry,
2862         vm_map_entry_t dst_entry,
2863         vm_ooffset_t *fork_charge)
2864 {
2865         vm_object_t src_object;
2866         vm_offset_t size;
2867         struct uidinfo *uip;
2868         int charged;
2869
2870         VM_MAP_ASSERT_LOCKED(dst_map);
2871
2872         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
2873                 return;
2874
2875         if (src_entry->wired_count == 0) {
2876
2877                 /*
2878                  * If the source entry is marked needs_copy, it is already
2879                  * write-protected.
2880                  */
2881                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2882                         pmap_protect(src_map->pmap,
2883                             src_entry->start,
2884                             src_entry->end,
2885                             src_entry->protection & ~VM_PROT_WRITE);
2886                 }
2887
2888                 /*
2889                  * Make a copy of the object.
2890                  */
2891                 size = src_entry->end - src_entry->start;
2892                 if ((src_object = src_entry->object.vm_object) != NULL) {
2893                         VM_OBJECT_LOCK(src_object);
2894                         charged = ENTRY_CHARGED(src_entry);
2895                         if ((src_object->handle == NULL) &&
2896                                 (src_object->type == OBJT_DEFAULT ||
2897                                  src_object->type == OBJT_SWAP)) {
2898                                 vm_object_collapse(src_object);
2899                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2900                                         vm_object_split(src_entry);
2901                                         src_object = src_entry->object.vm_object;
2902                                 }
2903                         }
2904                         vm_object_reference_locked(src_object);
2905                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2906                         if (src_entry->uip != NULL &&
2907                             !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
2908                                 KASSERT(src_object->uip == NULL,
2909                                     ("OVERCOMMIT: vm_map_copy_entry: uip %p",
2910                                      src_object));
2911                                 src_object->uip = src_entry->uip;
2912                                 src_object->charge = size;
2913                         }
2914                         VM_OBJECT_UNLOCK(src_object);
2915                         dst_entry->object.vm_object = src_object;
2916                         if (charged) {
2917                                 uip = curthread->td_ucred->cr_ruidinfo;
2918                                 uihold(uip);
2919                                 dst_entry->uip = uip;
2920                                 *fork_charge += size;
2921                                 if (!(src_entry->eflags &
2922                                       MAP_ENTRY_NEEDS_COPY)) {
2923                                         uihold(uip);
2924                                         src_entry->uip = uip;
2925                                         *fork_charge += size;
2926                                 }
2927                         }
2928                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2929                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2930                         dst_entry->offset = src_entry->offset;
2931                 } else {
2932                         dst_entry->object.vm_object = NULL;
2933                         dst_entry->offset = 0;
2934                         if (src_entry->uip != NULL) {
2935                                 dst_entry->uip = curthread->td_ucred->cr_ruidinfo;
2936                                 uihold(dst_entry->uip);
2937                                 *fork_charge += size;
2938                         }
2939                 }
2940
2941                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2942                     dst_entry->end - dst_entry->start, src_entry->start);
2943         } else {
2944                 /*
2945                  * Of course, wired-down pages can't be made copy-on-write.
2946                  * Instead, cause the wired pages to be copied into the new
2947                  * map by simulating faults (the new pages are pageable).
2948                  */
2949                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
2950                     fork_charge);
2951         }
2952 }
2953
2954 /*
2955  * vmspace_map_entry_forked:
2956  * Update the newly-forked vmspace each time a map entry is inherited
2957  * or copied.  The values for vm_dsize and vm_tsize are approximate
2958  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
2959  */
2960 static void
2961 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
2962     vm_map_entry_t entry)
2963 {
2964         vm_size_t entrysize;
2965         vm_offset_t newend;
2966
2967         entrysize = entry->end - entry->start;
2968         vm2->vm_map.size += entrysize;
2969         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
2970                 vm2->vm_ssize += btoc(entrysize);
2971         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
2972             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
2973                 newend = MIN(entry->end,
2974                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
2975                 vm2->vm_dsize += btoc(newend - entry->start);
2976         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
2977             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
2978                 newend = MIN(entry->end,
2979                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
2980                 vm2->vm_tsize += btoc(newend - entry->start);
2981         }
2982 }
2983
2984 /*
2985  * vmspace_fork:
2986  * Create a new process vmspace structure and vm_map
2987  * based on those of an existing process.  The new map
2988  * is based on the old map, according to the inheritance
2989  * values on the regions in that map.
2990  *
2991  * XXX It might be worth coalescing the entries added to the new vmspace.
2992  *
2993  * The source map must not be locked.
2994  */
2995 struct vmspace *
2996 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
2997 {
2998         struct vmspace *vm2;
2999         vm_map_t old_map = &vm1->vm_map;
3000         vm_map_t new_map;
3001         vm_map_entry_t old_entry;
3002         vm_map_entry_t new_entry;
3003         vm_object_t object;
3004         int locked;
3005
3006         vm_map_lock(old_map);
3007         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
3008         if (vm2 == NULL)
3009                 goto unlock_and_return;
3010         vm2->vm_taddr = vm1->vm_taddr;
3011         vm2->vm_daddr = vm1->vm_daddr;
3012         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3013         new_map = &vm2->vm_map; /* XXX */
3014         locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3015         KASSERT(locked, ("vmspace_fork: lock failed"));
3016         new_map->timestamp = 1;
3017
3018         old_entry = old_map->header.next;
3019
3020         while (old_entry != &old_map->header) {
3021                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3022                         panic("vm_map_fork: encountered a submap");
3023
3024                 switch (old_entry->inheritance) {
3025                 case VM_INHERIT_NONE:
3026                         break;
3027
3028                 case VM_INHERIT_SHARE:
3029                         /*
3030                          * Clone the entry, creating the shared object if necessary.
3031                          */
3032                         object = old_entry->object.vm_object;
3033                         if (object == NULL) {
3034                                 object = vm_object_allocate(OBJT_DEFAULT,
3035                                         atop(old_entry->end - old_entry->start));
3036                                 old_entry->object.vm_object = object;
3037                                 old_entry->offset = 0;
3038                                 if (old_entry->uip != NULL) {
3039                                         object->uip = old_entry->uip;
3040                                         object->charge = old_entry->end -
3041                                             old_entry->start;
3042                                         old_entry->uip = NULL;
3043                                 }
3044                         }
3045
3046                         /*
3047                          * Add the reference before calling vm_object_shadow
3048                          * to ensure that a shadow object is created.
3049                          */
3050                         vm_object_reference(object);
3051                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3052                                 vm_object_shadow(&old_entry->object.vm_object,
3053                                         &old_entry->offset,
3054                                         atop(old_entry->end - old_entry->start));
3055                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3056                                 /* Transfer the second reference too. */
3057                                 vm_object_reference(
3058                                     old_entry->object.vm_object);
3059
3060                                 /*
3061                                  * As in vm_map_simplify_entry(), the
3062                                  * vnode lock will not be acquired in
3063                                  * this call to vm_object_deallocate().
3064                                  */
3065                                 vm_object_deallocate(object);
3066                                 object = old_entry->object.vm_object;
3067                         }
3068                         VM_OBJECT_LOCK(object);
3069                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
3070                         if (old_entry->uip != NULL) {
3071                                 KASSERT(object->uip == NULL, ("vmspace_fork both uip"));
3072                                 object->uip = old_entry->uip;
3073                                 object->charge = old_entry->end - old_entry->start;
3074                                 old_entry->uip = NULL;
3075                         }
3076                         VM_OBJECT_UNLOCK(object);
3077
3078                         /*
3079                          * Clone the entry, referencing the shared object.
3080                          */
3081                         new_entry = vm_map_entry_create(new_map);
3082                         *new_entry = *old_entry;
3083                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3084                             MAP_ENTRY_IN_TRANSITION);
3085                         new_entry->wired_count = 0;
3086
3087                         /*
3088                          * Insert the entry into the new map -- we know we're
3089                          * inserting at the end of the new map.
3090                          */
3091                         vm_map_entry_link(new_map, new_map->header.prev,
3092                             new_entry);
3093                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3094
3095                         /*
3096                          * Update the physical map
3097                          */
3098                         pmap_copy(new_map->pmap, old_map->pmap,
3099                             new_entry->start,
3100                             (old_entry->end - old_entry->start),
3101                             old_entry->start);
3102                         break;
3103
3104                 case VM_INHERIT_COPY:
3105                         /*
3106                          * Clone the entry and link into the map.
3107                          */
3108                         new_entry = vm_map_entry_create(new_map);
3109                         *new_entry = *old_entry;
3110                         new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3111                             MAP_ENTRY_IN_TRANSITION);
3112                         new_entry->wired_count = 0;
3113                         new_entry->object.vm_object = NULL;
3114                         new_entry->uip = NULL;
3115                         vm_map_entry_link(new_map, new_map->header.prev,
3116                             new_entry);
3117                         vmspace_map_entry_forked(vm1, vm2, new_entry);
3118                         vm_map_copy_entry(old_map, new_map, old_entry,
3119                             new_entry, fork_charge);
3120                         break;
3121                 }
3122                 old_entry = old_entry->next;
3123         }
3124 unlock_and_return:
3125         vm_map_unlock(old_map);
3126         if (vm2 != NULL)
3127                 vm_map_unlock(new_map);
3128
3129         return (vm2);
3130 }
3131
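/*
 * Editor's sketch (not part of the original file): the calling convention
 * for vmspace_fork(), modeled on vmspace_unshare() below.  The helper is
 * hypothetical; the point is that the accumulated fork_charge must be
 * reserved against the parent's uidinfo before the new vmspace is used.
 */
static struct vmspace *
example_fork_vmspace(struct proc *p)
{
        struct vmspace *vm2;
        vm_ooffset_t fork_charge;

        fork_charge = 0;
        vm2 = vmspace_fork(p->p_vmspace, &fork_charge);
        if (vm2 == NULL)
                return (NULL);
        /* Charge the copied writable mappings to the parent's user. */
        if (!swap_reserve_by_uid(fork_charge, p->p_ucred->cr_ruidinfo)) {
                vmspace_free(vm2);
                return (NULL);
        }
        return (vm2);
}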
3132 int
3133 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3134     vm_prot_t prot, vm_prot_t max, int cow)
3135 {
3136         vm_map_entry_t new_entry, prev_entry;
3137         vm_offset_t bot, top;
3138         vm_size_t init_ssize;
3139         int orient, rv;
3140         rlim_t vmemlim;
3141
3142         /*
3143          * The stack orientation is piggybacked with the cow argument.
3144          * Extract it into orient and mask the cow argument so that we
3145          * don't pass it around further.
3146          * NOTE: We explicitly allow bi-directional stacks.
3147          */
3148         orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
3149         cow &= ~orient;
3150         KASSERT(orient != 0, ("No stack grow direction"));
3151
3152         if (addrbos < vm_map_min(map) ||
3153             addrbos > vm_map_max(map) ||
3154             addrbos + max_ssize < addrbos)
3155                 return (KERN_NO_SPACE);
3156
3157         init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
3158
3159         PROC_LOCK(curthread->td_proc);
3160         vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM);
3161         PROC_UNLOCK(curthread->td_proc);
3162
3163         vm_map_lock(map);
3164
3165         /* If addr is already mapped, no go */
3166         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
3167                 vm_map_unlock(map);
3168                 return (KERN_NO_SPACE);
3169         }
3170
3171         /* If we would blow our VMEM resource limit, no go */
3172         if (map->size + init_ssize > vmemlim) {
3173                 vm_map_unlock(map);
3174                 return (KERN_NO_SPACE);
3175         }
3176
3177         /*
3178          * If we can't accommodate max_ssize in the current mapping, no go.
3179          * However, we need to be aware that subsequent user mappings might
3180          * map into the space we have reserved for stack, and currently this
3181          * space is not protected.
3182          *
3183          * Hopefully we will at least detect this condition when we try to
3184          * grow the stack.
3185          */
3186         if ((prev_entry->next != &map->header) &&
3187             (prev_entry->next->start < addrbos + max_ssize)) {
3188                 vm_map_unlock(map);
3189                 return (KERN_NO_SPACE);
3190         }
3191
3192         /*
3193          * We initially map a stack of only init_ssize.  We will grow as
3194          * needed later.  Depending on the orientation of the stack (i.e.,
3195          * the grow direction), we map at the top of the range, the bottom
3196          * of the range, or in the middle.
3197          *
3198          * Note: we would normally expect prot and max to be VM_PROT_ALL,
3199          * and cow to be 0.  Possibly we should eliminate these as input
3200          * parameters, and just pass these values here in the insert call.
3201          */
3202         if (orient == MAP_STACK_GROWS_DOWN)
3203                 bot = addrbos + max_ssize - init_ssize;
3204         else if (orient == MAP_STACK_GROWS_UP)
3205                 bot = addrbos;
3206         else
3207                 bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
3208         top = bot + init_ssize;
3209         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
3210
3211         /* Now set the avail_ssize amount. */
3212         if (rv == KERN_SUCCESS) {
3213                 if (prev_entry != &map->header)
3214                         vm_map_clip_end(map, prev_entry, bot);
3215                 new_entry = prev_entry->next;
3216                 if (new_entry->end != top || new_entry->start != bot)
3217                         panic("Bad entry start/end for new stack entry");
3218
3219                 new_entry->avail_ssize = max_ssize - init_ssize;
3220                 if (orient & MAP_STACK_GROWS_DOWN)
3221                         new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
3222                 if (orient & MAP_STACK_GROWS_UP)
3223                         new_entry->eflags |= MAP_ENTRY_GROWS_UP;
3224         }
3225
3226         vm_map_unlock(map);
3227         return (rv);
3228 }
3229
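/*
 * Editor's sketch (not part of the original file): creating a
 * downward-growing stack with vm_map_stack().  The helper is hypothetical;
 * addrbos is the bottom of the reserved range and max_ssize its full size,
 * of which only sgrowsiz (or less) is mapped up front -- the rest is grown
 * on demand by vm_map_growstack().
 */
static int
example_create_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize)
{
        return (vm_map_stack(map, addrbos, max_ssize, VM_PROT_ALL,
            VM_PROT_ALL, MAP_STACK_GROWS_DOWN));
}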
3230 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3231  * desired address is already mapped, or if we successfully grow
3232  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3233  * stack range (this is strange, but preserves compatibility with
3234  * the grow function in vm_machdep.c).
3235  */
3236 int
3237 vm_map_growstack(struct proc *p, vm_offset_t addr)
3238 {
3239         vm_map_entry_t next_entry, prev_entry;
3240         vm_map_entry_t new_entry, stack_entry;
3241         struct vmspace *vm = p->p_vmspace;
3242         vm_map_t map = &vm->vm_map;
3243         vm_offset_t end;
3244         size_t grow_amount, max_grow;
3245         rlim_t stacklim, vmemlim;
3246         int is_procstack, rv;
3247         struct uidinfo *uip;
3248
3249 Retry:
3250         PROC_LOCK(p);
3251         stacklim = lim_cur(p, RLIMIT_STACK);
3252         vmemlim = lim_cur(p, RLIMIT_VMEM);
3253         PROC_UNLOCK(p);
3254
3255         vm_map_lock_read(map);
3256
3257         /* If addr is already in the entry range, no need to grow. */
3258         if (vm_map_lookup_entry(map, addr, &prev_entry)) {
3259                 vm_map_unlock_read(map);
3260                 return (KERN_SUCCESS);
3261         }
3262
3263         next_entry = prev_entry->next;
3264         if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
3265                 /*
3266                  * This entry does not grow upwards. Since the address lies
3267                  * beyond this entry, the next entry (if one exists) has to
3268                  * be a downward growable entry. The entry list header is
3269                  * never a growable entry, so it suffices to check the flags.
3270                  */
3271                 if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
3272                         vm_map_unlock_read(map);
3273                         return (KERN_SUCCESS);
3274                 }
3275                 stack_entry = next_entry;
3276         } else {
3277                 /*
3278                  * This entry grows upward. If the next entry does not at
3279                  * least grow downwards, this is the entry we need to grow;
3280                  * otherwise we have two possible choices and we have to
3281                  * select one.
3282                  */
3283                 if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
3284                         /*
3285                          * We have two choices: grow the entry closest to
3286                          * the address to minimize the amount of growth.
3287                          */
3288                         if (addr - prev_entry->end <= next_entry->start - addr)
3289                                 stack_entry = prev_entry;
3290                         else
3291                                 stack_entry = next_entry;
3292                 } else
3293                         stack_entry = prev_entry;
3294         }
3295
3296         if (stack_entry == next_entry) {
3297                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
3298                 KASSERT(addr < stack_entry->start, ("foo"));
3299                 end = (prev_entry != &map->header) ? prev_entry->end :
3300                     stack_entry->start - stack_entry->avail_ssize;
3301                 grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
3302                 max_grow = stack_entry->start - end;
3303         } else {
3304                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
3305                 KASSERT(addr >= stack_entry->end, ("foo"));
3306                 end = (next_entry != &map->header) ? next_entry->start :
3307                     stack_entry->end + stack_entry->avail_ssize;
3308                 grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
3309                 max_grow = end - stack_entry->end;
3310         }
3311
3312         if (grow_amount > stack_entry->avail_ssize) {
3313                 vm_map_unlock_read(map);
3314                 return (KERN_NO_SPACE);
3315         }
3316
3317         /*
3318          * If there is no longer enough space between the entries, refuse to
3319          * grow and adjust the available space.  Note: this should only happen if the
3320          * user has mapped into the stack area after the stack was created,
3321          * and is probably an error.
3322          *
3323          * This also effectively destroys any guard page the user might have
3324          * intended by limiting the stack size.
3325          */
3326         if (grow_amount > max_grow) {
3327                 if (vm_map_lock_upgrade(map))
3328                         goto Retry;
3329
3330                 stack_entry->avail_ssize = max_grow;
3331
3332                 vm_map_unlock(map);
3333                 return (KERN_NO_SPACE);
3334         }
3335
3336         is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
3337
3338         /*
3339          * If this is the main process stack, see if we're over the stack
3340          * limit.
3341          */
3342         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3343                 vm_map_unlock_read(map);
3344                 return (KERN_NO_SPACE);
3345         }
3346
3347         /* Round up the grow amount to a multiple of sgrowsiz. */
3348         grow_amount = roundup(grow_amount, sgrowsiz);
3349         if (grow_amount > stack_entry->avail_ssize)
3350                 grow_amount = stack_entry->avail_ssize;
3351         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3352                 grow_amount = stacklim - ctob(vm->vm_ssize);
3353         }
3354
3355         /* If we would blow our VMEM resource limit, no go */
3356         if (map->size + grow_amount > vmemlim) {
3357                 vm_map_unlock_read(map);
3358                 return (KERN_NO_SPACE);
3359         }
3360
3361         if (vm_map_lock_upgrade(map))
3362                 goto Retry;
3363
3364         if (stack_entry == next_entry) {
3365                 /*
3366                  * Growing downward.
3367                  */
3368                 /* Get the preliminary new entry start value */
3369                 addr = stack_entry->start - grow_amount;
3370
3371                 /*
3372                  * If this puts us into the previous entry, cut back our
3373                  * growth to the available space. Also, see the note above.
3374                  */
3375                 if (addr < end) {
3376                         stack_entry->avail_ssize = max_grow;
3377                         addr = end;
3378                 }
3379
3380                 rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
3381                     p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
3382
3383                 /* Adjust the available stack space by the amount we grew. */
3384                 if (rv == KERN_SUCCESS) {
3385                         if (prev_entry != &map->header)
3386                                 vm_map_clip_end(map, prev_entry, addr);
3387                         new_entry = prev_entry->next;
3388                         KASSERT(new_entry == stack_entry->prev, ("foo"));
3389                         KASSERT(new_entry->end == stack_entry->start, ("foo"));
3390                         KASSERT(new_entry->start == addr, ("foo"));
3391                         grow_amount = new_entry->end - new_entry->start;
3392                         new_entry->avail_ssize = stack_entry->avail_ssize -
3393                             grow_amount;
3394                         stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
3395                         new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
3396                 }
3397         } else {
3398                 /*
3399                  * Growing upward.
3400                  */
3401                 addr = stack_entry->end + grow_amount;
3402
3403                 /*
3404                  * If this puts us into the next entry, cut back our growth
3405                  * to the available space. Also, see the note above.
3406                  */
3407                 if (addr > end) {
3408                         stack_entry->avail_ssize = end - stack_entry->end;
3409                         addr = end;
3410                 }
3411
3412                 grow_amount = addr - stack_entry->end;
3413                 uip = stack_entry->uip;
3414                 if (uip == NULL && stack_entry->object.vm_object != NULL)
3415                         uip = stack_entry->object.vm_object->uip;
3416                 if (uip != NULL && !swap_reserve_by_uid(grow_amount, uip))
3417                         rv = KERN_NO_SPACE;
3418                 /* Grow the underlying object if applicable. */
3419                 else if (stack_entry->object.vm_object == NULL ||
3420                          vm_object_coalesce(stack_entry->object.vm_object,
3421                          stack_entry->offset,
3422                          (vm_size_t)(stack_entry->end - stack_entry->start),
3423                          (vm_size_t)grow_amount, uip != NULL)) {
3424                         map->size += (addr - stack_entry->end);
3425                         /* Update the current entry. */
3426                         stack_entry->end = addr;
3427                         stack_entry->avail_ssize -= grow_amount;
3428                         vm_map_entry_resize_free(map, stack_entry);
3429                         rv = KERN_SUCCESS;
3430
3431                         if (next_entry != &map->header)
3432                                 vm_map_clip_start(map, next_entry, addr);
3433                 } else
3434                         rv = KERN_FAILURE;
3435         }
3436
3437         if (rv == KERN_SUCCESS && is_procstack)
3438                 vm->vm_ssize += btoc(grow_amount);
3439
3440         vm_map_unlock(map);
3441
3442         /*
3443          * Heed the MAP_WIREFUTURE flag if it was set for this process.
3444          */
3445         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
3446                 vm_map_wire(map,
3447                     (stack_entry == next_entry) ? addr : addr - grow_amount,
3448                     (stack_entry == next_entry) ? stack_entry->start : addr,
3449                     (p->p_flag & P_SYSTEM)
3450                     ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
3451                     : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
3452         }
3453
3454         return (rv);
3455 }
3456
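/*
 * Editor's sketch (not part of the original file): the typical caller
 * pattern for vm_map_growstack() -- before treating a user fault at "va" as
 * fatal, give a growable stack entry a chance to cover it.  The helper is
 * hypothetical; which layer actually performs this call is machine-dependent.
 */
static int
example_grow_stack_for_fault(struct proc *p, vm_offset_t va)
{
        /*
         * KERN_SUCCESS also covers "va already mapped" and "va outside any
         * stack range" (see the comment above vm_map_growstack()); only a
         * genuine growth failure is reported, and the fault then fails.
         */
        return (vm_map_growstack(p, va));
}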
3457 /*
3458  * Unshare the specified VM space for exec.  If other processes are
3459  * mapped to it, then create a new one.  The new vmspace starts out empty.
3460  */
3461 int
3462 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
3463 {
3464         struct vmspace *oldvmspace = p->p_vmspace;
3465         struct vmspace *newvmspace;
3466
3467         newvmspace = vmspace_alloc(minuser, maxuser);
3468         if (newvmspace == NULL)
3469                 return (ENOMEM);
3470         newvmspace->vm_swrss = oldvmspace->vm_swrss;
3471         /*
3472          * This code is written like this for prototype purposes.  The
3473          * goal is to avoid running down the vmspace here, but let the
3474          * other processes that are still using the vmspace finally
3475          * run it down.  Even though there is little or no chance of blocking
3476          * here, it is a good idea to keep this form for future mods.
3477          */
3478         PROC_VMSPACE_LOCK(p);
3479         p->p_vmspace = newvmspace;
3480         PROC_VMSPACE_UNLOCK(p);
3481         if (p == curthread->td_proc)
3482                 pmap_activate(curthread);
3483         vmspace_free(oldvmspace);
3484         return (0);
3485 }
3486
3487 /*
3488  * Unshare the specified VM space for forcing COW.  This
3489  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3490  */
3491 int
3492 vmspace_unshare(struct proc *p)
3493 {
3494         struct vmspace *oldvmspace = p->p_vmspace;
3495         struct vmspace *newvmspace;
3496         vm_ooffset_t fork_charge;
3497
3498         if (oldvmspace->vm_refcnt == 1)
3499                 return (0);
3500         fork_charge = 0;
3501         newvmspace = vmspace_fork(oldvmspace, &fork_charge);
3502         if (newvmspace == NULL)
3503                 return (ENOMEM);
3504         if (!swap_reserve_by_uid(fork_charge, p->p_ucred->cr_ruidinfo)) {
3505                 vmspace_free(newvmspace);
3506                 return (ENOMEM);
3507         }
3508         PROC_VMSPACE_LOCK(p);
3509         p->p_vmspace = newvmspace;
3510         PROC_VMSPACE_UNLOCK(p);
3511         if (p == curthread->td_proc)
3512                 pmap_activate(curthread);
3513         vmspace_free(oldvmspace);
3514         return (0);
3515 }
3516
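/*
 * Editor's sketch (not part of the original file): the rfork(2) case
 * described above.  The helper is hypothetical; when a process that shares
 * its address space asks to stop sharing, the kernel swaps in a
 * copy-on-write clone of the current vmspace.
 */
static int
example_stop_sharing_vmspace(struct proc *p)
{
        /* No-op when the vmspace is unshared; returns ENOMEM on failure. */
        return (vmspace_unshare(p));
}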
3517 /*
3518  *      vm_map_lookup:
3519  *
3520  *      Finds the VM object, offset, and
3521  *      protection for a given virtual address in the
3522  *      specified map, assuming a page fault of the
3523  *      type specified.
3524  *
3525  *      Leaves the map in question locked for read; return
3526  *      values are guaranteed until a vm_map_lookup_done
3527  *      call is performed.  Note that the map argument
3528  *      is in/out; the returned map must be used in
3529  *      the call to vm_map_lookup_done.
3530  *
3531  *      A handle (out_entry) is returned for use in
3532  *      vm_map_lookup_done, to make that fast.
3533  *
3534  *      If a lookup is requested with "write protection"
3535  *      specified, the map may be changed to perform virtual
3536  *      copying operations, although the data referenced will
3537  *      remain the same.
3538  */
3539 int
3540 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
3541               vm_offset_t vaddr,
3542               vm_prot_t fault_typea,
3543               vm_map_entry_t *out_entry,        /* OUT */
3544               vm_object_t *object,              /* OUT */
3545               vm_pindex_t *pindex,              /* OUT */
3546               vm_prot_t *out_prot,              /* OUT */
3547               boolean_t *wired)                 /* OUT */
3548 {
3549         vm_map_entry_t entry;
3550         vm_map_t map = *var_map;
3551         vm_prot_t prot;
3552         vm_prot_t fault_type = fault_typea;
3553         vm_object_t eobject;
3554         struct uidinfo *uip;
3555         vm_ooffset_t size;
3556
3557 RetryLookup:;
3558
3559         vm_map_lock_read(map);
3560
3561         /*
3562          * Lookup the faulting address.
3563          */
3564         if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
3565                 vm_map_unlock_read(map);
3566                 return (KERN_INVALID_ADDRESS);
3567         }
3568
3569         entry = *out_entry;
3570
3571         /*
3572          * Handle submaps.
3573          */
3574         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3575                 vm_map_t old_map = map;
3576
3577                 *var_map = map = entry->object.sub_map;
3578                 vm_map_unlock_read(old_map);
3579                 goto RetryLookup;
3580         }
3581
3582         /*
3583          * Check whether this task is allowed to have this page.
3584          * Note the special case for MAP_ENTRY_COW
3585          * pages with an override.  This is to implement a forced
3586          * COW for debuggers.
3587          */
3588         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3589                 prot = entry->max_protection;
3590         else
3591                 prot = entry->protection;
3592         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3593         if ((fault_type & prot) != fault_type) {
3594                 vm_map_unlock_read(map);
3595                 return (KERN_PROTECTION_FAILURE);
3596         }
3597         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3598             (entry->eflags & MAP_ENTRY_COW) &&
3599             (fault_type & VM_PROT_WRITE) &&
3600             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3601                 vm_map_unlock_read(map);
3602                 return (KERN_PROTECTION_FAILURE);
3603         }
3604
3605         /*
3606          * If this page is not pageable, we have to get it for all possible
3607          * accesses.
3608          */
3609         *wired = (entry->wired_count != 0);
3610         if (*wired)
3611                 prot = fault_type = entry->protection;
3612         size = entry->end - entry->start;
3613         /*
3614          * If the entry was copy-on-write, we either ...
3615          */
3616         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3617                 /*
3618                  * If we want to write the page, we may as well handle that
3619                  * now since we've got the map locked.
3620                  *
3621                  * If we don't need to write the page, we just demote the
3622                  * permissions allowed.
3623                  */
3624                 if (fault_type & VM_PROT_WRITE) {
3625                         /*
3626                          * Make a new object, and place it in the object
3627                          * chain.  Note that no new references have appeared
3628                          * -- one just moved from the map to the new
3629                          * object.
3630                          */
3631                         if (vm_map_lock_upgrade(map))
3632                                 goto RetryLookup;
3633
3634                         if (entry->uip == NULL) {
3635                                 /*
3636                                  * The debugger owner is charged for
3637                                  * the memory.
3638                                  */
3639                                 uip = curthread->td_ucred->cr_ruidinfo;
3640                                 uihold(uip);
3641                                 if (!swap_reserve_by_uid(size, uip)) {
3642                                         uifree(uip);
3643                                         vm_map_unlock(map);
3644                                         return (KERN_RESOURCE_SHORTAGE);
3645                                 }
3646                                 entry->uip = uip;
3647                         }
3648                         vm_object_shadow(
3649                             &entry->object.vm_object,
3650                             &entry->offset,
3651                             atop(size));
3652                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3653                         eobject = entry->object.vm_object;
3654                         if (eobject->uip != NULL) {
3655                                 /*
3656                                  * The object was not shadowed.
3657                                  */
3658                                 swap_release_by_uid(size, entry->uip);
3659                                 uifree(entry->uip);
3660                                 entry->uip = NULL;
3661                         } else if (entry->uip != NULL) {
3662                                 VM_OBJECT_LOCK(eobject);
3663                                 eobject->uip = entry->uip;
3664                                 eobject->charge = size;
3665                                 VM_OBJECT_UNLOCK(eobject);
3666                                 entry->uip = NULL;
3667                         }
3668
3669                         vm_map_lock_downgrade(map);
3670                 } else {
3671                         /*
3672                          * We're attempting to read a copy-on-write page --
3673                          * don't allow writes.
3674                          */
3675                         prot &= ~VM_PROT_WRITE;
3676                 }
3677         }
3678
3679         /*
3680          * Create an object if necessary.
3681          */
3682         if (entry->object.vm_object == NULL &&
3683             !map->system_map) {
3684                 if (vm_map_lock_upgrade(map))
3685                         goto RetryLookup;
3686                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
3687                     atop(size));
3688                 entry->offset = 0;
3689                 if (entry->uip != NULL) {
3690                         VM_OBJECT_LOCK(entry->object.vm_object);
3691                         entry->object.vm_object->uip = entry->uip;
3692                         entry->object.vm_object->charge = size;
3693                         VM_OBJECT_UNLOCK(entry->object.vm_object);
3694                         entry->uip = NULL;
3695                 }
3696                 vm_map_lock_downgrade(map);
3697         }
3698
3699         /*
3700          * Return the object/offset from this entry.  If the entry was
3701          * copy-on-write or empty, it has been fixed up.
3702          */
3703         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3704         *object = entry->object.vm_object;
3705
3706         *out_prot = prot;
3707         return (KERN_SUCCESS);
3708 }
3709
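/*
 * Editor's sketch (not part of the original file): the
 * vm_map_lookup()/vm_map_lookup_done() pairing described above.  The helper
 * is hypothetical; the map pointer is in/out because the lookup may descend
 * into a submap, and the returned entry handle must be handed back to
 * vm_map_lookup_done() to drop the read lock the lookup leaves held.
 */
static int
example_lookup_object(vm_map_t map, vm_offset_t va, vm_prot_t fault_type,
    vm_object_t *objp, vm_pindex_t *pindexp)
{
        vm_map_entry_t entry;
        vm_prot_t prot;
        boolean_t wired;
        int rv;

        rv = vm_map_lookup(&map, va, fault_type, &entry, objp, pindexp,
            &prot, &wired);
        if (rv != KERN_SUCCESS)
                return (rv);
        /* The object/pindex are only guaranteed until the call below. */
        vm_map_lookup_done(map, entry);
        return (KERN_SUCCESS);
}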
3710 /*
3711  *      vm_map_lookup_locked:
3712  *
3713  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
3714  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
3715  */
3716 int
3717 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
3718                      vm_offset_t vaddr,
3719                      vm_prot_t fault_typea,
3720                      vm_map_entry_t *out_entry, /* OUT */
3721                      vm_object_t *object,       /* OUT */
3722                      vm_pindex_t *pindex,       /* OUT */
3723                      vm_prot_t *out_prot,       /* OUT */
3724                      boolean_t *wired)          /* OUT */
3725 {
3726         vm_map_entry_t entry;
3727         vm_map_t map = *var_map;
3728         vm_prot_t prot;
3729         vm_prot_t fault_type = fault_typea;
3730
3731         /*
3732          * Lookup the faulting address.
3733          */
3734         if (!vm_map_lookup_entry(map, vaddr, out_entry))
3735                 return (KERN_INVALID_ADDRESS);
3736
3737         entry = *out_entry;
3738
3739         /*
3740          * Fail if the entry refers to a submap.
3741          */
3742         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3743                 return (KERN_FAILURE);
3744
3745         /*
3746          * Check whether this task is allowed to have this page.
3747          * Note the special case for MAP_ENTRY_COW
3748          * pages with an override.  This is to implement a forced
3749          * COW for debuggers.
3750          */
3751         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3752                 prot = entry->max_protection;
3753         else
3754                 prot = entry->protection;
3755         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
3756         if ((fault_type & prot) != fault_type)
3757                 return (KERN_PROTECTION_FAILURE);
3758         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3759             (entry->eflags & MAP_ENTRY_COW) &&
3760             (fault_type & VM_PROT_WRITE) &&
3761             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0)
3762                 return (KERN_PROTECTION_FAILURE);
3763
3764         /*
3765          * If this page is not pageable, we have to get it for all possible
3766          * accesses.
3767          */
3768         *wired = (entry->wired_count != 0);
3769         if (*wired)
3770                 prot = fault_type = entry->protection;
3771
3772         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3773                 /*
3774                  * Fail if the entry was copy-on-write for a write fault.
3775                  */
3776                 if (fault_type & VM_PROT_WRITE)
3777                         return (KERN_FAILURE);
3778                 /*
3779                  * We're attempting to read a copy-on-write page --
3780                  * don't allow writes.
3781                  */
3782                 prot &= ~VM_PROT_WRITE;
3783         }
3784
3785         /*
3786          * Fail if an object should be created.
3787          */
3788         if (entry->object.vm_object == NULL && !map->system_map)
3789                 return (KERN_FAILURE);
3790
3791         /*
3792          * Return the object/offset from this entry.  If the entry was
3793          * copy-on-write or empty, it has been fixed up.
3794          */
3795         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3796         *object = entry->object.vm_object;
3797
3798         *out_prot = prot;
3799         return (KERN_SUCCESS);
3800 }
3801
3802 /*
3803  *      vm_map_lookup_done:
3804  *
3805  *      Releases locks acquired by a vm_map_lookup
3806  *      (according to the handle returned by that lookup).
3807  */
3808 void
3809 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
3810 {
3811         /*
3812          * Unlock the main-level map
3813          */
3814         vm_map_unlock_read(map);
3815 }
3816
3817 #include "opt_ddb.h"
3818 #ifdef DDB
3819 #include <sys/kernel.h>
3820
3821 #include <ddb/ddb.h>
3822
3823 /*
3824  *      vm_map_print:   [ debug ]
3825  */
3826 DB_SHOW_COMMAND(map, vm_map_print)
3827 {
3828         static int nlines;
3829         /* XXX convert args. */
3830         vm_map_t map = (vm_map_t)addr;
3831         boolean_t full = have_addr;
3832
3833         vm_map_entry_t entry;
3834
3835         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3836             (void *)map,
3837             (void *)map->pmap, map->nentries, map->timestamp);
3838         nlines++;
3839
3840         if (!full && db_indent)
3841                 return;
3842
3843         db_indent += 2;
3844         for (entry = map->header.next; entry != &map->header;
3845             entry = entry->next) {
3846                 db_iprintf("map entry %p: start=%p, end=%p\n",
3847                     (void *)entry, (void *)entry->start, (void *)entry->end);
3848                 nlines++;
3849                 {
3850                         static char *inheritance_name[4] =
3851                         {"share", "copy", "none", "donate_copy"};
3852
3853                         db_iprintf(" prot=%x/%x/%s",
3854                             entry->protection,
3855                             entry->max_protection,
3856                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3857                         if (entry->wired_count != 0)
3858                                 db_printf(", wired");
3859                 }
3860                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3861                         db_printf(", share=%p, offset=0x%jx\n",
3862                             (void *)entry->object.sub_map,
3863                             (uintmax_t)entry->offset);
3864                         nlines++;
3865                         if ((entry->prev == &map->header) ||
3866                             (entry->prev->object.sub_map !=
3867                                 entry->object.sub_map)) {
3868                                 db_indent += 2;
3869                                 vm_map_print((db_expr_t)(intptr_t)
3870                                              entry->object.sub_map,
3871                                              full, 0, (char *)0);
3872                                 db_indent -= 2;
3873                         }
3874                 } else {
3875                         if (entry->uip != NULL)
3876                                 db_printf(", uip %d", entry->uip->ui_uid);
3877                         db_printf(", object=%p, offset=0x%jx",
3878                             (void *)entry->object.vm_object,
3879                             (uintmax_t)entry->offset);
3880                         if (entry->object.vm_object && entry->object.vm_object->uip)
3881                                 db_printf(", obj uip %d charge %jx",
3882                                     entry->object.vm_object->uip->ui_uid,
3883                                     (uintmax_t)entry->object.vm_object->charge);
3884                         if (entry->eflags & MAP_ENTRY_COW)
3885                                 db_printf(", copy (%s)",
3886                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3887                         db_printf("\n");
3888                         nlines++;
3889
3890                         if ((entry->prev == &map->header) ||
3891                             (entry->prev->object.vm_object !=
3892                                 entry->object.vm_object)) {
3893                                 db_indent += 2;
3894                                 vm_object_print((db_expr_t)(intptr_t)
3895                                                 entry->object.vm_object,
3896                                                 full, 0, (char *)0);
3897                                 nlines += 4;
3898                                 db_indent -= 2;
3899                         }
3900                 }
3901         }
3902         db_indent -= 2;
3903         if (db_indent == 0)
3904                 nlines = 0;
3905 }
3906
3907
3908 DB_SHOW_COMMAND(procvm, procvm)
3909 {
3910         struct proc *p;
3911
3912         if (have_addr) {
3913                 p = (struct proc *) addr;
3914         } else {
3915                 p = curproc;
3916         }
3917
3918         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3919             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3920             (void *)vmspace_pmap(p->p_vmspace));
3921
3922         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3923 }
3924
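/*
 * Editor's note (not part of the original file): from the in-kernel
 * debugger the commands defined above are invoked as, for example,
 *
 *      db> show map 0xc1234567
 *      db> show procvm 0xc2345678
 *
 * where the addresses are a vm_map and a struct proc respectively; with no
 * address, "show procvm" dumps the current process's map.
 */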
3925 #endif /* DDB */