1 /*-
2  * Copyright (c) 1991, 1993
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *      from: @(#)vm_map.c      8.3 (Berkeley) 1/12/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  */
60
61 /*
62  *      Virtual memory mapping module.
63  */
64
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/ktr.h>
71 #include <sys/lock.h>
72 #include <sys/mutex.h>
73 #include <sys/proc.h>
74 #include <sys/vmmeter.h>
75 #include <sys/mman.h>
76 #include <sys/vnode.h>
77 #include <sys/resourcevar.h>
78 #include <sys/file.h>
79 #include <sys/sysent.h>
80 #include <sys/shm.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_page.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/swap_pager.h>
92 #include <vm/uma.h>
93
94 /*
95  *      Virtual memory maps provide for the mapping, protection,
96  *      and sharing of virtual memory objects.  In addition,
97  *      this module provides for an efficient virtual copy of
98  *      memory from one map to another.
99  *
100  *      Synchronization is required prior to most operations.
101  *
102  *      Maps consist of an ordered doubly-linked list of simple
103  *      entries; a single hint is used to speed up lookups.
104  *
105  *      Since portions of maps are specified by start/end addresses,
106  *      which may not align with existing map entries, all
107  *      routines merely "clip" entries to these start/end values.
108  *      [That is, an entry is split into two, bordering at a
109  *      start or end value.]  Note that these clippings may not
110  *      always be necessary (as the two resulting entries are then
111  *      not changed); however, the clipping is done for convenience.
112  *
113  *      As mentioned above, virtual copy operations are performed
114  *      by copying VM object references from one map to
115  *      another, and then marking both regions as copy-on-write.
116  */
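
/*
 *      For illustration: clipping an entry that spans [0x2000, 0x8000) at
 *      address 0x4000 yields two adjacent entries, [0x2000, 0x4000) and
 *      [0x4000, 0x8000), which together describe the same mapping.  The
 *      addresses are arbitrary example values.
 */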
117
118 /*
119  *      vm_map_startup:
120  *
121  *      Initialize the vm_map module.  Must be called before
122  *      any other vm_map routines.
123  *
124  *      Map and entry structures are allocated from the general
125  *      purpose memory pool with some exceptions:
126  *
127  *      - The kernel map and kmem submap are allocated statically.
128  *      - Kernel map entries are allocated out of a static pool.
129  *
130  *      These restrictions are necessary since malloc() uses the
131  *      maps and requires map entries.
132  */
133
134 static struct mtx map_sleep_mtx;
135 static uma_zone_t mapentzone;
136 static uma_zone_t kmapentzone;
137 static uma_zone_t mapzone;
138 static uma_zone_t vmspace_zone;
139 static struct vm_object kmapentobj;
140 static int vmspace_zinit(void *mem, int size, int flags);
141 static void vmspace_zfini(void *mem, int size);
142 static int vm_map_zinit(void *mem, int size, int flags);
143 static void vm_map_zfini(void *mem, int size);
144 static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
145
146 #ifdef INVARIANTS
147 static void vm_map_zdtor(void *mem, int size, void *arg);
148 static void vmspace_zdtor(void *mem, int size, void *arg);
149 #endif
150
151 void
152 vm_map_startup(void)
153 {
154         mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
155         mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
156 #ifdef INVARIANTS
157             vm_map_zdtor,
158 #else
159             NULL,
160 #endif
161             vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
162         uma_prealloc(mapzone, MAX_KMAP);
163         kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
164             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
165             UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
166         uma_prealloc(kmapentzone, MAX_KMAPENT);
167         mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
168             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
169         uma_prealloc(mapentzone, MAX_MAPENT);
170 }
171
172 static void
173 vmspace_zfini(void *mem, int size)
174 {
175         struct vmspace *vm;
176
177         vm = (struct vmspace *)mem;
178         pmap_release(vmspace_pmap(vm));
179         vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
180 }
181
182 static int
183 vmspace_zinit(void *mem, int size, int flags)
184 {
185         struct vmspace *vm;
186
187         vm = (struct vmspace *)mem;
188
189         (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
190         pmap_pinit(vmspace_pmap(vm));
191         return (0);
192 }
193
194 static void
195 vm_map_zfini(void *mem, int size)
196 {
197         vm_map_t map;
198
199         map = (vm_map_t)mem;
200         mtx_destroy(&map->system_mtx);
201         sx_destroy(&map->lock);
202 }
203
204 static int
205 vm_map_zinit(void *mem, int size, int flags)
206 {
207         vm_map_t map;
208
209         map = (vm_map_t)mem;
210         map->nentries = 0;
211         map->size = 0;
212         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
213         sx_init(&map->lock, "user map");
214         return (0);
215 }
216
217 #ifdef INVARIANTS
218 static void
219 vmspace_zdtor(void *mem, int size, void *arg)
220 {
221         struct vmspace *vm;
222
223         vm = (struct vmspace *)mem;
224
225         vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
226 }
227 static void
228 vm_map_zdtor(void *mem, int size, void *arg)
229 {
230         vm_map_t map;
231
232         map = (vm_map_t)mem;
233         KASSERT(map->nentries == 0,
234             ("map %p nentries == %d on free.",
235             map, map->nentries));
236         KASSERT(map->size == 0,
237             ("map %p size == %lu on free.",
238             map, (unsigned long)map->size));
239 }
240 #endif  /* INVARIANTS */
241
242 /*
243  * Allocate a vmspace structure, including a vm_map and pmap,
244  * and initialize those structures.  The refcnt is set to 1.
245  */
246 struct vmspace *
247 vmspace_alloc(min, max)
248         vm_offset_t min, max;
249 {
250         struct vmspace *vm;
251
252         vm = uma_zalloc(vmspace_zone, M_WAITOK);
253         CTR1(KTR_VM, "vmspace_alloc: %p", vm);
254         _vm_map_init(&vm->vm_map, min, max);
255         vm->vm_map.pmap = vmspace_pmap(vm);             /* XXX */
256         vm->vm_refcnt = 1;
257         vm->vm_shm = NULL;
258         vm->vm_swrss = 0;
259         vm->vm_tsize = 0;
260         vm->vm_dsize = 0;
261         vm->vm_ssize = 0;
262         vm->vm_taddr = 0;
263         vm->vm_daddr = 0;
264         vm->vm_maxsaddr = 0;
265         vm->vm_exitingcnt = 0;
266         return (vm);
267 }
268
269 void
270 vm_init2(void)
271 {
272         uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
273             (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 +
274              maxproc * 2 + maxfiles);
275         vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
276 #ifdef INVARIANTS
277             vmspace_zdtor,
278 #else
279             NULL,
280 #endif
281             vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
282         pmap_init2();
283 }
284
285 static __inline void
286 vmspace_dofree(struct vmspace *vm)
287 {
288         CTR1(KTR_VM, "vmspace_free: %p", vm);
289
290         /*
291          * Make sure any SysV shm is freed, it might not have been in
292          * exit1().
293          */
294         shmexit(vm);
295
296         /*
297          * Lock the map, to wait out all other references to it.
298          * Delete all of the mappings and pages they hold, then call
299          * the pmap module to reclaim anything left.
300          */
301         vm_map_lock(&vm->vm_map);
302         (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
303             vm->vm_map.max_offset);
304         vm_map_unlock(&vm->vm_map);
305
306         uma_zfree(vmspace_zone, vm);
307 }
308
309 void
310 vmspace_free(struct vmspace *vm)
311 {
312         int refcnt;
313
314         if (vm->vm_refcnt == 0)
315                 panic("vmspace_free: attempt to free already freed vmspace");
316
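        /*
         * Atomically decrement vm_refcnt; "refcnt" holds the value observed
         * before the decrement, so refcnt == 1 means this call dropped the
         * last reference.
         */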
317         do
318                 refcnt = vm->vm_refcnt;
319         while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
320         if (refcnt == 1 && vm->vm_exitingcnt == 0)
321                 vmspace_dofree(vm);
322 }
323
324 void
325 vmspace_exitfree(struct proc *p)
326 {
327         struct vmspace *vm;
328         int exitingcnt;
329
330         vm = p->p_vmspace;
331         p->p_vmspace = NULL;
332
333         /*
334          * cleanup by parent process wait()ing on exiting child.  vm_refcnt
335          * may not be 0 (e.g. fork() and child exits without exec()ing).
336          * exitingcnt may increment above 0 and drop back down to zero
337          * several times while vm_refcnt is held non-zero.  vm_refcnt
338          * may also increment above 0 and drop back down to zero several
339          * times while vm_exitingcnt is held non-zero.
340          *
341          * The last wait on the exiting child's vmspace will clean up
342          * the remainder of the vmspace.
343          */
344         do
345                 exitingcnt = vm->vm_exitingcnt;
346         while (!atomic_cmpset_int(&vm->vm_exitingcnt, exitingcnt,
347             exitingcnt - 1));
348         if (vm->vm_refcnt == 0 && exitingcnt == 1)
349                 vmspace_dofree(vm);
350 }
351
352 void
353 _vm_map_lock(vm_map_t map, const char *file, int line)
354 {
355
356         if (map->system_map)
357                 _mtx_lock_flags(&map->system_mtx, 0, file, line);
358         else
359                 _sx_xlock(&map->lock, file, line);
360         map->timestamp++;
361 }
362
363 void
364 _vm_map_unlock(vm_map_t map, const char *file, int line)
365 {
366
367         if (map->system_map)
368                 _mtx_unlock_flags(&map->system_mtx, 0, file, line);
369         else
370                 _sx_xunlock(&map->lock, file, line);
371 }
372
373 void
374 _vm_map_lock_read(vm_map_t map, const char *file, int line)
375 {
376
377         if (map->system_map)
378                 _mtx_lock_flags(&map->system_mtx, 0, file, line);
379         else
380                 _sx_xlock(&map->lock, file, line);
381 }
382
383 void
384 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
385 {
386
387         if (map->system_map)
388                 _mtx_unlock_flags(&map->system_mtx, 0, file, line);
389         else
390                 _sx_xunlock(&map->lock, file, line);
391 }
392
393 int
394 _vm_map_trylock(vm_map_t map, const char *file, int line)
395 {
396         int error;
397
398         error = map->system_map ?
399             !_mtx_trylock(&map->system_mtx, 0, file, line) :
400             !_sx_try_xlock(&map->lock, file, line);
401         if (error == 0)
402                 map->timestamp++;
403         return (error == 0);
404 }
405
406 int
407 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
408 {
409         int error;
410
411         error = map->system_map ?
412             !_mtx_trylock(&map->system_mtx, 0, file, line) :
413             !_sx_try_xlock(&map->lock, file, line);
414         return (error == 0);
415 }
416
417 int
418 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
419 {
420
421 #ifdef INVARIANTS
422         if (map->system_map) {
423                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
424         } else
425                 _sx_assert(&map->lock, SX_XLOCKED, file, line);
426 #endif
427         map->timestamp++;
428         return (0);
429 }
430
431 void
432 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
433 {
434
435 #ifdef INVARIANTS
436         if (map->system_map) {
437                 _mtx_assert(&map->system_mtx, MA_OWNED, file, line);
438         } else
439                 _sx_assert(&map->lock, SX_XLOCKED, file, line);
440 #endif
441 }
442
443 /*
444  *      vm_map_unlock_and_wait:
445  */
446 int
447 vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait)
448 {
449
450         mtx_lock(&map_sleep_mtx);
451         vm_map_unlock(map);
452         return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0));
453 }
454
455 /*
456  *      vm_map_wakeup:
457  */
458 void
459 vm_map_wakeup(vm_map_t map)
460 {
461
462         /*
463          * Acquire and release map_sleep_mtx to prevent a wakeup()
464          * from being performed (and lost) between the vm_map_unlock()
465          * and the msleep() in vm_map_unlock_and_wait().
466          */
467         mtx_lock(&map_sleep_mtx);
468         mtx_unlock(&map_sleep_mtx);
469         wakeup(&map->root);
470 }
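
/*
 *      Illustrative pairing (a sketch only): a thread that must wait for
 *      the map to change can drop the lock and sleep via
 *      vm_map_unlock_and_wait(map, FALSE), while the thread that makes the
 *      change calls vm_map_wakeup(map) after unlocking to rouse any such
 *      sleepers.
 */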
471
472 long
473 vmspace_resident_count(struct vmspace *vmspace)
474 {
475         return pmap_resident_count(vmspace_pmap(vmspace));
476 }
477
478 long
479 vmspace_wired_count(struct vmspace *vmspace)
480 {
481         return pmap_wired_count(vmspace_pmap(vmspace));
482 }
483
484 /*
485  *      vm_map_create:
486  *
487  *      Creates and returns a new empty VM map with
488  *      the given physical map structure, and having
489  *      the given lower and upper address bounds.
490  */
491 vm_map_t
492 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
493 {
494         vm_map_t result;
495
496         result = uma_zalloc(mapzone, M_WAITOK);
497         CTR1(KTR_VM, "vm_map_create: %p", result);
498         _vm_map_init(result, min, max);
499         result->pmap = pmap;
500         return (result);
501 }
502
503 /*
504  * Initialize an existing vm_map structure
505  * such as that in the vmspace structure.
506  * The pmap is set elsewhere.
507  */
508 static void
509 _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
510 {
511
512         map->header.next = map->header.prev = &map->header;
513         map->needs_wakeup = FALSE;
514         map->system_map = 0;
515         map->min_offset = min;
516         map->max_offset = max;
517         map->flags = 0;
518         map->root = NULL;
519         map->timestamp = 0;
520 }
521
522 void
523 vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
524 {
525         _vm_map_init(map, min, max);
526         mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
527         sx_init(&map->lock, "user map");
528 }
529
530 /*
531  *      vm_map_entry_dispose:   [ internal use only ]
532  *
533  *      Inverse of vm_map_entry_create.
534  */
535 static void
536 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
537 {
538         uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
539 }
540
541 /*
542  *      vm_map_entry_create:    [ internal use only ]
543  *
544  *      Allocates a VM map entry for insertion.
545  *      No entry fields are filled in.
546  */
547 static vm_map_entry_t
548 vm_map_entry_create(vm_map_t map)
549 {
550         vm_map_entry_t new_entry;
551
552         if (map->system_map)
553                 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
554         else
555                 new_entry = uma_zalloc(mapentzone, M_WAITOK);
556         if (new_entry == NULL)
557                 panic("vm_map_entry_create: kernel resources exhausted");
558         return (new_entry);
559 }
560
561 /*
562  *      vm_map_entry_set_behavior:
563  *
564  *      Set the expected access behavior, either normal, random, or
565  *      sequential.
566  */
567 static __inline void
568 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
569 {
570         entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
571             (behavior & MAP_ENTRY_BEHAV_MASK);
572 }
573
574 /*
575  *      vm_map_entry_set_max_free:
576  *
577  *      Set the max_free field in a vm_map_entry.
578  */
579 static __inline void
580 vm_map_entry_set_max_free(vm_map_entry_t entry)
581 {
582
583         entry->max_free = entry->adj_free;
584         if (entry->left != NULL && entry->left->max_free > entry->max_free)
585                 entry->max_free = entry->left->max_free;
586         if (entry->right != NULL && entry->right->max_free > entry->max_free)
587                 entry->max_free = entry->right->max_free;
588 }
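
/*
 *      Example: an entry with adj_free of 2 pages, a left subtree whose
 *      max_free is 8 pages, and a right subtree whose max_free is 4 pages
 *      ends up with max_free of 8 pages, the largest gap anywhere in its
 *      subtree.
 */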
589
590 /*
591  *      vm_map_entry_splay:
592  *
593  *      The Sleator and Tarjan top-down splay algorithm with the
594  *      following variation.  Max_free must be computed bottom-up, so
595  *      on the downward pass, maintain the left and right spines in
596  *      reverse order.  Then, make a second pass up each side to fix
597  *      the pointers and compute max_free.  The time bound is O(log n)
598  *      amortized.
599  *
600  *      The new root is the vm_map_entry containing "addr", or else an
601  *      adjacent entry (lower or higher) if addr is not in the tree.
602  *
603  *      The map must be locked, and leaves it so.
604  *
605  *      Returns: the new root.
606  */
607 static vm_map_entry_t
608 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
609 {
610         vm_map_entry_t llist, rlist;
611         vm_map_entry_t ltree, rtree;
612         vm_map_entry_t y;
613
614         /* Special case of empty tree. */
615         if (root == NULL)
616                 return (root);
617
618         /*
619          * Pass One: Splay down the tree until we find addr or a NULL
620          * pointer where addr would go.  llist and rlist are the two
621          * sides in reverse order (bottom-up), with llist linked by
622          * the right pointer and rlist linked by the left pointer in
623          * the vm_map_entry.  Wait until Pass Two to set max_free on
624          * the two spines.
625          */
626         llist = NULL;
627         rlist = NULL;
628         for (;;) {
629                 /* root is never NULL in here. */
630                 if (addr < root->start) {
631                         y = root->left;
632                         if (y == NULL)
633                                 break;
634                         if (addr < y->start && y->left != NULL) {
635                                 /* Rotate right and put y on rlist. */
636                                 root->left = y->right;
637                                 y->right = root;
638                                 vm_map_entry_set_max_free(root);
639                                 root = y->left;
640                                 y->left = rlist;
641                                 rlist = y;
642                         } else {
643                                 /* Put root on rlist. */
644                                 root->left = rlist;
645                                 rlist = root;
646                                 root = y;
647                         }
648                 } else {
649                         y = root->right;
650                         if (addr < root->end || y == NULL)
651                                 break;
652                         if (addr >= y->end && y->right != NULL) {
653                                 /* Rotate left and put y on llist. */
654                                 root->right = y->left;
655                                 y->left = root;
656                                 vm_map_entry_set_max_free(root);
657                                 root = y->right;
658                                 y->right = llist;
659                                 llist = y;
660                         } else {
661                                 /* Put root on llist. */
662                                 root->right = llist;
663                                 llist = root;
664                                 root = y;
665                         }
666                 }
667         }
668
669         /*
670          * Pass Two: Walk back up the two spines, flip the pointers
671          * and set max_free.  The subtrees of the root go at the
672          * bottom of llist and rlist.
673          */
674         ltree = root->left;
675         while (llist != NULL) {
676                 y = llist->right;
677                 llist->right = ltree;
678                 vm_map_entry_set_max_free(llist);
679                 ltree = llist;
680                 llist = y;
681         }
682         rtree = root->right;
683         while (rlist != NULL) {
684                 y = rlist->left;
685                 rlist->left = rtree;
686                 vm_map_entry_set_max_free(rlist);
687                 rtree = rlist;
688                 rlist = y;
689         }
690
691         /*
692          * Final assembly: add ltree and rtree as subtrees of root.
693          */
694         root->left = ltree;
695         root->right = rtree;
696         vm_map_entry_set_max_free(root);
697
698         return (root);
699 }
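
/*
 *      Example: with entries covering [0x1000, 0x2000), [0x4000, 0x6000),
 *      and [0x8000, 0x9000), splaying on addr 0x5000 leaves [0x4000, 0x6000)
 *      at the root, while splaying on 0x3000 leaves one of its neighbors,
 *      [0x1000, 0x2000) or [0x4000, 0x6000), at the root.
 */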
700
701 /*
702  *      vm_map_entry_{un,}link:
703  *
704  *      Insert/remove entries from maps.
705  */
706 static void
707 vm_map_entry_link(vm_map_t map,
708                   vm_map_entry_t after_where,
709                   vm_map_entry_t entry)
710 {
711
712         CTR4(KTR_VM,
713             "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
714             map->nentries, entry, after_where);
715         map->nentries++;
716         entry->prev = after_where;
717         entry->next = after_where->next;
718         entry->next->prev = entry;
719         after_where->next = entry;
720
721         if (after_where != &map->header) {
722                 if (after_where != map->root)
723                         vm_map_entry_splay(after_where->start, map->root);
724                 entry->right = after_where->right;
725                 entry->left = after_where;
726                 after_where->right = NULL;
727                 after_where->adj_free = entry->start - after_where->end;
728                 vm_map_entry_set_max_free(after_where);
729         } else {
730                 entry->right = map->root;
731                 entry->left = NULL;
732         }
733         entry->adj_free = (entry->next == &map->header ? map->max_offset :
734             entry->next->start) - entry->end;
735         vm_map_entry_set_max_free(entry);
736         map->root = entry;
737 }
738
739 static void
740 vm_map_entry_unlink(vm_map_t map,
741                     vm_map_entry_t entry)
742 {
743         vm_map_entry_t next, prev, root;
744
745         if (entry != map->root)
746                 vm_map_entry_splay(entry->start, map->root);
747         if (entry->left == NULL)
748                 root = entry->right;
749         else {
750                 root = vm_map_entry_splay(entry->start, entry->left);
751                 root->right = entry->right;
752                 root->adj_free = (entry->next == &map->header ? map->max_offset :
753                     entry->next->start) - root->end;
754                 vm_map_entry_set_max_free(root);
755         }
756         map->root = root;
757
758         prev = entry->prev;
759         next = entry->next;
760         next->prev = prev;
761         prev->next = next;
762         map->nentries--;
763         CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
764             map->nentries, entry);
765 }
766
767 /*
768  *      vm_map_entry_resize_free:
769  *
770  *      Recompute the amount of free space following a vm_map_entry
771  *      and propagate that value up the tree.  Call this function after
772  *      resizing a map entry in-place, that is, without a call to
773  *      vm_map_entry_link() or _unlink().
774  *
775  *      The map must be locked, and leaves it so.
776  */
777 static void
778 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
779 {
780
781         /*
782          * Using splay trees without parent pointers, propagating
783          * max_free up the tree is done by moving the entry to the
784          * root and making the change there.
785          */
786         if (entry != map->root)
787                 map->root = vm_map_entry_splay(entry->start, map->root);
788
789         entry->adj_free = (entry->next == &map->header ? map->max_offset :
790             entry->next->start) - entry->end;
791         vm_map_entry_set_max_free(entry);
792 }
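
/*
 *      Example: if the resized entry now ends at 0x3000 and its successor
 *      starts at 0x7000, adj_free becomes 0x4000; for the last entry the
 *      gap is measured against map->max_offset instead.
 */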
793
794 /*
795  *      vm_map_lookup_entry:    [ internal use only ]
796  *
797  *      Finds the map entry containing (or
798  *      immediately preceding) the specified address
799  *      in the given map; the entry is returned
800  *      in the "entry" parameter.  The boolean
801  *      result indicates whether the address is
802  *      actually contained in the map.
803  */
804 boolean_t
805 vm_map_lookup_entry(
806         vm_map_t map,
807         vm_offset_t address,
808         vm_map_entry_t *entry)  /* OUT */
809 {
810         vm_map_entry_t cur;
811
812         cur = vm_map_entry_splay(address, map->root);
813         if (cur == NULL)
814                 *entry = &map->header;
815         else {
816                 map->root = cur;
817
818                 if (address >= cur->start) {
819                         *entry = cur;
820                         if (cur->end > address)
821                                 return (TRUE);
822                 } else
823                         *entry = cur->prev;
824         }
825         return (FALSE);
826 }
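
/*
 *      Illustrative use (a sketch; "addr" is a caller-supplied address):
 *
 *              vm_map_entry_t entry;
 *
 *              vm_map_lock(map);
 *              if (vm_map_lookup_entry(map, addr, &entry)) {
 *                      ... addr lies within [entry->start, entry->end) ...
 *              } else {
 *                      ... entry precedes addr, or is &map->header ...
 *              }
 *              vm_map_unlock(map);
 */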
827
828 /*
829  *      vm_map_insert:
830  *
831  *      Inserts the given whole VM object into the target
832  *      map at the specified address range.  The object's
833  *      size should match that of the address range.
834  *
835  *      Requires that the map be locked, and leaves it so.
836  *
837  *      If object is non-NULL, ref count must be bumped by caller
838  *      prior to making call to account for the new entry.
839  */
840 int
841 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
842               vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
843               int cow)
844 {
845         vm_map_entry_t new_entry;
846         vm_map_entry_t prev_entry;
847         vm_map_entry_t temp_entry;
848         vm_eflags_t protoeflags;
849
850         /*
851          * Check that the start and end points are not bogus.
852          */
853         if ((start < map->min_offset) || (end > map->max_offset) ||
854             (start >= end))
855                 return (KERN_INVALID_ADDRESS);
856
857         /*
858          * Find the entry prior to the proposed starting address; if it's part
859          * of an existing entry, this range is bogus.
860          */
861         if (vm_map_lookup_entry(map, start, &temp_entry))
862                 return (KERN_NO_SPACE);
863
864         prev_entry = temp_entry;
865
866         /*
867          * Assert that the next entry doesn't overlap the end point.
868          */
869         if ((prev_entry->next != &map->header) &&
870             (prev_entry->next->start < end))
871                 return (KERN_NO_SPACE);
872
873         protoeflags = 0;
874
875         if (cow & MAP_COPY_ON_WRITE)
876                 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
877
878         if (cow & MAP_NOFAULT) {
879                 protoeflags |= MAP_ENTRY_NOFAULT;
880
881                 KASSERT(object == NULL,
882                         ("vm_map_insert: paradoxical MAP_NOFAULT request"));
883         }
884         if (cow & MAP_DISABLE_SYNCER)
885                 protoeflags |= MAP_ENTRY_NOSYNC;
886         if (cow & MAP_DISABLE_COREDUMP)
887                 protoeflags |= MAP_ENTRY_NOCOREDUMP;
888
889         if (object != NULL) {
890                 /*
891                  * OBJ_ONEMAPPING must be cleared unless this mapping
892                  * is trivially proven to be the only mapping for any
893                  * of the object's pages.  (Object granularity
894                  * reference counting is insufficient to recognize
895                  * aliases with precision.)
896                  */
897                 VM_OBJECT_LOCK(object);
898                 if (object->ref_count > 1 || object->shadow_count != 0)
899                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
900                 VM_OBJECT_UNLOCK(object);
901         }
902         else if ((prev_entry != &map->header) &&
903                  (prev_entry->eflags == protoeflags) &&
904                  (prev_entry->end == start) &&
905                  (prev_entry->wired_count == 0) &&
906                  ((prev_entry->object.vm_object == NULL) ||
907                   vm_object_coalesce(prev_entry->object.vm_object,
908                                      prev_entry->offset,
909                                      (vm_size_t)(prev_entry->end - prev_entry->start),
910                                      (vm_size_t)(end - prev_entry->end)))) {
911                 /*
912                  * We were able to extend the object.  Determine if we
913                  * can extend the previous map entry to include the
914                  * new range as well.
915                  */
916                 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
917                     (prev_entry->protection == prot) &&
918                     (prev_entry->max_protection == max)) {
919                         map->size += (end - prev_entry->end);
920                         prev_entry->end = end;
921                         vm_map_entry_resize_free(map, prev_entry);
922                         vm_map_simplify_entry(map, prev_entry);
923                         return (KERN_SUCCESS);
924                 }
925
926                 /*
927                  * If we can extend the object but cannot extend the
928                  * map entry, we have to create a new map entry.  We
929                  * must bump the ref count on the extended object to
930                  * account for it.  object may be NULL.
931                  */
932                 object = prev_entry->object.vm_object;
933                 offset = prev_entry->offset +
934                         (prev_entry->end - prev_entry->start);
935                 vm_object_reference(object);
936         }
937
938         /*
939          * NOTE: if conditionals fail, object can be NULL here.  This occurs
940          * in things like the buffer map where we manage kva but do not manage
941          * backing objects.
942          */
943
944         /*
945          * Create a new entry
946          */
947         new_entry = vm_map_entry_create(map);
948         new_entry->start = start;
949         new_entry->end = end;
950
951         new_entry->eflags = protoeflags;
952         new_entry->object.vm_object = object;
953         new_entry->offset = offset;
954         new_entry->avail_ssize = 0;
955
956         new_entry->inheritance = VM_INHERIT_DEFAULT;
957         new_entry->protection = prot;
958         new_entry->max_protection = max;
959         new_entry->wired_count = 0;
960
961         /*
962          * Insert the new entry into the list
963          */
964         vm_map_entry_link(map, prev_entry, new_entry);
965         map->size += new_entry->end - new_entry->start;
966
967 #if 0
968         /*
969          * Temporarily removed to avoid MAP_STACK panic, due to
970          * MAP_STACK being a huge hack.  Will be added back in
971          * when MAP_STACK (and the user stack mapping) is fixed.
972          */
973         /*
974          * It may be possible to simplify the entry
975          */
976         vm_map_simplify_entry(map, new_entry);
977 #endif
978
979         if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
980                 vm_map_pmap_enter(map, start, prot,
981                                     object, OFF_TO_IDX(offset), end - start,
982                                     cow & MAP_PREFAULT_PARTIAL);
983         }
984
985         return (KERN_SUCCESS);
986 }
987
988 /*
989  *      vm_map_findspace:
990  *
991  *      Find the first fit (lowest VM address) for "length" free bytes
992  *      beginning at address >= start in the given map.
993  *
994  *      In a vm_map_entry, "adj_free" is the amount of free space
995  *      adjacent (higher address) to this entry, and "max_free" is the
996  *      maximum amount of contiguous free space in its subtree.  This
997  *      allows finding a free region in one path down the tree, so
998  *      O(log n) amortized with splay trees.
999  *
1000  *      The map must be locked, and leaves it so.
1001  *
1002  *      Returns: 0 on success, and starting address in *addr,
1003  *               1 if insufficient space.
1004  */
1005 int
1006 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1007     vm_offset_t *addr)  /* OUT */
1008 {
1009         vm_map_entry_t entry;
1010         vm_offset_t end, st;
1011
1012         /*
1013          * Request must fit within min/max VM address and must avoid
1014          * address wrap.
1015          */
1016         if (start < map->min_offset)
1017                 start = map->min_offset;
1018         if (start + length > map->max_offset || start + length < start)
1019                 return (1);
1020
1021         /* Empty tree means wide open address space. */
1022         if (map->root == NULL) {
1023                 *addr = start;
1024                 goto found;
1025         }
1026
1027         /*
1028          * After splay, if start comes before root node, then there
1029          * must be a gap from start to the root.
1030          */
1031         map->root = vm_map_entry_splay(start, map->root);
1032         if (start + length <= map->root->start) {
1033                 *addr = start;
1034                 goto found;
1035         }
1036
1037         /*
1038          * Root is the last node that might begin its gap before
1039          * start, and this is the last comparison where address
1040          * wrap might be a problem.
1041          */
1042         st = (start > map->root->end) ? start : map->root->end;
1043         if (length <= map->root->end + map->root->adj_free - st) {
1044                 *addr = st;
1045                 goto found;
1046         }
1047
1048         /* With max_free, can immediately tell if no solution. */
1049         entry = map->root->right;
1050         if (entry == NULL || length > entry->max_free)
1051                 return (1);
1052
1053         /*
1054          * Search the right subtree in the order: left subtree, root,
1055          * right subtree (first fit).  The previous splay implies that
1056          * all regions in the right subtree have addresses > start.
1057          */
1058         while (entry != NULL) {
1059                 if (entry->left != NULL && entry->left->max_free >= length)
1060                         entry = entry->left;
1061                 else if (entry->adj_free >= length) {
1062                         *addr = entry->end;
1063                         goto found;
1064                 } else
1065                         entry = entry->right;
1066         }
1067
1068         /* Can't get here, so panic if we do. */
1069         panic("vm_map_findspace: max_free corrupt");
1070
1071 found:
1072         /* Expand the kernel pmap, if necessary. */
1073         if (map == kernel_map) {
1074                 end = round_page(*addr + length);
1075                 if (end > kernel_vm_end)
1076                         pmap_growkernel(end);
1077         }
1078         return (0);
1079 }
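
/*
 *      Worked example (arbitrary numbers): if the splayed root ends at
 *      0x4000 with adj_free 0x2000, then a request of length 0x1000 with
 *      start 0x4800 fits in the root's own gap, because
 *      0x4000 + 0x2000 - 0x4800 = 0x1800 >= 0x1000, and *addr is set to
 *      0x4800.  A larger request would instead descend into the right
 *      subtree, guided by max_free.
 */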
1080
1081 /*
1082  *      vm_map_find finds an unallocated region in the target address
1083  *      map with the given length.  The search is defined to be
1084  *      first-fit from the specified address; the region found is
1085  *      returned in the same parameter.
1086  *
1087  *      If object is non-NULL, ref count must be bumped by caller
1088  *      prior to making call to account for the new entry.
1089  */
1090 int
1091 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1092             vm_offset_t *addr,  /* IN/OUT */
1093             vm_size_t length, boolean_t find_space, vm_prot_t prot,
1094             vm_prot_t max, int cow)
1095 {
1096         vm_offset_t start;
1097         int result;
1098
1099         start = *addr;
1100         vm_map_lock(map);
1101         if (find_space) {
1102                 if (vm_map_findspace(map, start, length, addr)) {
1103                         vm_map_unlock(map);
1104                         return (KERN_NO_SPACE);
1105                 }
1106                 start = *addr;
1107         }
1108         result = vm_map_insert(map, object, offset,
1109                 start, start + length, prot, max, cow);
1110         vm_map_unlock(map);
1111         return (result);
1112 }
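
/*
 *      Illustrative call (a sketch with hypothetical "base" and "length"):
 *      to obtain a first-fit anonymous region at or above base:
 *
 *              vm_offset_t addr = base;
 *
 *              if (vm_map_find(map, NULL, 0, &addr, length, TRUE,
 *                  VM_PROT_ALL, VM_PROT_ALL, 0) == KERN_SUCCESS)
 *                      ... addr holds the start of the new mapping ...
 */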
1113
1114 /*
1115  *      vm_map_simplify_entry:
1116  *
1117  *      Simplify the given map entry by merging with either neighbor.  This
1118  *      routine also has the ability to merge with both neighbors.
1119  *
1120  *      The map must be locked.
1121  *
1122  *      This routine guarantees that the passed entry remains valid (though
1123  *      possibly extended).  When merging, this routine may delete one or
1124  *      both neighbors.
1125  */
1126 void
1127 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1128 {
1129         vm_map_entry_t next, prev;
1130         vm_size_t prevsize, esize;
1131
1132         if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
1133                 return;
1134
1135         prev = entry->prev;
1136         if (prev != &map->header) {
1137                 prevsize = prev->end - prev->start;
1138                 if ( (prev->end == entry->start) &&
1139                      (prev->object.vm_object == entry->object.vm_object) &&
1140                      (!prev->object.vm_object ||
1141                         (prev->offset + prevsize == entry->offset)) &&
1142                      (prev->eflags == entry->eflags) &&
1143                      (prev->protection == entry->protection) &&
1144                      (prev->max_protection == entry->max_protection) &&
1145                      (prev->inheritance == entry->inheritance) &&
1146                      (prev->wired_count == entry->wired_count)) {
1147                         vm_map_entry_unlink(map, prev);
1148                         entry->start = prev->start;
1149                         entry->offset = prev->offset;
1150                         if (entry->prev != &map->header)
1151                                 vm_map_entry_resize_free(map, entry->prev);
1152                         if (prev->object.vm_object)
1153                                 vm_object_deallocate(prev->object.vm_object);
1154                         vm_map_entry_dispose(map, prev);
1155                 }
1156         }
1157
1158         next = entry->next;
1159         if (next != &map->header) {
1160                 esize = entry->end - entry->start;
1161                 if ((entry->end == next->start) &&
1162                     (next->object.vm_object == entry->object.vm_object) &&
1163                      (!entry->object.vm_object ||
1164                         (entry->offset + esize == next->offset)) &&
1165                     (next->eflags == entry->eflags) &&
1166                     (next->protection == entry->protection) &&
1167                     (next->max_protection == entry->max_protection) &&
1168                     (next->inheritance == entry->inheritance) &&
1169                     (next->wired_count == entry->wired_count)) {
1170                         vm_map_entry_unlink(map, next);
1171                         entry->end = next->end;
1172                         vm_map_entry_resize_free(map, entry);
1173                         if (next->object.vm_object)
1174                                 vm_object_deallocate(next->object.vm_object);
1175                         vm_map_entry_dispose(map, next);
1176                 }
1177         }
1178 }
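
/*
 *      Example: two entries [A, B) and [B, C) backed by the same object at
 *      contiguous offsets (off and off + (B - A)), with identical eflags,
 *      protection, inheritance, and wired_count, are merged into a single
 *      entry [A, C); merging is never attempted with the header sentinel.
 */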
1179 /*
1180  *      vm_map_clip_start:      [ internal use only ]
1181  *
1182  *      Asserts that the given entry begins at or after
1183  *      the specified address; if necessary,
1184  *      it splits the entry into two.
1185  */
1186 #define vm_map_clip_start(map, entry, startaddr) \
1187 { \
1188         if (startaddr > entry->start) \
1189                 _vm_map_clip_start(map, entry, startaddr); \
1190 }
1191
1192 /*
1193  *      This routine is called only when it is known that
1194  *      the entry must be split.
1195  */
1196 static void
1197 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1198 {
1199         vm_map_entry_t new_entry;
1200
1201         /*
1202          * Split off the front portion -- note that we must insert the new
1203          * entry BEFORE this one, so that this entry has the specified
1204          * starting address.
1205          */
1206         vm_map_simplify_entry(map, entry);
1207
1208         /*
1209          * If there is no object backing this entry, we might as well create
1210          * one now.  If we defer it, an object can get created after the map
1211          * is clipped, and individual objects will be created for the split-up
1212          * map.  This is a bit of a hack, but is also about the best place to
1213          * put this improvement.
1214          */
1215         if (entry->object.vm_object == NULL && !map->system_map) {
1216                 vm_object_t object;
1217                 object = vm_object_allocate(OBJT_DEFAULT,
1218                                 atop(entry->end - entry->start));
1219                 entry->object.vm_object = object;
1220                 entry->offset = 0;
1221         }
1222
1223         new_entry = vm_map_entry_create(map);
1224         *new_entry = *entry;
1225
1226         new_entry->end = start;
1227         entry->offset += (start - entry->start);
1228         entry->start = start;
1229
1230         vm_map_entry_link(map, entry->prev, new_entry);
1231
1232         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1233                 vm_object_reference(new_entry->object.vm_object);
1234         }
1235 }
1236
1237 /*
1238  *      vm_map_clip_end:        [ internal use only ]
1239  *
1240  *      Asserts that the given entry ends at or before
1241  *      the specified address; if necessary,
1242  *      it splits the entry into two.
1243  */
1244 #define vm_map_clip_end(map, entry, endaddr) \
1245 { \
1246         if ((endaddr) < (entry->end)) \
1247                 _vm_map_clip_end((map), (entry), (endaddr)); \
1248 }
1249
1250 /*
1251  *      This routine is called only when it is known that
1252  *      the entry must be split.
1253  */
1254 static void
1255 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1256 {
1257         vm_map_entry_t new_entry;
1258
1259         /*
1260          * If there is no object backing this entry, we might as well create
1261          * one now.  If we defer it, an object can get created after the map
1262          * is clipped, and individual objects will be created for the split-up
1263          * map.  This is a bit of a hack, but is also about the best place to
1264          * put this improvement.
1265          */
1266         if (entry->object.vm_object == NULL && !map->system_map) {
1267                 vm_object_t object;
1268                 object = vm_object_allocate(OBJT_DEFAULT,
1269                                 atop(entry->end - entry->start));
1270                 entry->object.vm_object = object;
1271                 entry->offset = 0;
1272         }
1273
1274         /*
1275          * Create a new entry and insert it AFTER the specified entry
1276          */
1277         new_entry = vm_map_entry_create(map);
1278         *new_entry = *entry;
1279
1280         new_entry->start = entry->end = end;
1281         new_entry->offset += (end - entry->start);
1282
1283         vm_map_entry_link(map, entry, new_entry);
1284
1285         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1286                 vm_object_reference(new_entry->object.vm_object);
1287         }
1288 }
1289
1290 /*
1291  *      VM_MAP_RANGE_CHECK:     [ internal use only ]
1292  *
1293  *      Asserts that the starting and ending region
1294  *      addresses fall within the valid range of the map.
1295  */
1296 #define VM_MAP_RANGE_CHECK(map, start, end)             \
1297                 {                                       \
1298                 if (start < vm_map_min(map))            \
1299                         start = vm_map_min(map);        \
1300                 if (end > vm_map_max(map))              \
1301                         end = vm_map_max(map);          \
1302                 if (start > end)                        \
1303                         start = end;                    \
1304                 }
1305
1306 /*
1307  *      vm_map_submap:          [ kernel use only ]
1308  *
1309  *      Mark the given range as handled by a subordinate map.
1310  *
1311  *      This range must have been created with vm_map_find,
1312  *      and no other operations may have been performed on this
1313  *      range prior to calling vm_map_submap.
1314  *
1315  *      Only a limited number of operations can be performed
1316  *      within this range after calling vm_map_submap:
1317  *              vm_fault
1318  *      [Don't try vm_map_copy!]
1319  *
1320  *      To remove a submapping, one must first remove the
1321  *      range from the superior map, and then destroy the
1322  *      submap (if desired).  [Better yet, don't try it.]
1323  */
1324 int
1325 vm_map_submap(
1326         vm_map_t map,
1327         vm_offset_t start,
1328         vm_offset_t end,
1329         vm_map_t submap)
1330 {
1331         vm_map_entry_t entry;
1332         int result = KERN_INVALID_ARGUMENT;
1333
1334         vm_map_lock(map);
1335
1336         VM_MAP_RANGE_CHECK(map, start, end);
1337
1338         if (vm_map_lookup_entry(map, start, &entry)) {
1339                 vm_map_clip_start(map, entry, start);
1340         } else
1341                 entry = entry->next;
1342
1343         vm_map_clip_end(map, entry, end);
1344
1345         if ((entry->start == start) && (entry->end == end) &&
1346             ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1347             (entry->object.vm_object == NULL)) {
1348                 entry->object.sub_map = submap;
1349                 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1350                 result = KERN_SUCCESS;
1351         }
1352         vm_map_unlock(map);
1353
1354         return (result);
1355 }
1356
1357 /*
1358  * The maximum number of pages to map
1359  */
1360 #define MAX_INIT_PT     96
1361
1362 /*
1363  *      vm_map_pmap_enter:
1364  *
1365  *      Preload read-only mappings for the given object into the specified
1366  *      map.  This eliminates the soft faults on process startup and
1367  *      immediately after an mmap(2).
1368  */
1369 void
1370 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1371     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1372 {
1373         vm_offset_t tmpidx;
1374         int psize;
1375         vm_page_t p, mpte;
1376         boolean_t are_queues_locked;
1377
1378         if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1379                 return;
1380         VM_OBJECT_LOCK(object);
1381         if (object->type == OBJT_DEVICE) {
1382                 pmap_object_init_pt(map->pmap, addr, object, pindex, size);
1383                 goto unlock_return;
1384         }
1385
1386         psize = atop(size);
1387
1388         if (object->type != OBJT_VNODE ||
1389             ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
1390              (object->resident_page_count > MAX_INIT_PT))) {
1391                 goto unlock_return;
1392         }
1393
1394         if (psize + pindex > object->size) {
1395                 if (object->size < pindex)
1396                         goto unlock_return;
1397                 psize = object->size - pindex;
1398         }
1399
1400         are_queues_locked = FALSE;
1401         mpte = NULL;
1402
1403         if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
1404                 if (p->pindex < pindex) {
1405                         p = vm_page_splay(pindex, object->root);
1406                         if ((object->root = p)->pindex < pindex)
1407                                 p = TAILQ_NEXT(p, listq);
1408                 }
1409         }
1410         /*
1411          * Assert: the variable p is either (1) the page with the
1412          * least pindex greater than or equal to the parameter pindex
1413          * or (2) NULL.
1414          */
1415         for (;
1416              p != NULL && (tmpidx = p->pindex - pindex) < psize;
1417              p = TAILQ_NEXT(p, listq)) {
1418                 /*
1419                  * Don't allow a madvise-driven prefault to blow away our
1420                  * really free pages by allocating pv entries.
1421                  */
1422                 if ((flags & MAP_PREFAULT_MADVISE) &&
1423                     cnt.v_free_count < cnt.v_free_reserved) {
1424                         break;
1425                 }
1426                 if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
1427                     (p->busy == 0) &&
1428                     (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
1429                         if (!are_queues_locked) {
1430                                 are_queues_locked = TRUE;
1431                                 vm_page_lock_queues();
1432                         }
1433                         if ((p->queue - p->pc) == PQ_CACHE)
1434                                 vm_page_deactivate(p);
1435                         mpte = pmap_enter_quick(map->pmap,
1436                             addr + ptoa(tmpidx), p, prot, mpte);
1437                 }
1438         }
1439         if (are_queues_locked)
1440                 vm_page_unlock_queues();
1441 unlock_return:
1442         VM_OBJECT_UNLOCK(object);
1443 }
1444
1445 /*
1446  *      vm_map_protect:
1447  *
1448  *      Sets the protection of the specified address
1449  *      region in the target map.  If "set_max" is
1450  *      specified, the maximum protection is to be set;
1451  *      otherwise, only the current protection is affected.
1452  */
1453 int
1454 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1455                vm_prot_t new_prot, boolean_t set_max)
1456 {
1457         vm_map_entry_t current;
1458         vm_map_entry_t entry;
1459
1460         vm_map_lock(map);
1461
1462         VM_MAP_RANGE_CHECK(map, start, end);
1463
1464         if (vm_map_lookup_entry(map, start, &entry)) {
1465                 vm_map_clip_start(map, entry, start);
1466         } else {
1467                 entry = entry->next;
1468         }
1469
1470         /*
1471          * Make a first pass to check for protection violations.
1472          */
1473         current = entry;
1474         while ((current != &map->header) && (current->start < end)) {
1475                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1476                         vm_map_unlock(map);
1477                         return (KERN_INVALID_ARGUMENT);
1478                 }
1479                 if ((new_prot & current->max_protection) != new_prot) {
1480                         vm_map_unlock(map);
1481                         return (KERN_PROTECTION_FAILURE);
1482                 }
1483                 current = current->next;
1484         }
1485
1486         /*
1487          * Go back and fix up protections. [Note that clipping is not
1488          * necessary the second time.]
1489          */
1490         current = entry;
1491         while ((current != &map->header) && (current->start < end)) {
1492                 vm_prot_t old_prot;
1493
1494                 vm_map_clip_end(map, current, end);
1495
1496                 old_prot = current->protection;
1497                 if (set_max)
1498                         current->protection =
1499                             (current->max_protection = new_prot) &
1500                             old_prot;
1501                 else
1502                         current->protection = new_prot;
1503
1504                 /*
1505                  * Update physical map if necessary. Worry about copy-on-write
1506                  * here -- CHECK THIS XXX
1507                  */
1508                 if (current->protection != old_prot) {
1509 #define MASK(entry)     (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
1510                                                         VM_PROT_ALL)
1511                         pmap_protect(map->pmap, current->start,
1512                             current->end,
1513                             current->protection & MASK(current));
1514 #undef  MASK
1515                 }
1516                 vm_map_simplify_entry(map, current);
1517                 current = current->next;
1518         }
1519         vm_map_unlock(map);
1520         return (KERN_SUCCESS);
1521 }
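
/*
 *      Example of the set_max case: for an entry with protection
 *      VM_PROT_READ | VM_PROT_WRITE and max_protection VM_PROT_ALL, calling
 *      vm_map_protect(map, start, end, VM_PROT_READ, TRUE) lowers
 *      max_protection to VM_PROT_READ and, because the new protection is
 *      the old protection masked by the new maximum, leaves the entry
 *      readable only.
 */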
1522
1523 /*
1524  *      vm_map_madvise:
1525  *
1526  *      This routine traverses a process's map handling the madvise
1527  *      system call.  Advisories are classified as either those affecting
1528  *      the vm_map_entry structure, or those affecting the underlying
1529  *      objects.
1530  */
1531 int
1532 vm_map_madvise(
1533         vm_map_t map,
1534         vm_offset_t start,
1535         vm_offset_t end,
1536         int behav)
1537 {
1538         vm_map_entry_t current, entry;
1539         int modify_map = 0;
1540
1541         /*
1542          * Some madvise calls directly modify the vm_map_entry, in which case
1543          * we need to use an exclusive lock on the map and we need to perform
1544          * various clipping operations.  Otherwise we only need a read-lock
1545          * on the map.
1546          */
1547         switch(behav) {
1548         case MADV_NORMAL:
1549         case MADV_SEQUENTIAL:
1550         case MADV_RANDOM:
1551         case MADV_NOSYNC:
1552         case MADV_AUTOSYNC:
1553         case MADV_NOCORE:
1554         case MADV_CORE:
1555                 modify_map = 1;
1556                 vm_map_lock(map);
1557                 break;
1558         case MADV_WILLNEED:
1559         case MADV_DONTNEED:
1560         case MADV_FREE:
1561                 vm_map_lock_read(map);
1562                 break;
1563         default:
1564                 return (KERN_INVALID_ARGUMENT);
1565         }
1566
1567         /*
1568          * Locate starting entry and clip if necessary.
1569          */
1570         VM_MAP_RANGE_CHECK(map, start, end);
1571
1572         if (vm_map_lookup_entry(map, start, &entry)) {
1573                 if (modify_map)
1574                         vm_map_clip_start(map, entry, start);
1575         } else {
1576                 entry = entry->next;
1577         }
1578
1579         if (modify_map) {
1580                 /*
1581                  * madvise behaviors that are implemented in the vm_map_entry.
1582                  *
1583                  * We clip the vm_map_entry so that behavioral changes are
1584                  * limited to the specified address range.
1585                  */
1586                 for (current = entry;
1587                      (current != &map->header) && (current->start < end);
1588                      current = current->next
1589                 ) {
1590                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1591                                 continue;
1592
1593                         vm_map_clip_end(map, current, end);
1594
1595                         switch (behav) {
1596                         case MADV_NORMAL:
1597                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
1598                                 break;
1599                         case MADV_SEQUENTIAL:
1600                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
1601                                 break;
1602                         case MADV_RANDOM:
1603                                 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
1604                                 break;
1605                         case MADV_NOSYNC:
1606                                 current->eflags |= MAP_ENTRY_NOSYNC;
1607                                 break;
1608                         case MADV_AUTOSYNC:
1609                                 current->eflags &= ~MAP_ENTRY_NOSYNC;
1610                                 break;
1611                         case MADV_NOCORE:
1612                                 current->eflags |= MAP_ENTRY_NOCOREDUMP;
1613                                 break;
1614                         case MADV_CORE:
1615                                 current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
1616                                 break;
1617                         default:
1618                                 break;
1619                         }
1620                         vm_map_simplify_entry(map, current);
1621                 }
1622                 vm_map_unlock(map);
1623         } else {
1624                 vm_pindex_t pindex;
1625                 int count;
1626
1627                 /*
1628                  * madvise behaviors that are implemented in the underlying
1629                  * vm_object.
1630                  *
1631                  * Since we don't clip the vm_map_entry, we have to clip
1632                  * the vm_object pindex and count.
1633                  */
1634                 for (current = entry;
1635                      (current != &map->header) && (current->start < end);
1636                      current = current->next
1637                 ) {
1638                         vm_offset_t useStart;
1639
1640                         if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
1641                                 continue;
1642
1643                         pindex = OFF_TO_IDX(current->offset);
1644                         count = atop(current->end - current->start);
1645                         useStart = current->start;
1646
1647                         if (current->start < start) {
1648                                 pindex += atop(start - current->start);
1649                                 count -= atop(start - current->start);
1650                                 useStart = start;
1651                         }
1652                         if (current->end > end)
1653                                 count -= atop(current->end - end);
1654
1655                         if (count <= 0)
1656                                 continue;
1657
1658                         vm_object_madvise(current->object.vm_object,
1659                                           pindex, count, behav);
1660                         if (behav == MADV_WILLNEED) {
1661                                 vm_map_pmap_enter(map,
1662                                     useStart,
1663                                     current->protection,
1664                                     current->object.vm_object,
1665                                     pindex,
1666                                     (count << PAGE_SHIFT),
1667                                     MAP_PREFAULT_MADVISE
1668                                 );
1669                         }
1670                 }
1671                 vm_map_unlock_read(map);
1672         }
1673         return (0);
1674 }
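
/*
 * Illustrative sketch, not part of the original file: prefaulting an
 * address range through vm_map_madvise() above.  MADV_WILLNEED takes
 * only the read lock and, via vm_map_pmap_enter(), asks the pmap to
 * populate the range ahead of first use.
 */
static int
vm_map_madvise_willneed_example(vm_map_t map, vm_offset_t start,
    vm_offset_t end)
{

        return (vm_map_madvise(map, start, end, MADV_WILLNEED));
}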
1675
1676
1677 /*
1678  *      vm_map_inherit:
1679  *
1680  *      Sets the inheritance of the specified address
1681  *      range in the target map.  Inheritance
1682  *      affects how the map will be shared with
1683  *      child maps at the time of vm_map_fork.
1684  */
1685 int
1686 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
1687                vm_inherit_t new_inheritance)
1688 {
1689         vm_map_entry_t entry;
1690         vm_map_entry_t temp_entry;
1691
1692         switch (new_inheritance) {
1693         case VM_INHERIT_NONE:
1694         case VM_INHERIT_COPY:
1695         case VM_INHERIT_SHARE:
1696                 break;
1697         default:
1698                 return (KERN_INVALID_ARGUMENT);
1699         }
1700         vm_map_lock(map);
1701         VM_MAP_RANGE_CHECK(map, start, end);
1702         if (vm_map_lookup_entry(map, start, &temp_entry)) {
1703                 entry = temp_entry;
1704                 vm_map_clip_start(map, entry, start);
1705         } else
1706                 entry = temp_entry->next;
1707         while ((entry != &map->header) && (entry->start < end)) {
1708                 vm_map_clip_end(map, entry, end);
1709                 entry->inheritance = new_inheritance;
1710                 vm_map_simplify_entry(map, entry);
1711                 entry = entry->next;
1712         }
1713         vm_map_unlock(map);
1714         return (KERN_SUCCESS);
1715 }
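
/*
 * Illustrative sketch, not part of the original file: marking a range
 * so that a child created by vmspace_fork() later in this file shares
 * the mapping with its parent instead of receiving a copy-on-write
 * copy.
 */
static int
vm_map_inherit_share_example(vm_map_t map, vm_offset_t start,
    vm_offset_t end)
{

        return (vm_map_inherit(map, start, end, VM_INHERIT_SHARE));
}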
1716
1717 /*
1718  *      vm_map_unwire:
1719  *
1720  *      Implements both kernel and user unwiring.
1721  */
1722 int
1723 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
1724     int flags)
1725 {
1726         vm_map_entry_t entry, first_entry, tmp_entry;
1727         vm_offset_t saved_start;
1728         unsigned int last_timestamp;
1729         int rv;
1730         boolean_t need_wakeup, result, user_unwire;
1731
1732         user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
1733         vm_map_lock(map);
1734         VM_MAP_RANGE_CHECK(map, start, end);
1735         if (!vm_map_lookup_entry(map, start, &first_entry)) {
1736                 if (flags & VM_MAP_WIRE_HOLESOK)
1737                         first_entry = first_entry->next;
1738                 else {
1739                         vm_map_unlock(map);
1740                         return (KERN_INVALID_ADDRESS);
1741                 }
1742         }
1743         last_timestamp = map->timestamp;
1744         entry = first_entry;
1745         while (entry != &map->header && entry->start < end) {
1746                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1747                         /*
1748                          * We have not yet clipped the entry.
1749                          */
1750                         saved_start = (start >= entry->start) ? start :
1751                             entry->start;
1752                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1753                         if (vm_map_unlock_and_wait(map, user_unwire)) {
1754                                 /*
1755                                  * Allow interruption of user unwiring?
1756                                  */
1757                         }
1758                         vm_map_lock(map);
1759                         if (last_timestamp+1 != map->timestamp) {
1760                                 /*
1761                                  * Look again for the entry because the map was
1762                                  * modified while it was unlocked.
1763                                  * Specifically, the entry may have been
1764                                  * clipped, merged, or deleted.
1765                                  */
1766                                 if (!vm_map_lookup_entry(map, saved_start,
1767                                     &tmp_entry)) {
1768                                         if (flags & VM_MAP_WIRE_HOLESOK)
1769                                                 tmp_entry = tmp_entry->next;
1770                                         else {
1771                                                 if (saved_start == start) {
1772                                                         /*
1773                                                          * first_entry has been deleted.
1774                                                          */
1775                                                         vm_map_unlock(map);
1776                                                         return (KERN_INVALID_ADDRESS);
1777                                                 }
1778                                                 end = saved_start;
1779                                                 rv = KERN_INVALID_ADDRESS;
1780                                                 goto done;
1781                                         }
1782                                 }
1783                                 if (entry == first_entry)
1784                                         first_entry = tmp_entry;
1785                                 else
1786                                         first_entry = NULL;
1787                                 entry = tmp_entry;
1788                         }
1789                         last_timestamp = map->timestamp;
1790                         continue;
1791                 }
1792                 vm_map_clip_start(map, entry, start);
1793                 vm_map_clip_end(map, entry, end);
1794                 /*
1795                  * Mark the entry in case the map lock is released.  (See
1796                  * above.)
1797                  */
1798                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1799                 /*
1800                  * Check the map for holes in the specified region.
1801                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
1802                  */
1803                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
1804                     (entry->end < end && (entry->next == &map->header ||
1805                     entry->next->start > entry->end))) {
1806                         end = entry->end;
1807                         rv = KERN_INVALID_ADDRESS;
1808                         goto done;
1809                 }
1810                 /*
1811                  * If system unwiring, require that the entry is system wired.
1812                  */
1813                 if (!user_unwire &&
1814                     vm_map_entry_system_wired_count(entry) == 0) {
1815                         end = entry->end;
1816                         rv = KERN_INVALID_ARGUMENT;
1817                         goto done;
1818                 }
1819                 entry = entry->next;
1820         }
1821         rv = KERN_SUCCESS;
1822 done:
1823         need_wakeup = FALSE;
1824         if (first_entry == NULL) {
1825                 result = vm_map_lookup_entry(map, start, &first_entry);
1826                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
1827                         first_entry = first_entry->next;
1828                 else
1829                         KASSERT(result, ("vm_map_unwire: lookup failed"));
1830         }
1831         entry = first_entry;
1832         while (entry != &map->header && entry->start < end) {
1833                 if (rv == KERN_SUCCESS && (!user_unwire ||
1834                     (entry->eflags & MAP_ENTRY_USER_WIRED))) {
1835                         if (user_unwire)
1836                                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
1837                         entry->wired_count--;
1838                         if (entry->wired_count == 0) {
1839                                 /*
1840                                  * Retain the map lock.
1841                                  */
1842                                 vm_fault_unwire(map, entry->start, entry->end,
1843                                     entry->object.vm_object != NULL &&
1844                                     entry->object.vm_object->type == OBJT_DEVICE);
1845                         }
1846                 }
1847                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
1848                         ("vm_map_unwire: in-transition flag missing"));
1849                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
1850                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
1851                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
1852                         need_wakeup = TRUE;
1853                 }
1854                 vm_map_simplify_entry(map, entry);
1855                 entry = entry->next;
1856         }
1857         vm_map_unlock(map);
1858         if (need_wakeup)
1859                 vm_map_wakeup(map);
1860         return (rv);
1861 }
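
/*
 * Illustrative sketch, not part of the original file: a user-level
 * unwire of a range, as a munlock(2)-style caller might request it.
 * Omitting VM_MAP_WIRE_HOLESOK from the flags makes holes in the
 * range an error.
 */
static int
vm_map_unwire_user_example(vm_map_t map, vm_offset_t start,
    vm_offset_t end)
{

        return (vm_map_unwire(map, start, end, VM_MAP_WIRE_USER));
}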
1862
1863 /*
1864  *      vm_map_wire:
1865  *
1866  *      Implements both kernel and user wiring.
1867  */
1868 int
1869 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
1870     int flags)
1871 {
1872         vm_map_entry_t entry, first_entry, tmp_entry;
1873         vm_offset_t saved_end, saved_start;
1874         unsigned int last_timestamp;
1875         int rv;
1876         boolean_t fictitious, need_wakeup, result, user_wire;
1877
1878         user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
1879         vm_map_lock(map);
1880         VM_MAP_RANGE_CHECK(map, start, end);
1881         if (!vm_map_lookup_entry(map, start, &first_entry)) {
1882                 if (flags & VM_MAP_WIRE_HOLESOK)
1883                         first_entry = first_entry->next;
1884                 else {
1885                         vm_map_unlock(map);
1886                         return (KERN_INVALID_ADDRESS);
1887                 }
1888         }
1889         last_timestamp = map->timestamp;
1890         entry = first_entry;
1891         while (entry != &map->header && entry->start < end) {
1892                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1893                         /*
1894                          * We have not yet clipped the entry.
1895                          */
1896                         saved_start = (start >= entry->start) ? start :
1897                             entry->start;
1898                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1899                         if (vm_map_unlock_and_wait(map, user_wire)) {
1900                                 /*
1901                                  * Allow interruption of user wiring?
1902                                  */
1903                         }
1904                         vm_map_lock(map);
1905                         if (last_timestamp + 1 != map->timestamp) {
1906                                 /*
1907                                  * Look again for the entry because the map was
1908                                  * modified while it was unlocked.
1909                                  * Specifically, the entry may have been
1910                                  * clipped, merged, or deleted.
1911                                  */
1912                                 if (!vm_map_lookup_entry(map, saved_start,
1913                                     &tmp_entry)) {
1914                                         if (flags & VM_MAP_WIRE_HOLESOK)
1915                                                 tmp_entry = tmp_entry->next;
1916                                         else {
1917                                                 if (saved_start == start) {
1918                                                         /*
1919                                                          * first_entry has been deleted.
1920                                                          */
1921                                                         vm_map_unlock(map);
1922                                                         return (KERN_INVALID_ADDRESS);
1923                                                 }
1924                                                 end = saved_start;
1925                                                 rv = KERN_INVALID_ADDRESS;
1926                                                 goto done;
1927                                         }
1928                                 }
1929                                 if (entry == first_entry)
1930                                         first_entry = tmp_entry;
1931                                 else
1932                                         first_entry = NULL;
1933                                 entry = tmp_entry;
1934                         }
1935                         last_timestamp = map->timestamp;
1936                         continue;
1937                 }
1938                 vm_map_clip_start(map, entry, start);
1939                 vm_map_clip_end(map, entry, end);
1940                 /*
1941                  * Mark the entry in case the map lock is released.  (See
1942                  * above.)
1943                  */
1944                 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1945                 /*
1946                  * Wire the entry if it is not already wired.
1947                  */
1948                 if (entry->wired_count == 0) {
1949                         entry->wired_count++;
1950                         saved_start = entry->start;
1951                         saved_end = entry->end;
1952                         fictitious = entry->object.vm_object != NULL &&
1953                             entry->object.vm_object->type == OBJT_DEVICE;
1954                         /*
1955                          * Release the map lock, relying on the in-transition
1956                          * mark.
1957                          */
1958                         vm_map_unlock(map);
1959                         rv = vm_fault_wire(map, saved_start, saved_end,
1960                             user_wire, fictitious);
1961                         vm_map_lock(map);
1962                         if (last_timestamp + 1 != map->timestamp) {
1963                                 /*
1964                                  * Look again for the entry because the map was
1965                                  * modified while it was unlocked.  The entry
1966                                  * may have been clipped, but NOT merged or
1967                                  * deleted.
1968                                  */
1969                                 result = vm_map_lookup_entry(map, saved_start,
1970                                     &tmp_entry);
1971                                 KASSERT(result, ("vm_map_wire: lookup failed"));
1972                                 if (entry == first_entry)
1973                                         first_entry = tmp_entry;
1974                                 else
1975                                         first_entry = NULL;
1976                                 entry = tmp_entry;
1977                                 while (entry->end < saved_end) {
1978                                         if (rv != KERN_SUCCESS) {
1979                                                 KASSERT(entry->wired_count == 1,
1980                                                     ("vm_map_wire: bad count"));
1981                                                 entry->wired_count = -1;
1982                                         }
1983                                         entry = entry->next;
1984                                 }
1985                         }
1986                         last_timestamp = map->timestamp;
1987                         if (rv != KERN_SUCCESS) {
1988                                 KASSERT(entry->wired_count == 1,
1989                                     ("vm_map_wire: bad count"));
1990                                 /*
1991                                  * Assign an out-of-range value to represent
1992                                  * the failure to wire this entry.
1993                                  */
1994                                 entry->wired_count = -1;
1995                                 end = entry->end;
1996                                 goto done;
1997                         }
1998                 } else if (!user_wire ||
1999                            (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2000                         entry->wired_count++;
2001                 }
2002                 /*
2003                  * Check the map for holes in the specified region.
2004                  * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2005                  */
2006                 if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2007                     (entry->end < end && (entry->next == &map->header ||
2008                     entry->next->start > entry->end))) {
2009                         end = entry->end;
2010                         rv = KERN_INVALID_ADDRESS;
2011                         goto done;
2012                 }
2013                 entry = entry->next;
2014         }
2015         rv = KERN_SUCCESS;
2016 done:
2017         need_wakeup = FALSE;
2018         if (first_entry == NULL) {
2019                 result = vm_map_lookup_entry(map, start, &first_entry);
2020                 if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2021                         first_entry = first_entry->next;
2022                 else
2023                         KASSERT(result, ("vm_map_wire: lookup failed"));
2024         }
2025         entry = first_entry;
2026         while (entry != &map->header && entry->start < end) {
2027                 if (rv == KERN_SUCCESS) {
2028                         if (user_wire)
2029                                 entry->eflags |= MAP_ENTRY_USER_WIRED;
2030                 } else if (entry->wired_count == -1) {
2031                         /*
2032                          * Wiring failed on this entry.  Thus, unwiring is
2033                          * unnecessary.
2034                          */
2035                         entry->wired_count = 0;
2036                 } else {
2037                         if (!user_wire ||
2038                             (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
2039                                 entry->wired_count--;
2040                         if (entry->wired_count == 0) {
2041                                 /*
2042                                  * Retain the map lock.
2043                                  */
2044                                 vm_fault_unwire(map, entry->start, entry->end,
2045                                     entry->object.vm_object != NULL &&
2046                                     entry->object.vm_object->type == OBJT_DEVICE);
2047                         }
2048                 }
2049                 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2050                         ("vm_map_wire: in-transition flag missing"));
2051                 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2052                 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2053                         entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2054                         need_wakeup = TRUE;
2055                 }
2056                 vm_map_simplify_entry(map, entry);
2057                 entry = entry->next;
2058         }
2059         vm_map_unlock(map);
2060         if (need_wakeup)
2061                 vm_map_wakeup(map);
2062         return (rv);
2063 }
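
/*
 * Illustrative sketch, not part of the original file: wiring a range on
 * behalf of a user request (mlock(2)-style).  On failure the cleanup
 * loop after the "done:" label above has already undone any partial
 * wiring, so the caller only needs to check the return value.
 */
static int
vm_map_wire_user_example(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        return (vm_map_wire(map, start, end, VM_MAP_WIRE_USER));
}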
2064
2065 /*
2066  * vm_map_sync
2067  *
2068  * Push any dirty cached pages in the address range to their pager.
2069  * If syncio is TRUE, dirty pages are written synchronously.
2070  * If invalidate is TRUE, any cached pages are freed as well.
2071  *
2072  * If the size of the region from start to end is zero, we are
2073  * supposed to flush all modified pages within the region containing
2074  * start.  Unfortunately, a region can be split or coalesced with
2075  * neighboring regions, making it difficult to determine what the
2076  * original region was.  Therefore, we approximate this requirement by
2077  * flushing the current region containing start.
2078  *
2079  * Returns an error if any part of the specified range is not mapped.
2080  */
2081 int
2082 vm_map_sync(
2083         vm_map_t map,
2084         vm_offset_t start,
2085         vm_offset_t end,
2086         boolean_t syncio,
2087         boolean_t invalidate)
2088 {
2089         vm_map_entry_t current;
2090         vm_map_entry_t entry;
2091         vm_size_t size;
2092         vm_object_t object;
2093         vm_ooffset_t offset;
2094
2095         vm_map_lock_read(map);
2096         VM_MAP_RANGE_CHECK(map, start, end);
2097         if (!vm_map_lookup_entry(map, start, &entry)) {
2098                 vm_map_unlock_read(map);
2099                 return (KERN_INVALID_ADDRESS);
2100         } else if (start == end) {
2101                 start = entry->start;
2102                 end = entry->end;
2103         }
2104         /*
2105          * Make a first pass to check for user-wired memory and holes.
2106          */
2107         for (current = entry; current->start < end; current = current->next) {
2108                 if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2109                         vm_map_unlock_read(map);
2110                         return (KERN_INVALID_ARGUMENT);
2111                 }
2112                 if (end > current->end &&
2113                     (current->next == &map->header ||
2114                         current->end != current->next->start)) {
2115                         vm_map_unlock_read(map);
2116                         return (KERN_INVALID_ADDRESS);
2117                 }
2118         }
2119
2120         if (invalidate) {
2121                 VM_LOCK_GIANT();
2122                 pmap_remove(map->pmap, start, end);
2123                 VM_UNLOCK_GIANT();
2124         }
2125         /*
2126          * Make a second pass, cleaning/uncaching pages from the indicated
2127          * objects as we go.
2128          */
2129         for (current = entry; current->start < end; current = current->next) {
2130                 offset = current->offset + (start - current->start);
2131                 size = (end <= current->end ? end : current->end) - start;
2132                 if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2133                         vm_map_t smap;
2134                         vm_map_entry_t tentry;
2135                         vm_size_t tsize;
2136
2137                         smap = current->object.sub_map;
2138                         vm_map_lock_read(smap);
2139                         (void) vm_map_lookup_entry(smap, offset, &tentry);
2140                         tsize = tentry->end - offset;
2141                         if (tsize < size)
2142                                 size = tsize;
2143                         object = tentry->object.vm_object;
2144                         offset = tentry->offset + (offset - tentry->start);
2145                         vm_map_unlock_read(smap);
2146                 } else {
2147                         object = current->object.vm_object;
2148                 }
2149                 vm_object_sync(object, offset, size, syncio, invalidate);
2150                 start += size;
2151         }
2152
2153         vm_map_unlock_read(map);
2154         return (KERN_SUCCESS);
2155 }
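
/*
 * Illustrative sketch, not part of the original file: an msync(2)-style
 * flush that synchronously writes dirty pages in the range back to
 * their pager without invalidating any cached pages.
 */
static int
vm_map_sync_flush_example(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        return (vm_map_sync(map, start, end, TRUE, FALSE));
}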
2156
2157 /*
2158  *      vm_map_entry_unwire:    [ internal use only ]
2159  *
2160  *      Make the region specified by this entry pageable.
2161  *
2162  *      The map in question should be locked.
2163  *      [This is the reason for this routine's existence.]
2164  */
2165 static void
2166 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2167 {
2168         vm_fault_unwire(map, entry->start, entry->end,
2169             entry->object.vm_object != NULL &&
2170             entry->object.vm_object->type == OBJT_DEVICE);
2171         entry->wired_count = 0;
2172 }
2173
2174 /*
2175  *      vm_map_entry_delete:    [ internal use only ]
2176  *
2177  *      Deallocate the given entry from the target map.
2178  */
2179 static void
2180 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2181 {
2182         vm_object_t object;
2183         vm_pindex_t offidxstart, offidxend, count;
2184
2185         vm_map_entry_unlink(map, entry);
2186         map->size -= entry->end - entry->start;
2187
2188         if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2189             (object = entry->object.vm_object) != NULL) {
2190                 count = OFF_TO_IDX(entry->end - entry->start);
2191                 offidxstart = OFF_TO_IDX(entry->offset);
2192                 offidxend = offidxstart + count;
2193                 VM_OBJECT_LOCK(object);
2194                 if (object->ref_count != 1 &&
2195                     ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
2196                      object == kernel_object || object == kmem_object) &&
2197                     (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
2198                         vm_object_collapse(object);
2199                         vm_object_page_remove(object, offidxstart, offidxend, FALSE);
2200                         if (object->type == OBJT_SWAP)
2201                                 swap_pager_freespace(object, offidxstart, count);
2202                         if (offidxend >= object->size &&
2203                             offidxstart < object->size)
2204                                 object->size = offidxstart;
2205                 }
2206                 VM_OBJECT_UNLOCK(object);
2207                 vm_object_deallocate(object);
2208         }
2209
2210         vm_map_entry_dispose(map, entry);
2211 }
2212
2213 /*
2214  *      vm_map_delete:  [ internal use only ]
2215  *
2216  *      Deallocates the given address range from the target
2217  *      map.
2218  */
2219 int
2220 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2221 {
2222         vm_map_entry_t entry;
2223         vm_map_entry_t first_entry;
2224
2225         /*
2226          * Find the start of the region, and clip it
2227          */
2228         if (!vm_map_lookup_entry(map, start, &first_entry))
2229                 entry = first_entry->next;
2230         else {
2231                 entry = first_entry;
2232                 vm_map_clip_start(map, entry, start);
2233         }
2234
2235         /*
2236          * Step through all entries in this region
2237          */
2238         while ((entry != &map->header) && (entry->start < end)) {
2239                 vm_map_entry_t next;
2240
2241                 /*
2242                  * Wait for wiring or unwiring of an entry to complete.
2243                  * Also wait for any system wirings to disappear on
2244                  * user maps.
2245                  */
2246                 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
2247                     (vm_map_pmap(map) != kernel_pmap &&
2248                     vm_map_entry_system_wired_count(entry) != 0)) {
2249                         unsigned int last_timestamp;
2250                         vm_offset_t saved_start;
2251                         vm_map_entry_t tmp_entry;
2252
2253                         saved_start = entry->start;
2254                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2255                         last_timestamp = map->timestamp;
2256                         (void) vm_map_unlock_and_wait(map, FALSE);
2257                         vm_map_lock(map);
2258                         if (last_timestamp + 1 != map->timestamp) {
2259                                 /*
2260                                  * Look again for the entry because the map was
2261                                  * modified while it was unlocked.
2262                                  * Specifically, the entry may have been
2263                                  * clipped, merged, or deleted.
2264                                  */
2265                                 if (!vm_map_lookup_entry(map, saved_start,
2266                                                          &tmp_entry))
2267                                         entry = tmp_entry->next;
2268                                 else {
2269                                         entry = tmp_entry;
2270                                         vm_map_clip_start(map, entry,
2271                                                           saved_start);
2272                                 }
2273                         }
2274                         continue;
2275                 }
2276                 vm_map_clip_end(map, entry, end);
2277
2278                 next = entry->next;
2279
2280                 /*
2281                  * Unwire before removing addresses from the pmap; otherwise,
2282                  * unwiring will put the entries back in the pmap.
2283                  */
2284                 if (entry->wired_count != 0) {
2285                         vm_map_entry_unwire(map, entry);
2286                 }
2287
2288                 if (!map->system_map)
2289                         VM_LOCK_GIANT();
2290                 pmap_remove(map->pmap, entry->start, entry->end);
2291                 if (!map->system_map)
2292                         VM_UNLOCK_GIANT();
2293
2294                 /*
2295                  * Delete the entry (which may delete the object) only after
2296                  * removing all pmap entries pointing to its pages.
2297                  * (Otherwise, its page frames may be reallocated, and any
2298                  * modify bits will be set in the wrong object!)
2299                  */
2300                 vm_map_entry_delete(map, entry);
2301                 entry = next;
2302         }
2303         return (KERN_SUCCESS);
2304 }
2305
2306 /*
2307  *      vm_map_remove:
2308  *
2309  *      Remove the given address range from the target map.
2310  *      This is the exported form of vm_map_delete.
2311  */
2312 int
2313 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
2314 {
2315         int result;
2316
2317         vm_map_lock(map);
2318         VM_MAP_RANGE_CHECK(map, start, end);
2319         result = vm_map_delete(map, start, end);
2320         vm_map_unlock(map);
2321         return (result);
2322 }
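
/*
 * Illustrative sketch, not part of the original file: tearing down a
 * mapping with vm_map_remove() above, as a munmap(2)-style caller
 * would.  The routine acquires and releases the map lock itself.
 */
static int
vm_map_remove_example(vm_map_t map, vm_offset_t start, vm_offset_t end)
{

        return (vm_map_remove(map, start, end));
}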
2323
2324 /*
2325  *      vm_map_check_protection:
2326  *
2327  *      Assert that the target map allows the specified privilege on the
2328  *      entire address region given.  The entire region must be allocated.
2329  *
2330  *      WARNING!  This code does not and should not check whether the
2331  *      contents of the region are accessible.  For example, a smaller file
2332  *      might be mapped into a larger address space.
2333  *
2334  *      NOTE!  This code is also called by munmap().
2335  *
2336  *      The map must be locked.  A read lock is sufficient.
2337  */
2338 boolean_t
2339 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
2340                         vm_prot_t protection)
2341 {
2342         vm_map_entry_t entry;
2343         vm_map_entry_t tmp_entry;
2344
2345         if (!vm_map_lookup_entry(map, start, &tmp_entry))
2346                 return (FALSE);
2347         entry = tmp_entry;
2348
2349         while (start < end) {
2350                 if (entry == &map->header)
2351                         return (FALSE);
2352                 /*
2353                  * No holes allowed!
2354                  */
2355                 if (start < entry->start)
2356                         return (FALSE);
2357                 /*
2358                  * Check protection associated with entry.
2359                  */
2360                 if ((entry->protection & protection) != protection)
2361                         return (FALSE);
2362                 /* go to next entry */
2363                 start = entry->end;
2364                 entry = entry->next;
2365         }
2366         return (TRUE);
2367 }
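
/*
 * Illustrative sketch, not part of the original file: checking that an
 * entire range is mapped with write permission.  vm_map_check_protection()
 * above requires the map to be locked; a read lock is sufficient.
 */
static boolean_t
vm_map_range_writable_example(vm_map_t map, vm_offset_t start,
    vm_offset_t end)
{
        boolean_t rv;

        vm_map_lock_read(map);
        rv = vm_map_check_protection(map, start, end, VM_PROT_WRITE);
        vm_map_unlock_read(map);
        return (rv);
}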
2368
2369 /*
2370  *      vm_map_copy_entry:
2371  *
2372  *      Copies the contents of the source entry to the destination
2373  *      entry.  The entries *must* be aligned properly.
2374  */
2375 static void
2376 vm_map_copy_entry(
2377         vm_map_t src_map,
2378         vm_map_t dst_map,
2379         vm_map_entry_t src_entry,
2380         vm_map_entry_t dst_entry)
2381 {
2382         vm_object_t src_object;
2383
2384         if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
2385                 return;
2386
2387         if (src_entry->wired_count == 0) {
2388
2389                 /*
2390                  * If the source entry is marked needs_copy, it is already
2391                  * write-protected.
2392                  */
2393                 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2394                         pmap_protect(src_map->pmap,
2395                             src_entry->start,
2396                             src_entry->end,
2397                             src_entry->protection & ~VM_PROT_WRITE);
2398                 }
2399
2400                 /*
2401                  * Make a copy of the object.
2402                  */
2403                 if ((src_object = src_entry->object.vm_object) != NULL) {
2404                         VM_OBJECT_LOCK(src_object);
2405                         if ((src_object->handle == NULL) &&
2406                                 (src_object->type == OBJT_DEFAULT ||
2407                                  src_object->type == OBJT_SWAP)) {
2408                                 vm_object_collapse(src_object);
2409                                 if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
2410                                         vm_object_split(src_entry);
2411                                         src_object = src_entry->object.vm_object;
2412                                 }
2413                         }
2414                         vm_object_reference_locked(src_object);
2415                         vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
2416                         VM_OBJECT_UNLOCK(src_object);
2417                         dst_entry->object.vm_object = src_object;
2418                         src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2419                         dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
2420                         dst_entry->offset = src_entry->offset;
2421                 } else {
2422                         dst_entry->object.vm_object = NULL;
2423                         dst_entry->offset = 0;
2424                 }
2425
2426                 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
2427                     dst_entry->end - dst_entry->start, src_entry->start);
2428         } else {
2429                 /*
2430                  * Of course, wired-down pages can't be set copy-on-write.
2431                  * Cause wired pages to be copied into the new map by
2432                  * simulating faults (the new pages are pageable).
2433                  */
2434                 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
2435         }
2436 }
2437
2438 /*
2439  * vmspace_map_entry_forked:
2440  * Update the newly-forked vmspace each time a map entry is inherited
2441  * or copied.  The values for vm_dsize and vm_tsize are approximate
2442  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
2443  */
2444 static void
2445 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
2446     vm_map_entry_t entry)
2447 {
2448         vm_size_t entrysize;
2449         vm_offset_t newend;
2450
2451         entrysize = entry->end - entry->start;
2452         vm2->vm_map.size += entrysize;
2453         if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
2454                 vm2->vm_ssize += btoc(entrysize);
2455         } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
2456             entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
2457                 newend = MIN(entry->end,
2458                     (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
2459                 vm2->vm_dsize += btoc(newend - entry->start);
2460         } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
2461             entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
2462                 newend = MIN(entry->end,
2463                     (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
2464                 vm2->vm_tsize += btoc(newend - entry->start);
2465         }
2466 }
2467
2468 /*
2469  * vmspace_fork:
2470  * Create a new process vmspace structure and vm_map
2471  * based on those of an existing process.  The new map
2472  * is based on the old map, according to the inheritance
2473  * values on the regions in that map.
2474  *
2475  * XXX It might be worth coalescing the entries added to the new vmspace.
2476  *
2477  * The source map must not be locked.
2478  */
2479 struct vmspace *
2480 vmspace_fork(struct vmspace *vm1)
2481 {
2482         struct vmspace *vm2;
2483         vm_map_t old_map = &vm1->vm_map;
2484         vm_map_t new_map;
2485         vm_map_entry_t old_entry;
2486         vm_map_entry_t new_entry;
2487         vm_object_t object;
2488
2489         vm_map_lock(old_map);
2490
2491         vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
2492         vm2->vm_taddr = vm1->vm_taddr;
2493         vm2->vm_daddr = vm1->vm_daddr;
2494         vm2->vm_maxsaddr = vm1->vm_maxsaddr;
2495         new_map = &vm2->vm_map; /* XXX */
2496         new_map->timestamp = 1;
2497
2498         /* Do not inherit the MAP_WIREFUTURE property. */
2499         if ((new_map->flags & MAP_WIREFUTURE) == MAP_WIREFUTURE)
2500                 new_map->flags &= ~MAP_WIREFUTURE;
2501
2502         old_entry = old_map->header.next;
2503
2504         while (old_entry != &old_map->header) {
2505                 if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
2506                         panic("vm_map_fork: encountered a submap");
2507
2508                 switch (old_entry->inheritance) {
2509                 case VM_INHERIT_NONE:
2510                         break;
2511
2512                 case VM_INHERIT_SHARE:
2513                         /*
2514                          * Clone the entry, creating the shared object if necessary.
2515                          */
2516                         object = old_entry->object.vm_object;
2517                         if (object == NULL) {
2518                                 object = vm_object_allocate(OBJT_DEFAULT,
2519                                         atop(old_entry->end - old_entry->start));
2520                                 old_entry->object.vm_object = object;
2521                                 old_entry->offset = 0;
2522                         }
2523
2524                         /*
2525                          * Add the reference before calling vm_object_shadow
2526                          * to ensure that a shadow object is created.
2527                          */
2528                         vm_object_reference(object);
2529                         if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
2530                                 vm_object_shadow(&old_entry->object.vm_object,
2531                                         &old_entry->offset,
2532                                         atop(old_entry->end - old_entry->start));
2533                                 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
2534                                 /* Transfer the second reference too. */
2535                                 vm_object_reference(
2536                                     old_entry->object.vm_object);
2537                                 vm_object_deallocate(object);
2538                                 object = old_entry->object.vm_object;
2539                         }
2540                         VM_OBJECT_LOCK(object);
2541                         vm_object_clear_flag(object, OBJ_ONEMAPPING);
2542                         VM_OBJECT_UNLOCK(object);
2543
2544                         /*
2545                          * Clone the entry, referencing the shared object.
2546                          */
2547                         new_entry = vm_map_entry_create(new_map);
2548                         *new_entry = *old_entry;
2549                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2550                         new_entry->wired_count = 0;
2551
2552                         /*
2553                          * Insert the entry into the new map -- we know we're
2554                          * inserting at the end of the new map.
2555                          */
2556                         vm_map_entry_link(new_map, new_map->header.prev,
2557                             new_entry);
2558                         vmspace_map_entry_forked(vm1, vm2, new_entry);
2559
2560                         /*
2561                          * Update the physical map
2562                          */
2563                         pmap_copy(new_map->pmap, old_map->pmap,
2564                             new_entry->start,
2565                             (old_entry->end - old_entry->start),
2566                             old_entry->start);
2567                         break;
2568
2569                 case VM_INHERIT_COPY:
2570                         /*
2571                          * Clone the entry and link into the map.
2572                          */
2573                         new_entry = vm_map_entry_create(new_map);
2574                         *new_entry = *old_entry;
2575                         new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2576                         new_entry->wired_count = 0;
2577                         new_entry->object.vm_object = NULL;
2578                         vm_map_entry_link(new_map, new_map->header.prev,
2579                             new_entry);
2580                         vmspace_map_entry_forked(vm1, vm2, new_entry);
2581                         vm_map_copy_entry(old_map, new_map, old_entry,
2582                             new_entry);
2583                         break;
2584                 }
2585                 old_entry = old_entry->next;
2586         }
2587
2588         vm_map_unlock(old_map);
2589
2590         return (vm2);
2591 }
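
/*
 * Illustrative sketch, not part of the original file: how fork-time
 * code might duplicate a parent's address space.  The parent's map
 * must not be locked; vmspace_fork() above takes it internally.
 */
static struct vmspace *
vmspace_fork_example(struct proc *p1)
{

        return (vmspace_fork(p1->p_vmspace));
}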
2592
2593 int
2594 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
2595     vm_prot_t prot, vm_prot_t max, int cow)
2596 {
2597         vm_map_entry_t new_entry, prev_entry;
2598         vm_offset_t bot, top;
2599         vm_size_t init_ssize;
2600         int orient, rv;
2601         rlim_t vmemlim;
2602
2603         /*
2604          * The stack orientation is piggybacked with the cow argument.
2605          * Extract it into orient and mask the cow argument so that we
2606          * don't pass it around further.
2607          * NOTE: We explicitly allow bi-directional stacks.
2608          */
2609         orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
2610         cow &= ~orient;
2611         KASSERT(orient != 0, ("No stack grow direction"));
2612
2613         if (addrbos < vm_map_min(map) || addrbos > map->max_offset)
2614                 return (KERN_NO_SPACE);
2615
2616         init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
2617
2618         PROC_LOCK(curthread->td_proc);
2619         vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM);
2620         PROC_UNLOCK(curthread->td_proc);
2621
2622         vm_map_lock(map);
2623
2624         /* If addr is already mapped, no go */
2625         if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
2626                 vm_map_unlock(map);
2627                 return (KERN_NO_SPACE);
2628         }
2629
2630         /* If we would blow our VMEM resource limit, no go */
2631         if (map->size + init_ssize > vmemlim) {
2632                 vm_map_unlock(map);
2633                 return (KERN_NO_SPACE);
2634         }
2635
2636         /*
2637          * If we can't accommodate max_ssize in the current mapping, no go.
2638          * However, we need to be aware that subsequent user mappings might
2639          * map into the space we have reserved for stack, and currently this
2640          * space is not protected.
2641          *
2642          * Hopefully we will at least detect this condition when we try to
2643          * grow the stack.
2644          */
2645         if ((prev_entry->next != &map->header) &&
2646             (prev_entry->next->start < addrbos + max_ssize)) {
2647                 vm_map_unlock(map);
2648                 return (KERN_NO_SPACE);
2649         }
2650
2651         /*
2652          * We initially map a stack of only init_ssize.  We will grow as
2653          * needed later.  Depending on the orientation of the stack (i.e.
2654          * the grow direction) we either map at the top of the range, the
2655          * bottom of the range or in the middle.
2656          *
2657          * Note: we would normally expect prot and max to be VM_PROT_ALL,
2658          * and cow to be 0.  Possibly we should eliminate these as input
2659          * parameters, and just pass these values here in the insert call.
2660          */
2661         if (orient == MAP_STACK_GROWS_DOWN)
2662                 bot = addrbos + max_ssize - init_ssize;
2663         else if (orient == MAP_STACK_GROWS_UP)
2664                 bot = addrbos;
2665         else
2666                 bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
2667         top = bot + init_ssize;
2668         rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
2669
2670         /* Now set the avail_ssize amount. */
2671         if (rv == KERN_SUCCESS) {
2672                 if (prev_entry != &map->header)
2673                         vm_map_clip_end(map, prev_entry, bot);
2674                 new_entry = prev_entry->next;
2675                 if (new_entry->end != top || new_entry->start != bot)
2676                         panic("Bad entry start/end for new stack entry");
2677
2678                 new_entry->avail_ssize = max_ssize - init_ssize;
2679                 if (orient & MAP_STACK_GROWS_DOWN)
2680                         new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
2681                 if (orient & MAP_STACK_GROWS_UP)
2682                         new_entry->eflags |= MAP_ENTRY_GROWS_UP;
2683         }
2684
2685         vm_map_unlock(map);
2686         return (rv);
2687 }
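
/*
 * Illustrative sketch, not part of the original file: reserving a
 * downward-growing stack with vm_map_stack() above.  As the comments in
 * that function note, prot and max are normally VM_PROT_ALL and the
 * grow direction is passed in through the cow argument.
 */
static int
vm_map_stack_down_example(vm_map_t map, vm_offset_t addrbos,
    vm_size_t max_ssize)
{

        return (vm_map_stack(map, addrbos, max_ssize, VM_PROT_ALL,
            VM_PROT_ALL, MAP_STACK_GROWS_DOWN));
}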
2688
2689 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
2690  * desired address is already mapped, or if we successfully grow
2691  * the stack.  Also returns KERN_SUCCESS if addr is outside the
2692  * stack range (this is strange, but preserves compatibility with
2693  * the grow function in vm_machdep.c).
2694  */
2695 int
2696 vm_map_growstack(struct proc *p, vm_offset_t addr)
2697 {
2698         vm_map_entry_t next_entry, prev_entry;
2699         vm_map_entry_t new_entry, stack_entry;
2700         struct vmspace *vm = p->p_vmspace;
2701         vm_map_t map = &vm->vm_map;
2702         vm_offset_t end;
2703         size_t grow_amount, max_grow;
2704         rlim_t stacklim, vmemlim;
2705         int is_procstack, rv;
2706
2707 Retry:
2708         PROC_LOCK(p);
2709         stacklim = lim_cur(p, RLIMIT_STACK);
2710         vmemlim = lim_cur(p, RLIMIT_VMEM);
2711         PROC_UNLOCK(p);
2712
2713         vm_map_lock_read(map);
2714
2715         /* If addr is already in the entry range, no need to grow. */
2716         if (vm_map_lookup_entry(map, addr, &prev_entry)) {
2717                 vm_map_unlock_read(map);
2718                 return (KERN_SUCCESS);
2719         }
2720
2721         next_entry = prev_entry->next;
2722         if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
2723                 /*
2724                  * This entry does not grow upwards. Since the address lies
2725                  * beyond this entry, the next entry (if one exists) has to
2726                  * be a downward growable entry. The entry list header is
2727                  * never a growable entry, so it suffices to check the flags.
2728                  */
2729                 if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
2730                         vm_map_unlock_read(map);
2731                         return (KERN_SUCCESS);
2732                 }
2733                 stack_entry = next_entry;
2734         } else {
2735                 /*
2736                  * This entry grows upward.  If the next entry does not at
2737                  * least grow downwards, this is the entry we need to grow.
2738                  * Otherwise we have two possible choices and we have to
2739                  * select one.
2740                  */
2741                 if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
2742                         /*
2743                          * We have two choices; grow the entry closest to
2744                          * the address to minimize the amount of growth.
2745                          */
2746                         if (addr - prev_entry->end <= next_entry->start - addr)
2747                                 stack_entry = prev_entry;
2748                         else
2749                                 stack_entry = next_entry;
2750                 } else
2751                         stack_entry = prev_entry;
2752         }
2753
2754         if (stack_entry == next_entry) {
2755                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
2756                 KASSERT(addr < stack_entry->start, ("foo"));
2757                 end = (prev_entry != &map->header) ? prev_entry->end :
2758                     stack_entry->start - stack_entry->avail_ssize;
2759                 grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
2760                 max_grow = stack_entry->start - end;
2761         } else {
2762                 KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
2763                 KASSERT(addr >= stack_entry->end, ("foo"));
2764                 end = (next_entry != &map->header) ? next_entry->start :
2765                     stack_entry->end + stack_entry->avail_ssize;
2766                 grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
2767                 max_grow = end - stack_entry->end;
2768         }
2769
2770         if (grow_amount > stack_entry->avail_ssize) {
2771                 vm_map_unlock_read(map);
2772                 return (KERN_NO_SPACE);
2773         }
2774
2775         /*
2776          * If there is no longer enough space between the entries, no go;
2777          * adjust the available space.  Note: this should only happen if the
2778          * user has mapped into the stack area after the stack was created,
2779          * and is probably an error.
2780          *
2781          * This also effectively destroys any guard page the user might have
2782          * intended by limiting the stack size.
2783          */
2784         if (grow_amount > max_grow) {
2785                 if (vm_map_lock_upgrade(map))
2786                         goto Retry;
2787
2788                 stack_entry->avail_ssize = max_grow;
2789
2790                 vm_map_unlock(map);
2791                 return (KERN_NO_SPACE);
2792         }
2793
2794         is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
2795
2796         /*
2797          * If this is the main process stack, see if we're over the stack
2798          * limit.
2799          */
2800         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
2801                 vm_map_unlock_read(map);
2802                 return (KERN_NO_SPACE);
2803         }
2804
2805         /* Round up the grow amount to a multiple of sgrowsiz. */
2806         grow_amount = roundup(grow_amount, sgrowsiz);
2807         if (grow_amount > stack_entry->avail_ssize)
2808                 grow_amount = stack_entry->avail_ssize;
2809         if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
2810                 grow_amount = stacklim - ctob(vm->vm_ssize);
2811         }
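
        /*
         * Worked example (hypothetical values): if the fault needs a single
         * 4KB page, sgrowsiz is 128KB, avail_ssize is 1MB, the stack limit
         * is 512KB and 448KB of stack are already in use, then grow_amount
         * is first rounded up to 128KB and then clamped to the 64KB still
         * allowed by the stack limit before the growth below is attempted.
         */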
2812
2813         /* If we would exceed our VMEM resource limit, fail. */
2814         if (map->size + grow_amount > vmemlim) {
2815                 vm_map_unlock_read(map);
2816                 return (KERN_NO_SPACE);
2817         }
2818
2819         if (vm_map_lock_upgrade(map))
2820                 goto Retry;
2821
2822         if (stack_entry == next_entry) {
2823                 /*
2824                  * Growing downward.
2825                  */
2826                 /* Get the preliminary new entry start value */
2827                 addr = stack_entry->start - grow_amount;
2828
2829                 /*
2830                  * If this puts us into the previous entry, cut back our
2831                  * growth to the available space. Also, see the note above.
2832                  */
2833                 if (addr < end) {
2834                         stack_entry->avail_ssize = max_grow;
2835                         addr = end;
2836                 }
2837
2838                 rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
2839                     p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
2840
2841                 /* Adjust the available stack space by the amount we grew. */
2842                 if (rv == KERN_SUCCESS) {
2843                         if (prev_entry != &map->header)
2844                                 vm_map_clip_end(map, prev_entry, addr);
2845                         new_entry = prev_entry->next;
2846                         KASSERT(new_entry == stack_entry->prev, ("wrong prev entry"));
2847                         KASSERT(new_entry->end == stack_entry->start, ("entries not adjacent"));
2848                         KASSERT(new_entry->start == addr, ("wrong new entry start"));
2849                         grow_amount = new_entry->end - new_entry->start;
2850                         new_entry->avail_ssize = stack_entry->avail_ssize -
2851                             grow_amount;
2852                         stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
2853                         new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
2854                 }
2855         } else {
2856                 /*
2857                  * Growing upward.
2858                  */
2859                 addr = stack_entry->end + grow_amount;
2860
2861                 /*
2862                  * If this puts us into the next entry, cut back our growth
2863                  * to the available space. Also, see the note above.
2864                  */
2865                 if (addr > end) {
2866                         stack_entry->avail_ssize = end - stack_entry->end;
2867                         addr = end;
2868                 }
2869
2870                 grow_amount = addr - stack_entry->end;
2871
2872                 /* Grow the underlying object if applicable. */
2873                 if (stack_entry->object.vm_object == NULL ||
2874                     vm_object_coalesce(stack_entry->object.vm_object,
2875                     stack_entry->offset,
2876                     (vm_size_t)(stack_entry->end - stack_entry->start),
2877                     (vm_size_t)grow_amount)) {
2878                         map->size += (addr - stack_entry->end);
2879                         /* Update the current entry. */
2880                         stack_entry->end = addr;
2881                         stack_entry->avail_ssize -= grow_amount;
2882                         vm_map_entry_resize_free(map, stack_entry);
2883                         rv = KERN_SUCCESS;
2884
2885                         if (next_entry != &map->header)
2886                                 vm_map_clip_start(map, next_entry, addr);
2887                 } else
2888                         rv = KERN_FAILURE;
2889         }
2890
2891         if (rv == KERN_SUCCESS && is_procstack)
2892                 vm->vm_ssize += btoc(grow_amount);
2893
2894         vm_map_unlock(map);
2895
2896         /*
2897          * Heed the MAP_WIREFUTURE flag if it was set for this process.
2898          */
2899         if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
2900                 vm_map_wire(map,
2901                     (stack_entry == next_entry) ? addr : addr - grow_amount,
2902                     (stack_entry == next_entry) ? stack_entry->start : addr,
2903                     (p->p_flag & P_SYSTEM)
2904                     ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
2905                     : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
2906         }
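
        /*
         * Background note (hedged): MAP_WIREFUTURE is normally set on a map
         * when the process calls mlockall(2) with the MCL_FUTURE flag, e.g.
         *
         *      (void)mlockall(MCL_CURRENT | MCL_FUTURE);
         *
         * so the pages of a stack grown for such a process are wired here
         * rather than faulted in later.
         */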
2907
2908         return (rv);
2909 }
2910
2911 /*
2912  * Unshare the specified VM space for exec.  If other processes are
2913  * mapped to it, then create a new one.  The new vmspace is empty.
2914  */
2915 void
2916 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
2917 {
2918         struct vmspace *oldvmspace = p->p_vmspace;
2919         struct vmspace *newvmspace;
2920
2921         newvmspace = vmspace_alloc(minuser, maxuser);
2922         newvmspace->vm_swrss = oldvmspace->vm_swrss;
2923         /*
2924          * This code is written like this for prototype purposes.  The
2925          * goal is to avoid running down the vmspace here, but let the
2926          * other processes that are still using the vmspace finally
2927          * run it down.  Even though there is little or no chance of blocking
2928          * here, it is a good idea to keep this form for future mods.
2929          */
2930         p->p_vmspace = newvmspace;
2931         if (p == curthread->td_proc)            /* XXXKSE ? */
2932                 pmap_activate(curthread);
2933         vmspace_free(oldvmspace);
2934 }
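
/*
 * Caller sketch (hedged; exec_new_vmspace() is the expected in-tree caller,
 * and the reuse-versus-replace decision shown here is an assumption, not a
 * copy of its logic):
 *
 *      struct sysentvec *sv = p->p_sysent;
 *
 *      if (p->p_vmspace->vm_refcnt > 1)
 *              vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
 *      else
 *              ... simply empty the existing map with vm_map_remove() ...
 */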
2935
2936 /*
2937  * Unshare the specified VM space for forcing COW.  This
2938  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
2939  */
2940 void
2941 vmspace_unshare(struct proc *p)
2942 {
2943         struct vmspace *oldvmspace = p->p_vmspace;
2944         struct vmspace *newvmspace;
2945
2946         if (oldvmspace->vm_refcnt == 1)
2947                 return;
2948         newvmspace = vmspace_fork(oldvmspace);
2949         p->p_vmspace = newvmspace;
2950         if (p == curthread->td_proc)            /* XXXKSE ? */
2951                 pmap_activate(curthread);
2952         vmspace_free(oldvmspace);
2953 }
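
/*
 * Caller sketch (a hedged reconstruction of the fork1() path the comment
 * above refers to; error handling and the other RF* resource flags are
 * omitted):
 *
 *      if ((flags & RFPROC) == 0) {
 *              if ((flags & RFMEM) == 0)
 *                      vmspace_unshare(p1);
 *              ...
 *      }
 */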
2954
2955 /*
2956  *      vm_map_lookup:
2957  *
2958  *      Finds the VM object, offset, and
2959  *      protection for a given virtual address in the
2960  *      specified map, assuming a page fault of the
2961  *      type specified.
2962  *
2963  *      Leaves the map in question locked for read; return
2964  *      values are guaranteed until a vm_map_lookup_done
2965  *      call is performed.  Note that the map argument
2966  *      is in/out; the returned map must be used in
2967  *      the call to vm_map_lookup_done.
2968  *
2969  *      A handle (out_entry) is returned for use in
2970  *      vm_map_lookup_done, to make that fast.
2971  *
2972  *      If a lookup is requested with "write protection"
2973  *      specified, the map may be changed to perform virtual
2974  *      copying operations, although the data referenced will
2975  *      remain the same.
2976  */
2977 int
2978 vm_map_lookup(vm_map_t *var_map,                /* IN/OUT */
2979               vm_offset_t vaddr,
2980               vm_prot_t fault_typea,
2981               vm_map_entry_t *out_entry,        /* OUT */
2982               vm_object_t *object,              /* OUT */
2983               vm_pindex_t *pindex,              /* OUT */
2984               vm_prot_t *out_prot,              /* OUT */
2985               boolean_t *wired)                 /* OUT */
2986 {
2987         vm_map_entry_t entry;
2988         vm_map_t map = *var_map;
2989         vm_prot_t prot;
2990         vm_prot_t fault_type = fault_typea;
2991
2992 RetryLookup:;
2993         /*
2994          * Lookup the faulting address.
2995          */
2996
2997         vm_map_lock_read(map);
2998 #define RETURN(why) \
2999                 { \
3000                 vm_map_unlock_read(map); \
3001                 return (why); \
3002                 }
3003
3004         /*
3005          * If the map has an interesting hint, try it before calling full
3006          * If the map has an interesting hint, try it before calling the
3007          * full-blown lookup routine.
3008         entry = map->root;
3009         *out_entry = entry;
3010         if (entry == NULL ||
3011             (vaddr < entry->start) || (vaddr >= entry->end)) {
3012                 /*
3013                  * Entry was either not a valid hint, or the vaddr was not
3014                  * contained in the entry, so do a full lookup.
3015                  */
3016                 if (!vm_map_lookup_entry(map, vaddr, out_entry))
3017                         RETURN(KERN_INVALID_ADDRESS);
3018
3019                 entry = *out_entry;
3020         }
3021
3022         /*
3023          * Handle submaps.
3024          */
3025         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3026                 vm_map_t old_map = map;
3027
3028                 *var_map = map = entry->object.sub_map;
3029                 vm_map_unlock_read(old_map);
3030                 goto RetryLookup;
3031         }
3032
3033         /*
3034          * Check whether this task is allowed to have this page.
3035          * Note the special case for MAP_ENTRY_COW
3036          * pages with an override.  This is to implement a forced
3037          * COW for debuggers.
3038          */
3039         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3040                 prot = entry->max_protection;
3041         else
3042                 prot = entry->protection;
3043         fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
3044         if ((fault_type & prot) != fault_type) {
3045                 RETURN(KERN_PROTECTION_FAILURE);
3046         }
3047         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3048             (entry->eflags & MAP_ENTRY_COW) &&
3049             (fault_type & VM_PROT_WRITE) &&
3050             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
3051                 RETURN(KERN_PROTECTION_FAILURE);
3052         }
3053
3054         /*
3055          * If this page is not pageable, we have to get it for all possible
3056          * accesses.
3057          */
3058         *wired = (entry->wired_count != 0);
3059         if (*wired)
3060                 prot = fault_type = entry->protection;
3061
3062         /*
3063          * If the entry was copy-on-write, we either shadow it or demote access.
3064          */
3065         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3066                 /*
3067                  * If we want to write the page, we may as well handle that
3068                  * now since we've got the map locked.
3069                  *
3070                  * If we don't need to write the page, we just demote the
3071                  * permissions allowed.
3072                  */
3073                 if (fault_type & VM_PROT_WRITE) {
3074                         /*
3075                          * Make a new object, and place it in the object
3076                          * chain.  Note that no new references have appeared
3077                          * -- one just moved from the map to the new
3078                          * object.
3079                          */
3080                         if (vm_map_lock_upgrade(map))
3081                                 goto RetryLookup;
3082
3083                         vm_object_shadow(
3084                             &entry->object.vm_object,
3085                             &entry->offset,
3086                             atop(entry->end - entry->start));
3087                         entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3088
3089                         vm_map_lock_downgrade(map);
3090                 } else {
3091                         /*
3092                          * We're attempting to read a copy-on-write page --
3093                          * don't allow writes.
3094                          */
3095                         prot &= ~VM_PROT_WRITE;
3096                 }
3097         }
3098
3099         /*
3100          * Create an object if necessary.
3101          */
3102         if (entry->object.vm_object == NULL &&
3103             !map->system_map) {
3104                 if (vm_map_lock_upgrade(map))
3105                         goto RetryLookup;
3106                 entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
3107                     atop(entry->end - entry->start));
3108                 entry->offset = 0;
3109                 vm_map_lock_downgrade(map);
3110         }
3111
3112         /*
3113          * Return the object/offset from this entry.  If the entry was
3114          * copy-on-write or empty, it has been fixed up.
3115          */
3116         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3117         *object = entry->object.vm_object;
3118
3119         *out_prot = prot;
3120         return (KERN_SUCCESS);
3121
3122 #undef  RETURN
3123 }
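
/*
 * Usage sketch for vm_map_lookup()/vm_map_lookup_done() (a minimal,
 * hypothetical caller in the style of a fault handler; "vaddr" is the
 * faulting address and the local names are illustrative only):
 *
 *      vm_map_t map = &curproc->p_vmspace->vm_map;
 *      vm_map_entry_t entry;
 *      vm_object_t object;
 *      vm_pindex_t pindex;
 *      vm_prot_t prot;
 *      boolean_t wired;
 *      int rv;
 *
 *      rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &object,
 *          &pindex, &prot, &wired);
 *      if (rv != KERN_SUCCESS)
 *              return (rv);
 *      ... resolve the fault against (object, pindex) ...
 *      vm_map_lookup_done(map, entry);
 *
 * Note that the map passed to vm_map_lookup_done() must be the one written
 * back by vm_map_lookup(), since the lookup may descend into a submap.
 */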
3124
3125 /*
3126  *      vm_map_lookup_locked:
3127  *
3128  *      Lookup the faulting address.  A version of vm_map_lookup that returns 
3129  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
3130  */
3131 int
3132 vm_map_lookup_locked(vm_map_t *var_map,         /* IN/OUT */
3133                      vm_offset_t vaddr,
3134                      vm_prot_t fault_typea,
3135                      vm_map_entry_t *out_entry, /* OUT */
3136                      vm_object_t *object,       /* OUT */
3137                      vm_pindex_t *pindex,       /* OUT */
3138                      vm_prot_t *out_prot,       /* OUT */
3139                      boolean_t *wired)          /* OUT */
3140 {
3141         vm_map_entry_t entry;
3142         vm_map_t map = *var_map;
3143         vm_prot_t prot;
3144         vm_prot_t fault_type = fault_typea;
3145
3146         /*
3147          * If the map has an interesting hint, try it before calling full
3148          * If the map has an interesting hint, try it before calling the
3149          * full-blown lookup routine.
3150         entry = map->root;
3151         *out_entry = entry;
3152         if (entry == NULL ||
3153             (vaddr < entry->start) || (vaddr >= entry->end)) {
3154                 /*
3155                  * Entry was either not a valid hint, or the vaddr was not
3156                  * contained in the entry, so do a full lookup.
3157                  */
3158                 if (!vm_map_lookup_entry(map, vaddr, out_entry))
3159                         return (KERN_INVALID_ADDRESS);
3160
3161                 entry = *out_entry;
3162         }
3163
3164         /*
3165          * Fail if the entry refers to a submap.
3166          */
3167         if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3168                 return (KERN_FAILURE);
3169
3170         /*
3171          * Check whether this task is allowed to have this page.
3172          * Note the special case for MAP_ENTRY_COW
3173          * pages with an override.  This is to implement a forced
3174          * COW for debuggers.
3175          */
3176         if (fault_type & VM_PROT_OVERRIDE_WRITE)
3177                 prot = entry->max_protection;
3178         else
3179                 prot = entry->protection;
3180         fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
3181         if ((fault_type & prot) != fault_type)
3182                 return (KERN_PROTECTION_FAILURE);
3183         if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
3184             (entry->eflags & MAP_ENTRY_COW) &&
3185             (fault_type & VM_PROT_WRITE) &&
3186             (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0)
3187                 return (KERN_PROTECTION_FAILURE);
3188
3189         /*
3190          * If this page is not pageable, we have to get it for all possible
3191          * accesses.
3192          */
3193         *wired = (entry->wired_count != 0);
3194         if (*wired)
3195                 prot = fault_type = entry->protection;
3196
3197         if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3198                 /*
3199                  * Fail if the entry was copy-on-write for a write fault.
3200                  */
3201                 if (fault_type & VM_PROT_WRITE)
3202                         return (KERN_FAILURE);
3203                 /*
3204                  * We're attempting to read a copy-on-write page --
3205                  * don't allow writes.
3206                  */
3207                 prot &= ~VM_PROT_WRITE;
3208         }
3209
3210         /*
3211          * Fail if an object should be created.
3212          */
3213         if (entry->object.vm_object == NULL && !map->system_map)
3214                 return (KERN_FAILURE);
3215
3216         /*
3217          * Return the object/offset from this entry.  If the entry was
3218          * copy-on-write or empty, it has been fixed up.
3219          */
3220         *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
3221         *object = entry->object.vm_object;
3222
3223         *out_prot = prot;
3224         return (KERN_SUCCESS);
3225 }
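
/*
 * Usage sketch (hypothetical caller that already holds the map read lock
 * and must not sleep; on failure it is expected to drop the lock and retry
 * with the blocking vm_map_lookup(), or give up):
 *
 *      rv = vm_map_lookup_locked(&map, vaddr, VM_PROT_WRITE, &entry,
 *          &object, &pindex, &prot, &wired);
 *      if (rv != KERN_SUCCESS) {
 *              vm_map_unlock_read(map);
 *              ... fall back to vm_map_lookup() or fail the operation ...
 *      }
 */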
3226
3227 /*
3228  *      vm_map_lookup_done:
3229  *
3230  *      Releases locks acquired by a vm_map_lookup
3231  *      (according to the handle returned by that lookup).
3232  */
3233 void
3234 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
3235 {
3236         /*
3237          * Unlock the main-level map
3238          */
3239         vm_map_unlock_read(map);
3240 }
3241
3242 #include "opt_ddb.h"
3243 #ifdef DDB
3244 #include <sys/kernel.h>
3245
3246 #include <ddb/ddb.h>
3247
3248 /*
3249  *      vm_map_print:   [ debug ]
3250  */
3251 DB_SHOW_COMMAND(map, vm_map_print)
3252 {
3253         static int nlines;
3254         /* XXX convert args. */
3255         vm_map_t map = (vm_map_t)addr;
3256         boolean_t full = have_addr;
3257
3258         vm_map_entry_t entry;
3259
3260         db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
3261             (void *)map,
3262             (void *)map->pmap, map->nentries, map->timestamp);
3263         nlines++;
3264
3265         if (!full && db_indent)
3266                 return;
3267
3268         db_indent += 2;
3269         for (entry = map->header.next; entry != &map->header;
3270             entry = entry->next) {
3271                 db_iprintf("map entry %p: start=%p, end=%p\n",
3272                     (void *)entry, (void *)entry->start, (void *)entry->end);
3273                 nlines++;
3274                 {
3275                         static char *inheritance_name[4] =
3276                         {"share", "copy", "none", "donate_copy"};
3277
3278                         db_iprintf(" prot=%x/%x/%s",
3279                             entry->protection,
3280                             entry->max_protection,
3281                             inheritance_name[(int)(unsigned char)entry->inheritance]);
3282                         if (entry->wired_count != 0)
3283                                 db_printf(", wired");
3284                 }
3285                 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3286                         db_printf(", share=%p, offset=0x%jx\n",
3287                             (void *)entry->object.sub_map,
3288                             (uintmax_t)entry->offset);
3289                         nlines++;
3290                         if ((entry->prev == &map->header) ||
3291                             (entry->prev->object.sub_map !=
3292                                 entry->object.sub_map)) {
3293                                 db_indent += 2;
3294                                 vm_map_print((db_expr_t)(intptr_t)
3295                                              entry->object.sub_map,
3296                                              full, 0, (char *)0);
3297                                 db_indent -= 2;
3298                         }
3299                 } else {
3300                         db_printf(", object=%p, offset=0x%jx",
3301                             (void *)entry->object.vm_object,
3302                             (uintmax_t)entry->offset);
3303                         if (entry->eflags & MAP_ENTRY_COW)
3304                                 db_printf(", copy (%s)",
3305                                     (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
3306                         db_printf("\n");
3307                         nlines++;
3308
3309                         if ((entry->prev == &map->header) ||
3310                             (entry->prev->object.vm_object !=
3311                                 entry->object.vm_object)) {
3312                                 db_indent += 2;
3313                                 vm_object_print((db_expr_t)(intptr_t)
3314                                                 entry->object.vm_object,
3315                                                 full, 0, (char *)0);
3316                                 nlines += 4;
3317                                 db_indent -= 2;
3318                         }
3319                 }
3320         }
3321         db_indent -= 2;
3322         if (db_indent == 0)
3323                 nlines = 0;
3324 }
3325
3326
3327 DB_SHOW_COMMAND(procvm, procvm)
3328 {
3329         struct proc *p;
3330
3331         if (have_addr) {
3332                 p = (struct proc *) addr;
3333         } else {
3334                 p = curproc;
3335         }
3336
3337         db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
3338             (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
3339             (void *)vmspace_pmap(p->p_vmspace));
3340
3341         vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
3342 }
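
/*
 * DDB usage sketch (the address below is hypothetical):
 *
 *      db> show procvm
 *      db> show map 0xc1234567
 *
 * "show procvm" defaults to curproc when no address is given; supplying an
 * address to "show map" selects the full listing, including backing objects.
 */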
3343
3344 #endif /* DDB */