From b82151c2a926cf1c3d4145ee6d4070d32e509054 Mon Sep 17 00:00:00 2001
From: kib
Date: Sun, 30 Jul 2017 10:36:20 +0000
Subject: [PATCH] Merge MAP_GUARD.

MFC r316687 (by markj), r320314, r320317, r320338, r320339, r320344,
r320430, r320560 (by alc), r320801, r320843, r321173, r321230.

Tested by:	pho
Sponsored by:	The FreeBSD Foundation

git-svn-id: svn://svn.freebsd.org/base/stable/10@321717 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
---
 lib/libc/sys/mmap.2   |  47 ++++-
 lib/libc/sys/munmap.2 |   4 +-
 sys/sys/mman.h        |   1 +
 sys/sys/param.h       |   5 +-
 sys/vm/vm.h           |   1 +
 sys/vm/vm_fault.c     |  18 +-
 sys/vm/vm_map.c       | 473 +++++++++++++++++++++---------------------
 sys/vm/vm_map.h       |  10 +-
 sys/vm/vm_mmap.c      |  40 +++-
 9 files changed, 333 insertions(+), 266 deletions(-)

diff --git a/lib/libc/sys/mmap.2 b/lib/libc/sys/mmap.2
index 7380a7fb3..1c9ff9477 100644
--- a/lib/libc/sys/mmap.2
+++ b/lib/libc/sys/mmap.2
@@ -28,7 +28,7 @@
 .\" @(#)mmap.2	8.4 (Berkeley) 5/11/95
 .\" $FreeBSD$
 .\"
-.Dd September 17, 2014
+.Dd June 22, 2017
 .Dt MMAP 2
 .Os
 .Sh NAME
@@ -173,9 +173,21 @@ In contrast, if
 .Dv MAP_EXCL
 is specified, the request will fail if a mapping
 already exists within the range.
-.It Dv MAP_HASSEMAPHORE
-Notify the kernel that the region may contain semaphores and that special
-handling may be necessary.
+.It Dv MAP_GUARD
+Instead of a mapping, create a guard of the specified size.
+Guards allow a process to create reservations in its address space,
+which can later be replaced by actual mappings.
+.Pp
+.Fa mmap
+will not create mappings in the address range of a guard unless
+the request specifies
+.Dv MAP_FIXED .
+Guards can be destroyed with
+.Xr munmap 2 .
+Any memory access by a thread to the guarded range results
+in the delivery of a
+.Dv SIGSEGV
+signal to that thread.
 .It Dv MAP_NOCORE
 Region is not included in a core file.
 .It Dv MAP_NOSYNC
@@ -278,6 +290,7 @@ must include at least
 .Dv PROT_READ
 and
 .Dv PROT_WRITE .
+.Pp
 This option creates
 a memory region that grows to at most
 .Fa len
@@ -288,6 +301,12 @@ stack top is the starting address returned by the call, plus
 bytes.
 The bottom of the stack at maximum growth is the starting
 address returned by the call.
+.Pp
+Stacks created with
+.Dv MAP_STACK
+automatically grow.
+Guards prevent inadvertent use of the regions into which those
+stacks can grow without requiring mapping the whole stack in advance.
 .El
 .Pp
 The
@@ -375,6 +394,7 @@ were specified.
 .It Bq Er EINVAL
 None of
 .Dv MAP_ANON ,
+.Dv MAP_GUARD ,
 .Dv MAP_PRIVATE ,
 .Dv MAP_SHARED ,
 or
@@ -424,6 +444,25 @@ were specified, but the requested region is already used by a mapping.
 was specified, but
 .Dv MAP_FIXED
 was not.
+.It Bq Er EINVAL
+.Dv MAP_GUARD
+was specified, but the
+.Fa offset
+argument was not zero, the
+.Fa fd
+argument was not -1, or the
+.Fa prot
+argument was not
+.Dv PROT_NONE .
+.It Bq Er EINVAL
+.Dv MAP_GUARD
+was specified together with one of the flags
+.Dv MAP_ANON ,
+.Dv MAP_PREFAULT ,
+.Dv MAP_PREFAULT_READ ,
+.Dv MAP_PRIVATE ,
+.Dv MAP_SHARED ,
+.Dv MAP_STACK .
 .It Bq Er ENODEV
 .Dv MAP_ANON
 has not been specified and
diff --git a/lib/libc/sys/munmap.2 b/lib/libc/sys/munmap.2
index 643107286..ce5420203 100644
--- a/lib/libc/sys/munmap.2
+++ b/lib/libc/sys/munmap.2
@@ -28,7 +28,7 @@
 .\" @(#)munmap.2	8.3 (Berkeley) 5/27/94
 .\" $FreeBSD$
 .\"
-.Dd May 27, 1994
+.Dd June 22, 2017
 .Dt MUNMAP 2
 .Os
 .Sh NAME
@@ -44,7 +44,7 @@
 The
 .Fn munmap
 system call
-deletes the mappings for the specified address range,
+deletes the mappings and guards for the specified address range,
 and causes further references to addresses within the range
 to generate invalid memory references.
 .Sh RETURN VALUES
diff --git a/sys/sys/mman.h b/sys/sys/mman.h
index 497fd6fd5..b654eb5c7 100644
--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -90,6 +90,7 @@
 /*
  * Extended flags
  */
+#define	MAP_GUARD	 0x00002000 /* reserve but don't map address range */
 #define	MAP_EXCL	 0x00004000 /* for MAP_FIXED, fail if address is used */
 #define	MAP_NOCORE	 0x00020000 /* dont include these pages in a coredump */
 #define	MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index bab3df3aa..ca1ecfc1d 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1003515	/* Master, propagated to newvers */
+#define __FreeBSD_version 1003516	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
@@ -76,10 +76,11 @@
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
-#ifdef _KERNEL
+#if defined(_KERNEL) || defined(IN_RTLD)
 #define	P_OSREL_SIGWAIT		700000
 #define	P_OSREL_SIGSEGV		700004
 #define	P_OSREL_MAP_ANON	800104
+#define	P_OSREL_MAP_GUARD	1200035
 
 #define	P_OSREL_MAJOR(x)	((x) / 100000)
 #endif
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
index 0eb756891..687fef531 100644
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -78,6 +78,7 @@ typedef u_char vm_prot_t;	/* protection codes */
 #define	VM_PROT_WRITE		((vm_prot_t) 0x02)
 #define	VM_PROT_EXECUTE		((vm_prot_t) 0x04)
 #define	VM_PROT_COPY		((vm_prot_t) 0x08)	/* copy-on-read */
+#define	VM_PROT_FAULT_LOOKUP	((vm_prot_t) 0x010)
 
 #define	VM_PROT_ALL		(VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
 #define	VM_PROT_RW		(VM_PROT_READ|VM_PROT_WRITE)
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index e948982c1..c970728ce 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -336,7 +336,7 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	vm_prot_t prot;
 	long ahead, behind;
 	int alloc_req, era, faultcount, nera, reqpage, result;
-	boolean_t dead, growstack, is_first_object_locked, wired;
+	boolean_t dead, is_first_object_locked, wired;
 	vm_object_t next_object;
 	vm_page_t marray[VM_FAULT_READ_MAX];
 	int hardfault;
@@ -345,7 +345,6 @@ vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
 	int locked, error;
 
 	hardfault = 0;
-	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
 	faultcount = reqpage = 0;
@@ -357,17 +356,10 @@ RetryFault:;
 	 * search.
*/ fs.map = map; - result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, - &fs.first_object, &fs.first_pindex, &prot, &wired); + result = vm_map_lookup(&fs.map, vaddr, fault_type | + VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object, + &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { - if (growstack && result == KERN_INVALID_ADDRESS && - map != kernel_map) { - result = vm_map_growstack(curproc, vaddr); - if (result != KERN_SUCCESS) - return (KERN_FAILURE); - growstack = FALSE; - goto RetryFault; - } unlock_vp(&fs); return (result); } @@ -393,6 +385,8 @@ RetryFault:; goto RetryFault; } + MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0); + if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); else diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index 6913d22bc..238317ac3 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -133,6 +133,8 @@ static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map); static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry); static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry); +static int vm_map_growstack(vm_map_t map, vm_offset_t addr, + vm_map_entry_t gap_entry); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); @@ -1167,6 +1169,10 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, if (prev_entry->next != &map->header && prev_entry->next->start < end) return (KERN_NO_SPACE); + if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL || + max != VM_PROT_NONE)) + return (KERN_INVALID_ARGUMENT); + protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; @@ -1182,13 +1188,19 @@ vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, protoeflags |= MAP_ENTRY_GROWS_UP; if (cow & MAP_VN_WRITECOUNT) protoeflags |= MAP_ENTRY_VN_WRITECNT; + if ((cow & MAP_CREATE_GUARD) != 0) + protoeflags |= MAP_ENTRY_GUARD; + if ((cow & MAP_CREATE_STACK_GAP_DN) != 0) + protoeflags |= MAP_ENTRY_STACK_GAP_DN; + if ((cow & MAP_CREATE_STACK_GAP_UP) != 0) + protoeflags |= MAP_ENTRY_STACK_GAP_UP; if (cow & MAP_INHERIT_SHARE) inheritance = VM_INHERIT_SHARE; else inheritance = VM_INHERIT_DEFAULT; cred = NULL; - if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT)) + if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0) goto charged; if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) && ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) { @@ -1237,7 +1249,8 @@ charged: if (prev_entry->inheritance == inheritance && prev_entry->protection == prot && prev_entry->max_protection == max) { - map->size += end - prev_entry->end; + if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0) + map->size += end - prev_entry->end; prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); @@ -1274,7 +1287,6 @@ charged: new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; - new_entry->avail_ssize = 0; new_entry->inheritance = inheritance; new_entry->protection = prot; @@ -1292,7 +1304,8 @@ charged: * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); - map->size += new_entry->end - new_entry->start; + if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0) + map->size += new_entry->end - new_entry->start; /* * Try to coalesce the new entry with both the previous and next @@ -1496,6 
+1509,25 @@ again: return (result); } +int +vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset, + vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr, + vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max, + int cow) +{ + vm_offset_t hint; + int rv; + + hint = *addr; + for (;;) { + rv = vm_map_find(map, object, offset, addr, length, max_addr, + find_space, prot, max, cow); + if (rv == KERN_SUCCESS || min_addr >= hint) + return (rv); + *addr = hint = min_addr; + } +} + /* * vm_map_simplify_entry: * @@ -1625,7 +1657,8 @@ _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ - if (entry->object.vm_object == NULL && !map->system_map) { + if (entry->object.vm_object == NULL && !map->system_map && + (entry->eflags & MAP_ENTRY_GUARD) == 0) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); @@ -1702,7 +1735,8 @@ _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ - if (entry->object.vm_object == NULL && !map->system_map) { + if (entry->object.vm_object == NULL && !map->system_map && + (entry->eflags & MAP_ENTRY_GUARD) == 0) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); @@ -1929,8 +1963,10 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, /* * Make a first pass to check for protection violations. */ - current = entry; - while ((current != &map->header) && (current->start < end)) { + for (current = entry; current != &map->header && current->start < end; + current = current->next) { + if ((current->eflags & MAP_ENTRY_GUARD) != 0) + continue; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); @@ -1939,23 +1975,22 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } - current = current->next; } - /* * Do an accounting pass for private read-only mappings that * now will do cow due to allowed write (e.g. debugger sets * breakpoint on text segment) */ - for (current = entry; (current != &map->header) && - (current->start < end); current = current->next) { + for (current = entry; current != &map->header && current->start < end; + current = current->next) { vm_map_clip_end(map, current, end); if (set_max || ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 || - ENTRY_CHARGED(current)) { + ENTRY_CHARGED(current) || + (current->eflags & MAP_ENTRY_GUARD) != 0) { continue; } @@ -2002,8 +2037,11 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, * Go back and fix up protections. [Note that clipping is not * necessary the second time.] 
*/ - current = entry; - while ((current != &map->header) && (current->start < end)) { + for (current = entry; current != &map->header && current->start < end; + current = current->next) { + if ((current->eflags & MAP_ENTRY_GUARD) != 0) + continue; + old_prot = current->protection; if (set_max) @@ -2037,7 +2075,6 @@ vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, #undef MASK } vm_map_simplify_entry(map, current); - current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); @@ -2258,7 +2295,9 @@ vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); - entry->inheritance = new_inheritance; + if ((entry->eflags & MAP_ENTRY_GUARD) == 0 || + new_inheritance != VM_INHERIT_ZERO) + entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } @@ -2864,6 +2903,15 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) vm_map_entry_unlink(map, entry); object = entry->object.vm_object; + + if ((entry->eflags & MAP_ENTRY_GUARD) != 0) { + MPASS(entry->cred == NULL); + MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0); + MPASS(object == NULL); + vm_map_entry_deallocate(entry, map->system_map); + return; + } + size = entry->end - entry->start; map->size -= size; @@ -3222,6 +3270,8 @@ vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_size_t entrysize; vm_offset_t newend; + if ((entry->eflags & MAP_ENTRY_GUARD) != 0) + return; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { @@ -3258,6 +3308,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) vm_map_entry_t new_entry, old_entry; vm_object_t object; int locked; + vm_inherit_t inh; old_map = &vm1->vm_map; /* Copy immutable fields of vm1 to vm2. */ @@ -3280,7 +3331,12 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); - switch (old_entry->inheritance) { + inh = old_entry->inheritance; + if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 && + inh != VM_INHERIT_NONE) + inh = VM_INHERIT_COPY; + + switch (inh) { case VM_INHERIT_NONE: break; @@ -3413,7 +3469,6 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge) new_entry->start = old_entry->start; new_entry->end = old_entry->end; - new_entry->avail_ssize = old_entry->avail_ssize; new_entry->eflags = old_entry->eflags & ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT); @@ -3476,30 +3531,40 @@ out: return (rv); } +static int stack_guard_page = 1; +SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN, + &stack_guard_page, 0, + "Specifies the number of guard pages for a stack that grows"); + static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; - vm_offset_t bot, top; - vm_size_t init_ssize; + vm_offset_t bot, gap_bot, gap_top, top; + vm_size_t init_ssize, sgp; int orient, rv; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. - * NOTE: We explicitly allow bi-directional stacks. 
*/ - orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); + orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP); KASSERT(orient != 0, ("No stack grow direction")); + KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP), + ("bi-dir stack")); + sgp = (vm_size_t)stack_guard_page * PAGE_SIZE; if (addrbos < vm_map_min(map) || addrbos > vm_map_max(map) || - addrbos + max_ssize < addrbos) + addrbos + max_ssize < addrbos || + sgp >= max_ssize) return (KERN_NO_SPACE); - init_ssize = (max_ssize < growsize) ? max_ssize : growsize; + init_ssize = growsize; + if (max_ssize < init_ssize + sgp) + init_ssize = max_ssize - sgp; /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) @@ -3507,12 +3572,6 @@ vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, /* * If we can't accomodate max_ssize in the current mapping, no go. - * However, we need to be aware that subsequent user mappings might - * map into the space we have reserved for stack, and currently this - * space is not protected. - * - * Hopefully we will at least detect this condition when we try to - * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) @@ -3528,58 +3587,53 @@ vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ - if (orient == MAP_STACK_GROWS_DOWN) + if (orient == MAP_STACK_GROWS_DOWN) { bot = addrbos + max_ssize - init_ssize; - else if (orient == MAP_STACK_GROWS_UP) + top = bot + init_ssize; + gap_bot = addrbos; + gap_top = bot; + } else /* if (orient == MAP_STACK_GROWS_UP) */ { bot = addrbos; - else - bot = round_page(addrbos + max_ssize/2 - init_ssize/2); - top = bot + init_ssize; - rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); - - /* Now set the avail_ssize amount. */ - if (rv == KERN_SUCCESS) { - new_entry = prev_entry->next; - if (new_entry->end != top || new_entry->start != bot) - panic("Bad entry start/end for new stack entry"); - - new_entry->avail_ssize = max_ssize - init_ssize; - KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || - (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, - ("new entry lacks MAP_ENTRY_GROWS_DOWN")); - KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || - (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, - ("new entry lacks MAP_ENTRY_GROWS_UP")); + top = bot + init_ssize; + gap_bot = top; + gap_top = addrbos + max_ssize; } - + rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); + if (rv != KERN_SUCCESS) + return (rv); + new_entry = prev_entry->next; + KASSERT(new_entry->end == top || new_entry->start == bot, + ("Bad entry start/end for new stack entry")); + KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 || + (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0, + ("new entry lacks MAP_ENTRY_GROWS_DOWN")); + KASSERT((orient & MAP_STACK_GROWS_UP) == 0 || + (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0, + ("new entry lacks MAP_ENTRY_GROWS_UP")); + rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE, + VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ? 
+ MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP)); + if (rv != KERN_SUCCESS) + (void)vm_map_delete(map, bot, top); return (rv); } -static int stack_guard_page = 0; -TUNABLE_INT("security.bsd.stack_guard_page", &stack_guard_page); -SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RW, - &stack_guard_page, 0, - "Insert stack guard page ahead of the growable segments."); - -/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the - * desired address is already mapped, or if we successfully grow - * the stack. Also returns KERN_SUCCESS if addr is outside the - * stack range (this is strange, but preserves compatibility with - * the grow function in vm_machdep.c). +/* + * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we + * successfully grow the stack. */ -int -vm_map_growstack(struct proc *p, vm_offset_t addr) +static int +vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry) { - vm_map_entry_t next_entry, prev_entry; - vm_map_entry_t new_entry, stack_entry; - struct vmspace *vm = p->p_vmspace; - vm_map_t map = &vm->vm_map; - vm_offset_t end; - vm_size_t growsize; - size_t grow_amount, max_grow; - rlim_t lmemlim, stacklim, vmemlim; - int is_procstack, rv; + vm_map_entry_t stack_entry; + struct proc *p; + struct vmspace *vm; struct ucred *cred; + vm_offset_t gap_end, gap_start, grow_start; + size_t grow_amount, guard, max_grow; + rlim_t lmemlim, stacklim, vmemlim; + int rv, rv1; + bool gap_deleted, grow_down, is_procstack; #ifdef notyet uint64_t limit; #endif @@ -3587,127 +3641,84 @@ vm_map_growstack(struct proc *p, vm_offset_t addr) int error; #endif -Retry: + p = curproc; + vm = p->p_vmspace; + + /* + * Disallow stack growth when the access is performed by a + * debugger or AIO daemon. The reason is that the wrong + * resource limits are applied. + */ + if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL) + return (KERN_FAILURE); + + MPASS(!map->system_map); + + guard = stack_guard_page * PAGE_SIZE; PROC_LOCK(p); lmemlim = lim_cur(p, RLIMIT_MEMLOCK); stacklim = lim_cur(p, RLIMIT_STACK); vmemlim = lim_cur(p, RLIMIT_VMEM); PROC_UNLOCK(p); - - vm_map_lock_read(map); - - /* If addr is already in the entry range, no need to grow.*/ - if (vm_map_lookup_entry(map, addr, &prev_entry)) { - vm_map_unlock_read(map); +retry: + /* If addr is not in a hole for a stack grow area, no need to grow. */ + if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry)) + return (KERN_FAILURE); + if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0) return (KERN_SUCCESS); - } - - next_entry = prev_entry->next; - if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { - /* - * This entry does not grow upwards. Since the address lies - * beyond this entry, the next entry (if one exists) has to - * be a downward growable entry. The entry list header is - * never a growable entry, so it suffices to check the flags. - */ - if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { - vm_map_unlock_read(map); - return (KERN_SUCCESS); - } - stack_entry = next_entry; - } else { - /* - * This entry grows upward. If the next entry does not at - * least grow downwards, this is the entry we need to grow. - * otherwise we have two possible choices and we have to - * select one. - */ - if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { - /* - * We have two choices; grow the entry closest to - * the address to minimize the amount of growth. 
- */ - if (addr - prev_entry->end <= next_entry->start - addr) - stack_entry = prev_entry; - else - stack_entry = next_entry; - } else - stack_entry = prev_entry; - } - - if (stack_entry == next_entry) { - KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); - KASSERT(addr < stack_entry->start, ("foo")); - end = (prev_entry != &map->header) ? prev_entry->end : - stack_entry->start - stack_entry->avail_ssize; - grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); - max_grow = stack_entry->start - end; + if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) { + stack_entry = gap_entry->next; + if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 || + stack_entry->start != gap_entry->end) + return (KERN_FAILURE); + grow_amount = round_page(stack_entry->start - addr); + grow_down = true; + } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) { + stack_entry = gap_entry->prev; + if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 || + stack_entry->end != gap_entry->start) + return (KERN_FAILURE); + grow_amount = round_page(addr + 1 - stack_entry->end); + grow_down = false; } else { - KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); - KASSERT(addr >= stack_entry->end, ("foo")); - end = (next_entry != &map->header) ? next_entry->start : - stack_entry->end + stack_entry->avail_ssize; - grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); - max_grow = end - stack_entry->end; + return (KERN_FAILURE); } - - if (grow_amount > stack_entry->avail_ssize) { - vm_map_unlock_read(map); + max_grow = gap_entry->end - gap_entry->start; + if (guard > max_grow) return (KERN_NO_SPACE); - } - - /* - * If there is no longer enough space between the entries nogo, and - * adjust the available space. Note: this should only happen if the - * user has mapped into the stack area after the stack was created, - * and is probably an error. - * - * This also effectively destroys any guard page the user might have - * intended by limiting the stack size. - */ - if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) { - if (vm_map_lock_upgrade(map)) - goto Retry; - - stack_entry->avail_ssize = max_grow; - - vm_map_unlock(map); + max_grow -= guard; + if (grow_amount > max_grow) return (KERN_NO_SPACE); - } - - is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr && - addr < (vm_offset_t)p->p_sysent->sv_usrstack) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. 
*/ - if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { - vm_map_unlock_read(map); + is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr && + addr < (vm_offset_t)p->p_sysent->sv_usrstack; + if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) return (KERN_NO_SPACE); - } + #ifdef RACCT if (racct_enable) { PROC_LOCK(p); if (is_procstack && racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) { PROC_UNLOCK(p); - vm_map_unlock_read(map); return (KERN_NO_SPACE); } PROC_UNLOCK(p); } #endif - /* Round up the grow amount modulo sgrowsiz */ - growsize = sgrowsiz; - grow_amount = roundup(grow_amount, growsize); - if (grow_amount > stack_entry->avail_ssize) - grow_amount = stack_entry->avail_ssize; + grow_amount = roundup(grow_amount, sgrowsiz); + if (grow_amount > max_grow) + grow_amount = max_grow; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = trunc_page((vm_size_t)stacklim) - ctob(vm->vm_ssize); } + #ifdef notyet PROC_LOCK(p); limit = racct_get_available(p, RACCT_STACK); @@ -3715,9 +3726,9 @@ Retry: if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit)) grow_amount = limit - ctob(vm->vm_ssize); #endif - if (!old_mlock && map->flags & MAP_WIREFUTURE) { + + if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) { if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) { - vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } @@ -3727,7 +3738,6 @@ Retry: if (racct_set(p, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + grow_amount)) { PROC_UNLOCK(p); - vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } @@ -3735,9 +3745,9 @@ Retry: } #endif } + /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { - vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } @@ -3746,7 +3756,6 @@ Retry: PROC_LOCK(p); if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) { PROC_UNLOCK(p); - vm_map_unlock_read(map); rv = KERN_NO_SPACE; goto out; } @@ -3754,62 +3763,42 @@ Retry: } #endif - if (vm_map_lock_upgrade(map)) - goto Retry; - - if (stack_entry == next_entry) { - /* - * Growing downward. - */ - /* Get the preliminary new entry start value */ - addr = stack_entry->start - grow_amount; + if (vm_map_lock_upgrade(map)) { + gap_entry = NULL; + vm_map_lock_read(map); + goto retry; + } - /* - * If this puts us into the previous entry, cut back our - * growth to the available space. Also, see the note above. - */ - if (addr < end) { - stack_entry->avail_ssize = max_grow; - addr = end; - if (stack_guard_page) - addr += PAGE_SIZE; + if (grow_down) { + grow_start = gap_entry->end - grow_amount; + if (gap_entry->start + grow_amount == gap_entry->end) { + gap_start = gap_entry->start; + gap_end = gap_entry->end; + vm_map_entry_delete(map, gap_entry); + gap_deleted = true; + } else { + MPASS(gap_entry->start < gap_entry->end - grow_amount); + gap_entry->end -= grow_amount; + vm_map_entry_resize_free(map, gap_entry); + gap_deleted = false; } - - rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, - next_entry->protection, next_entry->max_protection, + rv = vm_map_insert(map, NULL, 0, grow_start, + grow_start + grow_amount, + stack_entry->protection, stack_entry->max_protection, MAP_STACK_GROWS_DOWN); - - /* Adjust the available stack space by the amount we grew. 
*/ - if (rv == KERN_SUCCESS) { - new_entry = prev_entry->next; - KASSERT(new_entry == stack_entry->prev, ("foo")); - KASSERT(new_entry->end == stack_entry->start, ("foo")); - KASSERT(new_entry->start == addr, ("foo")); - KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != - 0, ("new entry lacks MAP_ENTRY_GROWS_DOWN")); - grow_amount = new_entry->end - new_entry->start; - new_entry->avail_ssize = stack_entry->avail_ssize - - grow_amount; - stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; + if (rv != KERN_SUCCESS) { + if (gap_deleted) { + rv1 = vm_map_insert(map, NULL, 0, gap_start, + gap_end, VM_PROT_NONE, VM_PROT_NONE, + MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN); + MPASS(rv1 == KERN_SUCCESS); + } else { + gap_entry->end += grow_amount; + vm_map_entry_resize_free(map, gap_entry); + } } } else { - /* - * Growing upward. - */ - addr = stack_entry->end + grow_amount; - - /* - * If this puts us into the next entry, cut back our growth - * to the available space. Also, see the note above. - */ - if (addr > end) { - stack_entry->avail_ssize = end - stack_entry->end; - addr = end; - if (stack_guard_page) - addr -= PAGE_SIZE; - } - - grow_amount = addr - stack_entry->end; + grow_start = stack_entry->end; cred = stack_entry->cred; if (cred == NULL && stack_entry->object.vm_object != NULL) cred = stack_entry->object.vm_object->cred; @@ -3821,35 +3810,32 @@ Retry: stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount, cred != NULL)) { - map->size += (addr - stack_entry->end); - /* Update the current entry. */ - stack_entry->end = addr; - stack_entry->avail_ssize -= grow_amount; + if (gap_entry->start + grow_amount == gap_entry->end) + vm_map_entry_delete(map, gap_entry); + else + gap_entry->start += grow_amount; + stack_entry->end += grow_amount; + map->size += grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; - - if (next_entry != &map->header) - vm_map_clip_start(map, next_entry, addr); } else rv = KERN_FAILURE; } - if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); - vm_map_unlock(map); - /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ - if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { - vm_map_wire(map, - (stack_entry == next_entry) ? addr : addr - grow_amount, - (stack_entry == next_entry) ? stack_entry->start : addr, + if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) { + vm_map_unlock(map); + vm_map_wire(map, grow_start, grow_start + grow_amount, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); - } + vm_map_lock_read(map); + } else + vm_map_lock_downgrade(map); out: #ifdef RACCT @@ -3973,10 +3959,11 @@ vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_size_t size; struct ucred *cred; -RetryLookup:; +RetryLookup: vm_map_lock_read(map); +RetryLookupLocked: /* * Lookup the faulting address. */ @@ -4002,7 +3989,16 @@ RetryLookup:; * Check whether this task is allowed to have this page. 
*/ prot = entry->protection; - fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); + if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) { + fault_typea &= ~VM_PROT_FAULT_LOOKUP; + if (prot == VM_PROT_NONE && map != kernel_map && + (entry->eflags & MAP_ENTRY_GUARD) != 0 && + (entry->eflags & (MAP_ENTRY_STACK_GAP_DN | + MAP_ENTRY_STACK_GAP_UP)) != 0 && + vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS) + goto RetryLookupLocked; + } + fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) { vm_map_unlock_read(map); return (KERN_PROTECTION_FAILURE); @@ -4236,8 +4232,9 @@ vm_map_print(vm_map_t map) db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { - db_iprintf("map entry %p: start=%p, end=%p\n", - (void *)entry, (void *)entry->start, (void *)entry->end); + db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n", + (void *)entry, (void *)entry->start, (void *)entry->end, + entry->eflags); { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h index a8378ef6c..bb067193d 100644 --- a/sys/vm/vm_map.h +++ b/sys/vm/vm_map.h @@ -103,7 +103,6 @@ struct vm_map_entry { struct vm_map_entry *right; /* right child in binary search tree */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ - vm_offset_t avail_ssize; /* amt can grow if this is a stack */ vm_size_t adj_free; /* amount of adjacent free space */ vm_size_t max_free; /* max free space in subtree */ union vm_map_object object; /* object I point to */ @@ -142,6 +141,9 @@ struct vm_map_entry { #define MAP_ENTRY_WIRE_SKIPPED 0x4000 #define MAP_ENTRY_VN_WRITECNT 0x8000 /* writeable vnode mapping */ +#define MAP_ENTRY_GUARD 0x10000 +#define MAP_ENTRY_STACK_GAP_DN 0x20000 +#define MAP_ENTRY_STACK_GAP_UP 0x40000 #ifdef _KERNEL static __inline u_char @@ -316,6 +318,7 @@ long vmspace_resident_count(struct vmspace *vmspace); #define MAP_PREFAULT_PARTIAL 0x0010 #define MAP_DISABLE_SYNCER 0x0020 #define MAP_CHECK_EXCL 0x0040 +#define MAP_CREATE_GUARD 0x0080 #define MAP_DISABLE_COREDUMP 0x0100 #define MAP_PREFAULT_MADVISE 0x0200 /* from (user) madvise request */ #define MAP_VN_WRITECOUNT 0x0400 @@ -323,6 +326,8 @@ long vmspace_resident_count(struct vmspace *vmspace); #define MAP_STACK_GROWS_UP 0x2000 #define MAP_ACC_CHARGED 0x4000 #define MAP_ACC_NO_CHARGE 0x8000 +#define MAP_CREATE_STACK_GAP_UP 0x10000 +#define MAP_CREATE_STACK_GAP_DN 0x20000 /* * vm_fault option flags @@ -368,6 +373,8 @@ vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t); int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t); int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, + vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int); int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *); @@ -390,7 +397,6 @@ int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t); int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int); void vm_map_simplify_entry (vm_map_t, vm_map_entry_t); int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int); -int vm_map_growstack (struct proc *p, vm_offset_t addr); int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t 
end, int flags); int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index 5ac0eeebf..5a5e7e808 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -230,6 +230,10 @@ sys_mmap(td, uap) } if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); + if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 || + pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT | + MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0)) + return (EINVAL); /* * Align the file position to a page boundary, @@ -299,7 +303,12 @@ sys_mmap(td, uap) lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); } - if (flags & MAP_ANON) { + if ((flags & MAP_GUARD) != 0) { + handle = NULL; + handle_type = OBJT_DEFAULT; + maxprot = VM_PROT_NONE; + cap_maxprot = VM_PROT_NONE; + } else if ((flags & MAP_ANON) != 0) { /* * Mapping blank space is trivial. */ @@ -1450,7 +1459,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, objtype_t handle_type, void *handle, vm_ooffset_t foff) { - boolean_t fitit; + boolean_t curmap, fitit; + vm_offset_t max_addr; vm_object_t object = NULL; struct thread *td = curthread; int docow, error, findspace, rv; @@ -1461,7 +1471,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, size = round_page(size); - if (map == &td->td_proc->p_vmspace->vm_map) { + curmap = map == &td->td_proc->p_vmspace->vm_map; + if (curmap) { PROC_LOCK(td->td_proc); if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); @@ -1571,6 +1582,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; + if ((flags & MAP_GUARD) != 0) + docow |= MAP_CREATE_GUARD; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) @@ -1580,11 +1593,26 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; - rv = vm_map_find(map, object, foff, addr, size, + max_addr = 0; #ifdef MAP_32BIT - flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR : + if ((flags & MAP_32BIT) != 0) + max_addr = MAP_32BIT_MAX_ADDR; #endif - 0, findspace, prot, maxprot, docow); + if (curmap) { + vm_offset_t min_addr; + + PROC_LOCK(td->td_proc); + min_addr = round_page((vm_offset_t)td->td_proc-> + p_vmspace->vm_daddr + lim_max(td->td_proc, + RLIMIT_DATA)); + PROC_UNLOCK(td->td_proc); + rv = vm_map_find_min(map, object, foff, addr, size, + min_addr, max_addr, + findspace, prot, maxprot, docow); + } else { + rv = vm_map_find(map, object, foff, addr, size, + max_addr, findspace, prot, maxprot, docow); + } } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); -- 2.42.0
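
A user-space illustration of the semantics documented in the mmap.2 hunk above may help reviewers. This is a hypothetical sketch, not part of the change: it assumes a kernel and libc that already carry MAP_GUARD (matching the P_OSREL_MAP_GUARD value 1200035 added to param.h), and the reservation and chunk sizes are arbitrary choices for the example.

#include <sys/mman.h>

#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t page, resv_size;
	char *resv, *chunk;

	page = (size_t)getpagesize();
	resv_size = 4096 * page;	/* arbitrary reservation size */

	/*
	 * Reserve address space without mapping it.  Per the mmap.2 text
	 * in this change, MAP_GUARD requires PROT_NONE, fd == -1 and
	 * offset 0, and may not be combined with MAP_ANON, MAP_SHARED,
	 * MAP_PRIVATE, MAP_STACK or MAP_PREFAULT*.  Any access to the
	 * guarded range delivers SIGSEGV to the touching thread.
	 */
	resv = mmap(NULL, resv_size, PROT_NONE, MAP_GUARD, -1, 0);
	if (resv == MAP_FAILED)
		err(1, "mmap(MAP_GUARD)");

	/*
	 * Later, replace part of the reservation with a real mapping.
	 * Only a MAP_FIXED request is allowed to land inside a guard.
	 */
	chunk = mmap(resv, 16 * page, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
	if (chunk == MAP_FAILED)
		err(1, "mmap(MAP_FIXED)");
	memset(chunk, 0, 16 * page);
	printf("guard at %p, mapped chunk at %p\n", (void *)resv,
	    (void *)chunk);

	/* munmap(2) destroys guards as well as ordinary mappings. */
	if (munmap(resv, resv_size) == -1)
		err(1, "munmap");
	return (0);
}

The param.h hunk exposes P_OSREL_MAP_GUARD under IN_RTLD as well as _KERNEL, presumably so that the run-time linker can compare the process osrel against it before relying on the new flag; user code that must also run on older kernels would want a similar osreldate check before using a pattern like the one above.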