From df8dd6025af88a99d34f549fa9591a9b8f9b75b1 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Tue, 14 Sep 2021 00:05:47 +0300 Subject: [PATCH] amd64: stop using top of the thread's kernel stack for FPU user save area Instead, do one more allocation at thread creation time. This frees a lot of space on the stack. Also, do not use alloca() for temporary storage in the signal delivery function sendsig() and the signal return syscall sys_sigreturn(). This saves an equal amount of space, again at the cost of one more allocation at thread creation time. A useful experiment now would be to reduce KSTACK_PAGES. Reviewed by: jhb, markj Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 1 week Differential revision: https://reviews.freebsd.org/D31954 --- sys/amd64/amd64/exec_machdep.c | 4 ++-- sys/amd64/amd64/fpu.c | 2 ++ sys/amd64/amd64/machdep.c | 14 -------------- sys/amd64/amd64/vm_machdep.c | 22 +++++++++++++--------- sys/amd64/ia32/ia32_signal.c | 6 +++--- sys/amd64/include/proc.h | 2 ++ sys/kern/kern_thread.c | 2 +- 7 files changed, 23 insertions(+), 29 deletions(-) diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c index 1297117638d..48bda05f968 100644 --- a/sys/amd64/amd64/exec_machdep.c +++ b/sys/amd64/amd64/exec_machdep.c @@ -135,7 +135,7 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); - xfpusave = __builtin_alloca(xfpusave_len); + xfpusave = (char *)td->td_md.md_fpu_scratch; } else { xfpusave_len = 0; xfpusave = NULL; @@ -674,7 +674,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp) if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); - xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); + xfpustate = (char *)td->td_md.md_fpu_scratch; ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) diff --git 
a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c index d7936b3b192..24986958d4c 100644 --- a/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -448,6 +448,8 @@ fpuinitstate(void *arg __unused) xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO); } + cpu_thread_alloc(&thread0); + saveintr = intr_disable(); stop_emulating(); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index d4e2356a9ae..5c9b6452660 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1258,7 +1258,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; - struct xstate_hdr *xhdr; uint64_t cr3, rsp0; pml4_entry_t *pml4e; pdp_entry_t *pdpe; @@ -1564,19 +1563,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) msgbufinit(msgbufp, msgbufsize); fpuinit(); - /* - * Reinitialize thread0's stack base now that the xsave area size is - * known. Set up thread0's pcb save area after fpuinit calculated fpu - * save area size. Zero out the extended state header in fpu save area. - */ - set_top_of_stack_td(&thread0); - thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); - bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size); - if (use_xsave) { - xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + - 1); - xhdr->xstate_bv = xsave_mask; - } /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ rsp0 = thread0.td_md.md_stack_base; /* Ensure the stack is aligned to 16 bytes */ diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 4567e6e0eb5..e42d16d61b3 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -90,19 +90,17 @@ void set_top_of_stack_td(struct thread *td) { td->td_md.md_stack_base = td->td_kstack + - td->td_kstack_pages * PAGE_SIZE - - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN); + td->td_kstack_pages * PAGE_SIZE; } struct savefpu * get_pcb_user_save_td(struct thread *td) { - vm_offset_t p; - - p = td->td_md.md_stack_base; - KASSERT((p % XSAVE_AREA_ALIGN) == 0, - ("Unaligned pcb_user_save area ptr %#lx td %p", p, td)); - return ((struct savefpu *)p); + KASSERT(((vm_offset_t)td->td_md.md_usr_fpu_save % + XSAVE_AREA_ALIGN) == 0, + ("Unaligned pcb_user_save area ptr %p td %p", + td->td_md.md_usr_fpu_save, td)); + return (td->td_md.md_usr_fpu_save); } struct pcb * @@ -393,6 +391,8 @@ cpu_thread_alloc(struct thread *td) set_top_of_stack_td(td); td->td_pcb = pcb = get_pcb_td(td); td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1; + td->td_md.md_usr_fpu_save = fpu_save_area_alloc(); + td->td_md.md_fpu_scratch = fpu_save_area_alloc(); pcb->pcb_save = get_pcb_user_save_pcb(pcb); if (use_xsave) { xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); @@ -404,8 +404,12 @@ cpu_thread_alloc(struct thread *td) void cpu_thread_free(struct thread *td) { - cpu_thread_clean(td); + + fpu_save_area_free(td->td_md.md_usr_fpu_save); + td->td_md.md_usr_fpu_save = NULL; + fpu_save_area_free(td->td_md.md_fpu_scratch); + td->td_md.md_fpu_scratch = NULL; } bool diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c index 49b5797d68f..9b67c7001a8 100644 --- a/sys/amd64/ia32/ia32_signal.c +++ b/sys/amd64/ia32/ia32_signal.c @@ -210,7 +210,7 @@ ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp) if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return 
(EINVAL); - xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); + xfpustate = (char *)td->td_md.md_fpu_scratch; ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate, mcp->mc_xfpustate_len); if (ret != 0) @@ -579,7 +579,7 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); - xfpusave = __builtin_alloca(xfpusave_len); + xfpusave = (char *)td->td_md.md_fpu_scratch; } else { xfpusave_len = 0; xfpusave = NULL; @@ -882,7 +882,7 @@ freebsd32_sigreturn(td, uap) td->td_proc->p_pid, td->td_name, xfpustate_len); return (EINVAL); } - xfpustate = __builtin_alloca(xfpustate_len); + xfpustate = (char *)td->td_md.md_fpu_scratch; error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate), xfpustate, xfpustate_len); if (error != 0) { diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 0f8cf50e326..bd07f70f8d4 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -75,6 +75,8 @@ struct mdthread { int md_efirt_dis_pf; /* (k) */ struct pcb md_pcb; vm_offset_t md_stack_base; + struct savefpu *md_usr_fpu_save; + struct savefpu *md_fpu_scratch; }; struct mdproc { diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 65c5cc65c87..62f93940637 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -91,7 +91,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0x110, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x4a8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x6c0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); -- 2.45.0