From 4beb16137ddf8a201323a58fa8be20374ddf85e1 Mon Sep 17 00:00:00 2001
From: kib
Date: Sun, 23 Aug 2020 20:19:04 +0000
Subject: [PATCH] amd64 pmap: LA57 AKA 5-level paging

Since LA57 was moved into the main SDM document with revision 072, it
seems we should support it, and silicon is coming. This patch makes
pmap support both LA48 and LA57 hardware. The page table depth is
selected at startup; the kernel always receives control from the
loader with 4-level paging. It is not clear how the UEFI spec will
adopt LA57; for instance, it might sometimes hand control over in
LA57 mode.

Switching from LA48 to LA57 requires turning off long mode, requesting
LA57 in CR4, and then re-entering long mode. This is somewhat delicate
and is done in pmap_bootstrap_la57(). AP startup in LA57 mode is much
easier: we only need to toggle a bit in CR4 and load the right value
into CR3.

I decided not to change the kernel map for now. A single PML5 entry is
created that points to the existing kernel_pml4 (KPML4phys) page, and
another pml5 entry creates the recursive mapping for
vtopte()/vtopde(). This decision is motivated by the fact that we
cannot overcommit KVA, so the large address space there is unusable
until machines start providing wider physical memory addressing.
Another reason is that I do not want to break our fragile autotuning,
so the KVA expansion is not included in this first step. A nice side
effect is that minidumps stay compatible. On the other hand, the
(very) large address space is definitely immediately useful for some
userspace applications.

For userspace, the numbering of pte entries (or rather, of page table
pages) is always done for the 5-level structures even when we operate
in 4-level mode. The pmap_is_la57() function is added to report the
mode of the specified pmap. It exists not to allow mixing 4- and
5-level pmaps simultaneously (the hardware does not allow that
anyway), but to accommodate EPT, which has separate level control and
in principle might not support 5-level EPT even though x86 paging
does. Anyway, 5-level EPT support does not seem critical right now.
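For illustration only (a sketch, not part of the patch): the
level-agnostic numbering of page table pages described above can be
modelled in a few lines of standalone C. The shifts and the NUP*
counts below are assumed illustrative values mirroring what the kernel
takes from param.h/pmap.h, and the helpers mirror the pmap_*_pindex()
functions added by this patch.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative constants; the kernel gets these from machine headers. */
#define	PDRSHIFT	21			/* one PDE maps 2M */
#define	PDPSHIFT	30			/* one PDPE maps 1G */
#define	PML4SHIFT	39			/* one PML4E maps 512G */
#define	PML5SHIFT	48			/* one PML5E maps 256T */
#define	NUPML5E		256ULL			/* user half of the PML5 page */
#define	NUPML4E		(NUPML5E * 512)		/* user PML4 entries, LA57-sized */
#define	NUPDPE		(NUPML4E * 512)		/* user PDP entries */
#define	NUPDE		(NUPDPE * 512)		/* user PD entries */

static uint64_t
pde_pindex(uint64_t va)		/* page table page mapping va */
{
	return (va >> PDRSHIFT);
}

static uint64_t
pdpe_pindex(uint64_t va)	/* page directory page mapping va */
{
	return (NUPDE + (va >> PDPSHIFT));
}

static uint64_t
pml4e_pindex(uint64_t va)	/* page directory pointer page mapping va */
{
	return (NUPDE + NUPDPE + (va >> PML4SHIFT));
}

static uint64_t
pml5e_pindex(uint64_t va)	/* PML4 page mapping va (LA57 pmaps only) */
{
	return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
}

int
main(void)
{
	uint64_t va = 0x00007fffdeadb000ULL;	/* arbitrary user address */

	printf("pt %" PRIu64 " pd %" PRIu64 " pdp %" PRIu64 " pml4 %" PRIu64 "\n",
	    pde_pindex(va), pdpe_pindex(va), pml4e_pindex(va), pml5e_pindex(va));
	return (0);
}

With this layout a ptepindex below NUPDE always names a page table
page, indexes in [NUPDE, NUPDE + NUPDPE) name page directory pages,
then come the page directory pointer pages and finally (for LA57
pmaps) the PML4 pages, independent of whether the pmap actually runs
with 4 or 5 levels.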
Tested by: pho (LA48 hardware) Reviewed by: alc Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D25273 --- sys/amd64/amd64/elf_machdep.c | 124 +++- sys/amd64/amd64/genassym.c | 9 +- sys/amd64/amd64/locore.S | 63 +- sys/amd64/amd64/mp_machdep.c | 40 +- sys/amd64/amd64/mpboot.S | 27 +- sys/amd64/amd64/pmap.c | 787 +++++++++++++++++++----- sys/amd64/include/md_var.h | 2 + sys/amd64/include/param.h | 6 + sys/amd64/include/pmap.h | 72 ++- sys/amd64/include/proc.h | 2 + sys/amd64/include/vmparam.h | 31 +- sys/amd64/linux/linux_sysvec.c | 8 +- sys/amd64/vmm/amd/svm.c | 2 +- sys/amd64/vmm/intel/vmx.c | 2 +- sys/cddl/dev/dtrace/amd64/dtrace_subr.c | 3 +- 15 files changed, 951 insertions(+), 227 deletions(-) diff --git a/sys/amd64/amd64/elf_machdep.c b/sys/amd64/amd64/elf_machdep.c index 1ab28676ce7..3182d1758b1 100644 --- a/sys/amd64/amd64/elf_machdep.c +++ b/sys/amd64/amd64/elf_machdep.c @@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include -struct sysentvec elf64_freebsd_sysvec = { +struct sysentvec elf64_freebsd_sysvec_la48 = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_errsize = 0, @@ -64,9 +64,9 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), .sv_copyout_strings = exec_copyout_strings, @@ -78,14 +78,64 @@ struct sysentvec elf64_freebsd_sysvec = { .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_stackgap = elf64_stackgap, }; -INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); + +struct sysentvec elf64_freebsd_sysvec_la57 = { + .sv_size = SYS_MAXSYSCALL, + .sv_table = sysent, + .sv_errsize = 0, + .sv_errtbl = NULL, + .sv_transtrap = NULL, + .sv_fixup = __elfN(freebsd_fixup), + .sv_sendsig = sendsig, + .sv_sigcode = sigcode, + .sv_szsigcode = &szsigcode, + .sv_name = "FreeBSD ELF64", + .sv_coredump = __elfN(coredump), + .sv_imgact_try = NULL, + .sv_minsigstksz = MINSIGSTKSZ, + .sv_minuser = VM_MIN_ADDRESS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA57, + .sv_usrstack = USRSTACK_LA57, + .sv_psstrings = PS_STRINGS_LA57, + .sv_stackprot = VM_PROT_ALL, + .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), + .sv_copyout_strings = exec_copyout_strings, + .sv_setregs = exec_setregs, + .sv_fixlimit = NULL, + .sv_maxssiz = NULL, + .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP | + SV_TIMEKEEP, + .sv_set_syscall_retval = cpu_set_syscall_retval, + .sv_fetch_syscall_args = cpu_fetch_syscall_args, + .sv_syscallnames = syscallnames, + .sv_shared_page_base = SHAREDPAGE_LA57, + .sv_shared_page_len = PAGE_SIZE, + .sv_schedtail = NULL, + .sv_thread_detach = NULL, + .sv_trap = NULL, + .sv_stackgap = elf64_stackgap, +}; + +static void +amd64_init_sysvecs(void *arg) +{ + amd64_lower_shared_page(&elf64_freebsd_sysvec_la48); + if (la57) { + exec_sysvec_init(&elf64_freebsd_sysvec_la57); + exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57, + &elf64_freebsd_sysvec_la48); + } else { + exec_sysvec_init(&elf64_freebsd_sysvec_la48); + } +} 
+SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL); void amd64_lower_shared_page(struct sysentvec *sv) @@ -98,29 +148,57 @@ amd64_lower_shared_page(struct sysentvec *sv) } } -/* - * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter - * uses the value of sv_shared_page_base. - */ -SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) amd64_lower_shared_page, - &elf64_freebsd_sysvec); +static boolean_t +freebsd_brand_info_la57_img_compat(struct image_params *imgp, + int32_t *osrel __unused, uint32_t *fctl0) +{ + if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0) + return (TRUE); + if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0) + return (FALSE); + if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0) + return (FALSE); + return (TRUE); +} -static Elf64_Brandinfo freebsd_brand_info = { +static Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, +}; + +static Elf64_Brandinfo freebsd_brand_info_la57 = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/libexec/ld-elf.so.1", + .sysvec = &elf64_freebsd_sysvec_la57, + .interp_newpath = NULL, + .brand_note = &elf64_freebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, + .header_supported = freebsd_brand_info_la57_img_compat, }; +static void +sysinit_register_elf64_brand_entries(void *arg __unused) +{ + /* + * _57 must go first so it can either claim the image or hand + * it to _48. 
+ */ + if (la57) + elf64_insert_brand_entry(&freebsd_brand_info_la57); + elf64_insert_brand_entry(&freebsd_brand_info_la48); +} SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_info); + sysinit_register_elf64_brand_entries, NULL); static Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, @@ -128,15 +206,14 @@ static Elf64_Brandinfo freebsd_brand_oinfo = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_oinfo); + (sysinit_cfunc_t)elf64_insert_brand_entry, &freebsd_brand_oinfo); static Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, @@ -144,15 +221,14 @@ static Elf64_Brandinfo kfreebsd_brand_info = { .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &kfreebsd_brand_info); + (sysinit_cfunc_t)elf64_insert_brand_entry, &kfreebsd_brand_info); void elf64_dump_thread(struct thread *td, void *dst, size_t *off) diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index ec3707ce41f..75500555105 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -99,11 +99,10 @@ ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); -ASSYM(addr_PTmap, addr_PTmap); -ASSYM(addr_PDmap, addr_PDmap); -ASSYM(addr_PDPmap, addr_PDPmap); -ASSYM(addr_PML4map, addr_PML4map); -ASSYM(addr_PML4pml4e, addr_PML4pml4e); +ASSYM(addr_P4Tmap, addr_P4Tmap); +ASSYM(addr_P4Dmap, addr_P4Dmap); +ASSYM(addr_P5Tmap, addr_P5Tmap); +ASSYM(addr_P5Dmap, addr_P5Dmap); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S index d070c10693b..a9a7b5f3972 100644 --- a/sys/amd64/amd64/locore.S +++ b/sys/amd64/amd64/locore.S @@ -36,13 +36,8 @@ /* * Compiled KERNBASE location */ - .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend + .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend .set kernbase,KERNBASE - .set loc_PTmap,addr_PTmap - .set loc_PDmap,addr_PDmap - .set loc_PDPmap,addr_PDPmap - .set loc_PML4map,addr_PML4map - .set loc_PML4pml4e,addr_PML4pml4e .set dmapbase,DMAP_MIN_ADDRESS .set dmapend,DMAP_MAX_ADDRESS @@ -82,6 +77,62 @@ NON_GPROF_ENTRY(btext) 0: hlt jmp 0b +/* la57_trampoline(%rdi pml5) */ +NON_GPROF_ENTRY(la57_trampoline) + movq %rsp,%r11 + movq %rbx,%r10 + leaq la57_trampoline_end(%rip),%rsp + + movq %cr0,%rdx + lgdtq la57_trampoline_gdt_desc(%rip) + + pushq $(2<<3) + leaq l1(%rip),%rax + leaq l2(%rip),%rbx + + pushq %rax + lretq + .code32 + +l1: movl $(3<<3),%eax + movl %eax,%ss + + movl %edx,%eax + andl $~CR0_PG,%eax + movl %eax,%cr0 + + movl %cr4,%eax + orl $CR4_LA57,%eax + movl %eax,%cr4 + + movl %edi,%cr3 + movl %edx,%cr0 + + pushl $(1<<3) + pushl %ebx + lretl + .code64 + +l2: movq %r11,%rsp + movq %r10,%rbx + retq + .p2align 4,0 
+NON_GPROF_ENTRY(la57_trampoline_gdt_desc) + .word la57_trampoline_end - la57_trampoline_gdt + .long 0 /* filled by pmap_bootstrap_la57 */ + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt) + .long 0x00000000 /* null desc */ + .long 0x00000000 + .long 0x00000000 /* 64bit code */ + .long 0x00209800 + .long 0x0000ffff /* 32bit code */ + .long 0x00cf9b00 + .long 0x0000ffff /* universal data */ + .long 0x00cf9300 + .dcb.l 16,0 +NON_GPROF_ENTRY(la57_trampoline_end) + .bss ALIGN_DATA /* just to be sure */ .globl bootstack diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index d46362ba9f9..844cb49b536 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -96,7 +96,7 @@ __FBSDID("$FreeBSD$"); #define GiB(v) (v ## ULL << 30) -#define AP_BOOTPT_SZ (PAGE_SIZE * 3) +#define AP_BOOTPT_SZ (PAGE_SIZE * 4) /* Temporary variables for init_secondary() */ char *doublefault_stack; @@ -104,6 +104,8 @@ char *mce_stack; char *nmi_stack; char *dbg_stack; +extern u_int mptramp_la57; + /* * Local data and functions. */ @@ -240,6 +242,8 @@ cpu_mp_start(void) assign_cpu_ids(); + mptramp_la57 = la57; + /* Start each Application Processor */ init_ops.start_all_aps(); @@ -395,9 +399,9 @@ mp_realloc_pcpu(int cpuid, int domain) int native_start_all_aps(void) { - u_int64_t *pt4, *pt3, *pt2; + u_int64_t *pt5, *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i; + int apic_id, cpu, domain, i, xo; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); @@ -406,18 +410,38 @@ native_start_all_aps(void) bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); /* Locate the page tables, they'll be below the trampoline */ - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + if (la57) { + pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + xo = 1; + } else { + xo = 0; + } + pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { - /* Each slot of the level 4 pages points to the same level 3 page */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); + if (la57) { + pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + PAGE_SIZE); + pt5[i] |= PG_V | PG_RW | PG_U; + } + + /* + * Each slot of the level 4 pages points to the same + * level 3 page. + */ + pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + (xo + 1) * PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; - /* Each slot of the level 3 pages points to the same level 2 page */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); + /* + * Each slot of the level 3 pages points to the same + * level 2 page. + */ + pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + ((xo + 2) * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S index 5545fe9290d..fb75d2b8844 100644 --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -90,10 +90,16 @@ protmode: mov $bootdata-gdt, %eax mov %ax, %ds - /* Turn on the PAE bit for when paging is enabled */ + /* + * Turn on the PAE bit and optionally the LA57 bit for when paging + * is later enabled. 
+ */ mov %cr4, %eax orl $CR4_PAE, %eax - mov %eax, %cr4 + cmpb $0, mptramp_la57-mptramp_start(%ebx) + je 1f + orl $CR4_LA57, %eax +1: mov %eax, %cr4 /* * Enable EFER.LME so that we get long mode when all the prereqs are @@ -132,9 +138,9 @@ protmode: /* * At this point paging is enabled, and we are in "compatibility" mode. * We do another far jump to reload %cs with the 64 bit selector. - * %cr3 points to a 4-level page table page. + * %cr3 points to a 4- or 5-level page table. * We cannot yet jump all the way to the kernel because we can only - * specify a 32 bit linear address. So, yet another trampoline. + * specify a 32 bit linear address. So, we use yet another trampoline. * * The following instruction is: * ljmp $kernelcode-gdt, $tramp_64-mptramp_start @@ -209,6 +215,11 @@ gdtend: mptramp_pagetables: .long 0 + /* 5-level paging ? */ + .globl mptramp_la57 +mptramp_la57: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -251,8 +262,12 @@ entry_64: * Load a real %cr3 that has all the direct map stuff and switches * off the 1GB replicated mirror. Load a stack pointer and jump * into AP startup code in C. - */ + */ + cmpl $0, la57 + jne 2f movq KPML4phys, %rax - movq %rax, %cr3 + jmp 3f +2: movq KPML5phys, %rax +3: movq %rax, %cr3 movq bootSTK, %rsp jmp init_secondary diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index d025beff451..4b17debd480 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -398,6 +398,19 @@ static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); +int __read_frequently la57 = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &la57, 0, + "5-level paging for host is enabled"); + +static bool +pmap_is_la57(pmap_t pmap) +{ + if (pmap->pm_type == PT_X86) + return (la57); + return (false); /* XXXKIB handle EPT */ +} + #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ @@ -405,7 +418,10 @@ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ +u_int64_t KPML5phys; /* phys addr of kernel level 5, + if supported */ +static pml4_entry_t *kernel_pml4; static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ @@ -1257,7 +1273,7 @@ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, - struct rwlock **lockp); + struct rwlock **lockp, vm_offset_t va); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, @@ -1271,20 +1287,85 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /* Inline functions */ /********************/ -/* Return a non-clipped PD index for a given VA */ +/* + * Return a non-clipped indexes for a given VA, which are page table + * pages indexes at the corresponding level. 
+ */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } +static __inline vm_pindex_t +pmap_pdpe_pindex(vm_offset_t va) +{ + return (NUPDE + (va >> PDPSHIFT)); +} + +static __inline vm_pindex_t +pmap_pml4e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + (va >> PML4SHIFT)); +} + +static __inline vm_pindex_t +pmap_pml5e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); +} + +static __inline pml4_entry_t * +pmap_pml5e(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_u(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) +{ + pml4_entry_t *pml4e; + + /* XXX MPASS(pmap_is_la57(pmap); */ + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + return (&pml4e[pmap_pml4e_index(va)]); +} /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { + pml5_entry_t *pml5e; + pml4_entry_t *pml4e; + pt_entry_t PG_V; - return (&pmap->pm_pml4[pmap_pml4e_index(va)]); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, va); + PG_V = pmap_valid_bit(pmap); + if ((*pml5e & PG_V) == 0) + return (NULL); + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + } else { + pml4e = pmap->pm_pmltop; + } + return (&pml4e[pmap_pml4e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml4e_u(pmap_t pmap, vm_offset_t va) +{ + MPASS(!pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ @@ -1306,7 +1387,7 @@ pmap_pdpe(pmap_t pmap, vm_offset_t va) PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); - if ((*pml4e & PG_V) == 0) + if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } @@ -1387,21 +1468,37 @@ pmap_resident_count_dec(pmap_t pmap, int count) PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); - return (PTmap + ((va >> PAGE_SHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); + } else { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); + } } static __inline pd_entry_t * vtopde(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); - return (PDmap + ((va >> PDRSHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Dmap + ((va >> PDRSHIFT) & mask)); + } else { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Dmap + ((va >> PDRSHIFT) & mask)); + } } static u_int64_t @@ -1658,6 +1755,8 @@ create_pagetables(vm_paddr_t *firstaddr) p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } + + kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* @@ -1730,7 +1829,7 @@ pmap_bootstrap(vm_paddr_t 
*firstaddr) * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); + kernel_pmap->pm_pmltop = kernel_pml4; kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ @@ -1891,6 +1990,148 @@ pmap_init_pat(void) load_cr4(cr4); } +extern const char la57_trampoline[], la57_trampoline_gdt_desc[], + la57_trampoline_gdt[], la57_trampoline_end[]; + +static void +pmap_bootstrap_la57(void *arg __unused) +{ + char *v_code; + pml5_entry_t *v_pml5; + pml4_entry_t *v_pml4; + pdp_entry_t *v_pdp; + pd_entry_t *v_pd; + pt_entry_t *v_pt; + vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; + void (*la57_tramp)(uint64_t pml5); + struct region_descriptor r_gdt; + + if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) + return; + if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57)) + la57 = 1; + if (!la57) + return; + + r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; + r_gdt.rd_base = (long)__pcpu[0].pc_gdt; + + m_code = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_code->flags & PG_ZERO) == 0) + pmap_zero_page(m_code); + v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); + m_pml5 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml5->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml5); + KPML5phys = VM_PAGE_TO_PHYS(m_pml5); + v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); + m_pml4 = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pml4->flags & PG_ZERO) == 0) + pmap_zero_page(m_pml4); + v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); + m_pdp = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pdp->flags & PG_ZERO) == 0) + pmap_zero_page(m_pdp); + v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); + m_pd = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pd->flags & PG_ZERO) == 0) + pmap_zero_page(m_pd); + v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); + m_pt = vm_page_alloc_contig(NULL, 0, + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if ((m_pt->flags & PG_ZERO) == 0) + pmap_zero_page(m_pt); + v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); + + /* + * Map m_code 1:1, it appears below 4G in KVA due to physical + * address being below 4G. Since kernel KVA is in upper half, + * the pml4e should be zero and free for temporary use. 
+ */ + kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g; + + /* + * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. + */ + v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = + VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M; + + /* + * Copy and call the 48->57 trampoline, hope we return there, alive. + */ + bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); + *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = + la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); + la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); + la57_tramp(KPML5phys); + + /* + * gdt was necessary reset, switch back to our gdt. + */ + lgdt(&r_gdt); + wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); + load_ds(_udatasel); + load_es(_udatasel); + load_fs(_ufssel); + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + + /* + * Now unmap the trampoline, and free the pages. + * Clear pml5 entry used for 1:1 trampoline mapping. + */ + pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); + invlpg((vm_offset_t)v_code); + vm_page_free(m_code); + vm_page_free(m_pdp); + vm_page_free(m_pd); + vm_page_free(m_pt); + + /* + * Recursively map PML5 to itself in order to get PTmap and + * PDmap. + */ + v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; + + kernel_pmap->pm_cr3 = KPML5phys; + kernel_pmap->pm_pmltop = v_pml5; +} +SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); + /* * Initialize a vm_page's machine-dependent fields. 
*/ @@ -2190,7 +2431,8 @@ pmap_init(void) } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | + /* XXXKIB la57 */ + kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } @@ -3566,44 +3808,57 @@ pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { + pml5_entry_t *pml5; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* * unmap the page table page */ - if (m->pindex >= NUPDE + NUPDPE) { + if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { + /* PML4 page */ + MPASS(pmap_is_la57(pmap)); + pml5 = pmap_pml5e(pmap, va); + *pml5 = 0; + if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { + pml5 = pmap_pml5e_u(pmap, va); + *pml5 = 0; + } + } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ - pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; - if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { - pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + va <= VM_MAXUSER_ADDRESS) { + pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ - pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ - pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ - vm_page_t pdpg; - pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ - vm_page_t pdppg; - pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); + } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { + /* We just released a PDP, unhold the matching PML4 */ + pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pml4pg, free); } /* @@ -3659,9 +3914,9 @@ pmap_pinit0(pmap_t pmap) int i; PMAP_LOCK_INIT(pmap); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - pmap->pm_pml4u = NULL; - pmap->pm_cr3 = KPML4phys; + pmap->pm_pmltop = kernel_pmap->pm_pmltop; + pmap->pm_pmltopu = NULL; + pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; @@ -3714,18 +3969,59 @@ pmap_pinit_pml4(vm_page_t pml4pg) /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) - pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; +} + +void +pmap_pinit_pml5(vm_page_t pml5pg) +{ + pml5_entry_t *pm_pml5; + + pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mappings into level 5 table. + */ + pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + + /* + * Install self-referential address mapping entry. 
+ */ + pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | + X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } static void -pmap_pinit_pml4_pti(vm_page_t pml4pg) +pmap_pinit_pml4_pti(vm_page_t pml4pgu) { - pml4_entry_t *pm_pml4; + pml4_entry_t *pm_pml4u; int i; - pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) - pm_pml4[i] = pti_pml4[i]; + pm_pml4u[i] = pti_pml4[i]; +} + +static void +pmap_pinit_pml5_pti(vm_page_t pml5pgu) +{ + pml5_entry_t *pm_pml5u; + + pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 pti + * table, entering all kernel mappings needed for usermode + * into level 5 table. + */ + pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = + pmap_kextract((vm_offset_t)pti_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } /* @@ -3735,29 +4031,30 @@ pmap_pinit_pml4_pti(vm_page_t pml4pg) int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg, pml4pgu; - vm_paddr_t pml4phys; + vm_page_t pmltop_pg, pmltop_pgu; + vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ - pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); - pml4phys = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); + pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); + CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; - pmap->pm_pml4u = NULL; + pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; - if ((pml4pg->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pml4); + if ((pmltop_pg->flags & PG_ZERO) == 0) + pagezero(pmap->pm_pmltop); /* * Do not install the host kernel mappings in the nested page @@ -3766,15 +4063,21 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) * Install minimal kernel mappings in PTI case. 
*/ if (pm_type == PT_X86) { - pmap->pm_cr3 = pml4phys; - pmap_pinit_pml4(pml4pg); + pmap->pm_cr3 = pmltop_phys; + if (pmap_is_la57(pmap)) + pmap_pinit_pml5(pmltop_pg); + else + pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { - pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); - pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( - VM_PAGE_TO_PHYS(pml4pgu)); - pmap_pinit_pml4_pti(pml4pgu); - pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); + pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(pmltop_pgu)); + if (pmap_is_la57(pmap)) + pmap_pinit_pml5_pti(pmltop_pgu); + else + pmap_pinit_pml4_pti(pmltop_pgu); + pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, @@ -3799,14 +4102,88 @@ pmap_pinit(pmap_t pmap) return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } +static pml4_entry_t * +pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_pindex_t pml5index; + pml5_entry_t *pml5; + pml4_entry_t *pml4; + vm_page_t pml4pg; + pt_entry_t PG_V; + bool allocated; + + if (!pmap_is_la57(pmap)) + return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); + + PG_V = pmap_valid_bit(pmap); + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + if ((*pml5 & PG_V) == 0) { + if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); + pml4 = &pml4[pmap_pml4e_index(va)]; + if ((*pml4 & PG_V) == 0) { + pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); + if (allocated && !addref) + pml4pg->ref_count--; + else if (!allocated && addref) + pml4pg->ref_count++; + } + return (pml4); +} + +static pdp_entry_t * +pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_page_t pdppg; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pt_entry_t PG_V; + bool allocated; + + PG_V = pmap_valid_bit(pmap); + + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); + if (pml4 == NULL) + return (NULL); + + if ((*pml4 & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); + pdp = &pdp[pmap_pdpe_index(va)]; + if ((*pdp & PG_V) == 0) { + pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); + if (allocated && !addref) + pdppg->ref_count--; + else if (!allocated && addref) + pdppg->ref_count++; + } + return (pdp); +} + /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * - * Note: If a page allocation fails at page table level two or three, - * one or two pages may be held during the wait, only to be released + * Note: If a page allocation fails at page table level two, three, or four, + * up to three pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. * @@ -3823,20 +4200,35 @@ pmap_pinit(pmap_t pmap) * - for the page directory pointer page, * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + * NPML4EPGSHIFT), - * i.e. 
index of pml4e is put after the last index of PDPE. + * i.e. index of pml4e is put after the last index of PDPE, + * - for the PML4 page (if LA57 mode is enabled), + * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> + * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), + * i.e. index of pml5e is put after the last index of PML4E. * * Define an order on the paging entries, where all entries of the * same height are put together, then heights are put from deepest to * root. Then ptexpindex is the sequential number of the * corresponding paging entry in this order. * - * The root page at PML4 does not participate in this indexing scheme, since - * it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). + * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of + * LA57 paging structures even in LA48 paging mode. Moreover, the + * ptepindexes are calculated as if the paging structures were 5-level + * regardless of the actual mode of operation. + * + * The root page at PML4/PML5 does not participate in this indexing scheme, + * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). */ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, + vm_offset_t va __unused) { - vm_page_t m, pdppg, pdpg; + vm_pindex_t pml5index, pml4index; + pml5_entry_t *pml5, *pml5u; + pml4_entry_t *pml4, *pml4u; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t m, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -3872,16 +4264,38 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) * Map the pagetable page into the process address space, if * it isn't already there. 
*/ + if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { + MPASS(pmap_is_la57(pmap)); + + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + KASSERT((*pml5 & PG_V) == 0, + ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); + *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (ptepindex >= (NUPDE + NUPDPE)) { - pml4_entry_t *pml4, *pml4u; - vm_pindex_t pml4index; + if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) + *pml5 |= pg_nx; + pml5u = &pmap->pm_pmltopu[pml5index]; + *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | + PG_A | PG_M; + } + } else if (ptepindex >= NUPDE + NUPDPE) { + pml4index = pmap_pml4e_index(va); /* Wire up a new PDPE page */ - pml4index = ptepindex - (NUPDE + NUPDPE); - pml4 = &pmap->pm_pml4[pml4index]; + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); + if (pml4 == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + KASSERT((*pml4 & PG_V) == 0, + ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { + + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that @@ -3892,85 +4306,48 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; - pml4u = &pmap->pm_pml4u[pml4index]; + pml4u = &pmap->pm_pmltopu[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } - } else if (ptepindex >= NUPDE) { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - /* Wire up a new PDE page */ - pdpindex = ptepindex - NUPDE; - pml4index = pdpindex >> NPML4EPGSHIFT; - - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - /* Have to allocate a new pdp, recurse */ - if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to pdp page */ - pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - pdppg->ref_count++; + pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - - /* Now find the pdp page */ - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; + KASSERT((*pdp & PG_V) == 0, + ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - } else { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - pd_entry_t *pd; - /* Wire up a new PTE page */ - pdpindex = ptepindex >> NPDPEPGSHIFT; - pml4index = pdpindex >> NPML4EPGSHIFT; - - /* First, find the pdp and check that its valid. 
*/ - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { + pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { + if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va), + lockp, va) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; - if ((*pdp & PG_V) == 0) { - /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to the pd page */ - pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); - pdpg->ref_count++; - } + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); + pdpg->ref_count++; } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ - pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; + pd = &pd[pmap_pde_index(va)]; + KASSERT((*pd & PG_V) == 0, + ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } @@ -4003,7 +4380,7 @@ pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. */ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va); if (pdpg == NULL) { if (lockp != NULL) goto retry; @@ -4064,7 +4441,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) * Here if the pte page isn't mapped, or if it has been * deallocated. 
*/ - m = _pmap_allocpte(pmap, ptepindex, lockp); + m = _pmap_allocpte(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) goto retry; } @@ -4088,28 +4465,35 @@ pmap_release(pmap_t pmap) int i; KASSERT(pmap->pm_stats.resident_count == 0, - ("pmap_release: pmap resident count %ld != 0", - pmap->pm_stats.resident_count)); + ("pmap_release: pmap %p resident count %ld != 0", + pmap, pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), - ("pmap_release: pmap has reserved page table page(s)")); + ("pmap_release: pmap %p has reserved page table page(s)", + pmap)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); - for (i = 0; i < NKPML4E; i++) /* KVA */ - pmap->pm_pml4[KPML4BASE + i] = 0; - for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ - pmap->pm_pml4[DMPML4I + i] = 0; - pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ - for (i = 0; i < lm_ents; i++) /* Large Map */ - pmap->pm_pml4[LMSPML4I + i] = 0; + if (pmap_is_la57(pmap)) { + pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; + pmap->pm_pmltop[PML5PML5I] = 0; + } else { + for (i = 0; i < NKPML4E; i++) /* KVA */ + pmap->pm_pmltop[KPML4BASE + i] = 0; + for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ + pmap->pm_pmltop[DMPML4I + i] = 0; + pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ + for (i = 0; i < lm_ents; i++) /* Large Map */ + pmap->pm_pmltop[LMSPML4I + i] = 0; + } vm_page_unwire_noq(m); vm_page_free_zero(m); - if (pmap->pm_pml4u != NULL) { - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); + if (pmap->pm_pmltopu != NULL) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> + pm_pmltopu)); vm_page_unwire_noq(m); vm_page_free(m); } @@ -5448,6 +5832,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; + pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; @@ -5490,7 +5875,18 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) if (pmap->pm_stats.resident_count == 0) break; - pml4e = pmap_pml4e(pmap, sva); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, sva); + if ((*pml5e & PG_V) == 0) { + va_next = (sva + NBPML5) & ~PML5MASK; + if (va_next < sva) + va_next = eva; + continue; + } + pml4e = pmap_pml5e_to_pml4e(pml5e, sva); + } else { + pml4e = pmap_pml4e(pmap, sva); + } if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) @@ -6110,7 +6506,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), - nosleep ? NULL : &lock); + nosleep ? NULL : &lock, va); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; @@ -6593,7 +6989,8 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. 
*/ - mpte = _pmap_allocpte(pmap, ptepindex, NULL); + mpte = _pmap_allocpte(pmap, ptepindex, NULL, + va); if (mpte == NULL) return (mpte); } @@ -9346,11 +9743,11 @@ pmap_large_map_pdpe(vm_offset_t va) ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, + KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); - mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; + mphys = kernel_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } @@ -10425,7 +10822,9 @@ sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ - range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); + range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1); } /* @@ -10519,7 +10918,9 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ - range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); + range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, + NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap @@ -10549,7 +10950,7 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS) sva |= -1ul << 48; restart: - pml4e = kernel_pmap->pm_pml4[i]; + pml4e = kernel_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); @@ -10632,6 +11033,7 @@ SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; + pml5_entry_t *pml5; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; @@ -10650,8 +11052,20 @@ DB_SHOW_COMMAND(pte, pmap_print_pte) pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); - pml4 = pmap_pml4e(pmap, va); - db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4); + db_printf("VA 0x%016lx", va); + + if (pmap_is_la57(pmap)) { + pml5 = pmap_pml5e(pmap, va); + db_printf(" pml5e 0x%016lx", *pml5); + if ((*pml5 & PG_V) == 0) { + db_printf("\n"); + return; + } + pml4 = pmap_pml5e_to_pml4e(pml5, va); + } else { + pml4 = pmap_pml4e(pmap, va); + } + db_printf(" pml4e 0x%016lx", *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; @@ -10683,4 +11097,95 @@ DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) db_printf("show phys2dmap addr\n"); } } + +static void +ptpages_show_page(int level, int idx, vm_page_t pg) +{ + db_printf("l %d i %d pg %p phys %#lx ref %x\n", + level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); +} + +static void +ptpages_show_complain(int level, int idx, uint64_t pte) +{ + db_printf("l %d i %d pte %#lx\n", level, idx, pte); +} + +static void +ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) +{ + vm_page_t pg3, pg2, pg1; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + int i4, i3, i2; + + pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); + for (i4 = 0; i4 < num_entries; i4++) { + if ((pml4[i4] & PG_V) == 0) + continue; + pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); + if (pg3 == NULL) { + ptpages_show_complain(3, i4, pml4[i4]); + continue; + } + ptpages_show_page(3, i4, pg3); + pdp = 
(pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); + for (i3 = 0; i3 < NPDPEPG; i3++) { + if ((pdp[i3] & PG_V) == 0) + continue; + pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); + if (pg3 == NULL) { + ptpages_show_complain(2, i3, pdp[i3]); + continue; + } + ptpages_show_page(2, i3, pg2); + pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); + for (i2 = 0; i2 < NPDEPG; i2++) { + if ((pd[i2] & PG_V) == 0) + continue; + pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); + if (pg1 == NULL) { + ptpages_show_complain(1, i2, pd[i2]); + continue; + } + ptpages_show_page(1, i2, pg1); + } + } + } +} + +DB_SHOW_COMMAND(ptpages, pmap_ptpages) +{ + pmap_t pmap; + vm_page_t pg; + pml5_entry_t *pml5; + uint64_t PG_V; + int i5; + + if (have_addr) + pmap = (pmap_t)addr; + else + pmap = PCPU_GET(curpmap); + + PG_V = pmap_valid_bit(pmap); + + if (pmap_is_la57(pmap)) { + pml5 = pmap->pm_pmltop; + for (i5 = 0; i5 < NUPML5E; i5++) { + if ((pml5[i5] & PG_V) == 0) + continue; + pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); + if (pg == NULL) { + ptpages_show_complain(4, i5, pml5[i5]); + continue; + } + ptpages_show_page(4, i5, pg); + ptpages_show_pml4(pg, NPML4EPG, PG_V); + } + } else { + ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( + (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); + } +} #endif diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h index 8d866fee846..9a550d7024f 100644 --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -46,6 +46,8 @@ extern int syscall_ret_l1d_flush_mode; extern vm_paddr_t intel_graphics_stolen_base; extern vm_paddr_t intel_graphics_stolen_size; +extern int la57; + /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its * value is the physical address at which the kernel is loaded. diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h index ac3df693e4d..2bd4d913a7b 100644 --- a/sys/amd64/include/param.h +++ b/sys/amd64/include/param.h @@ -118,6 +118,12 @@ #define PML4SHIFT 39 /* LOG2(NBPML4) */ #define NBPML4 (1UL<> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } +static __inline vm_pindex_t +pmap_pml5e_index(vm_offset_t va) +{ + + return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1)); +} + #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h index 75f357c3a53..e74f1626a56 100644 --- a/sys/amd64/include/proc.h +++ b/sys/amd64/include/proc.h @@ -84,6 +84,8 @@ struct mdproc { }; #define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */ +#define P_MD_LA48 0x00000002 /* Request LA48 after exec */ +#define P_MD_LA57 0x00000004 /* Request LA57 after exec */ #define KINFO_PROC_SIZE 1088 #define KINFO_PROC32_SIZE 768 diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h index 2fe349e0beb..64eed576035 100644 --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -169,25 +169,32 @@ * 0xffffffff80000000 KERNBASE */ -#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0) -#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \ +#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0) +#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \ NPDPEPG-1, NPDEPG-1, NPTEPG-1) -#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0) -#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0) +#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0) +#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0) -#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0) -#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0) +#define 
LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) +#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) -#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0) +#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0) -#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) -#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0) +#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) +#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0) -#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0) +#define VM_MAXUSER_ADDRESS_LA57 UVADDR(NUPML5E, 0, 0, 0, 0) +#define VM_MAXUSER_ADDRESS_LA48 UVADDR(0, NUP4ML4E, 0, 0, 0) +#define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS_LA57 -#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) -#define USRSTACK SHAREDPAGE +#define SHAREDPAGE_LA57 (VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE) +#define SHAREDPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE) +#define USRSTACK_LA57 SHAREDPAGE_LA57 +#define USRSTACK_LA48 SHAREDPAGE_LA48 +#define USRSTACK USRSTACK_LA48 +#define PS_STRINGS_LA57 (USRSTACK_LA57 - sizeof(struct ps_strings)) +#define PS_STRINGS_LA48 (USRSTACK_LA48 - sizeof(struct ps_strings)) #define VM_MAX_ADDRESS UPT_MAX_ADDRESS #define VM_MIN_ADDRESS (0) diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c index 81ccbd75b5c..bb80f324868 100644 --- a/sys/amd64/linux/linux_sysvec.c +++ b/sys/amd64/linux/linux_sysvec.c @@ -739,9 +739,9 @@ struct sysentvec elf_linux_sysvec = { .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, @@ -752,7 +752,7 @@ struct sysentvec elf_linux_sysvec = { .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index f9660024fe0..3b26de3d00f 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -560,7 +560,7 @@ svm_vminit(struct vm *vm, pmap_t pmap) panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; - svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop); /* * Intercept read and write accesses to all MSRs. 
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index ddfada8a608..3fc6ccf28b6 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -1030,7 +1030,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap) } vmx->vm = vm; - vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop)); /* * Clean up EPTP-tagged guest physical and combined mappings diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c index cf24e6adae3..924a59b3d65 100644 --- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c +++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -131,7 +132,7 @@ dtrace_invop_uninit(void) void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { - (*func)(0, (uintptr_t) addr_PTmap); + (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap); } void -- 2.45.0