From d4b2d3035a23d5dc468d41151487a8299bf45cdc Mon Sep 17 00:00:00 2001 From: Adam Fenn Date: Sat, 7 Aug 2021 13:10:04 -0700 Subject: [PATCH] pvclock: Add vDSO support Add vDSO support for timekeeping devices that support the KVM/XEN paravirtual clock API. Also, expose, in the userspace-accessible 'pvclock.h' header (sys/x86/include/pvclock.h), definitions that will be needed by 'libc' to support 'VDSO_TH_ALGO_X86_PVCLK'. Sponsored by: Juniper Networks, Inc. Sponsored by: Klara, Inc. Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D31418 --- sys/dev/acpica/acpi_hpet.c | 4 + sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c | 2 + sys/x86/include/pvclock.h | 62 ++++++++ sys/x86/include/vdso.h | 5 +- sys/x86/x86/pvclock.c | 165 +++++++++++++------- sys/x86/x86/tsc.c | 4 + 6 files changed, 188 insertions(+), 54 deletions(-) diff --git a/sys/dev/acpica/acpi_hpet.c b/sys/dev/acpica/acpi_hpet.c index 9f92521437f..0f0a16f336f 100644 --- a/sys/dev/acpica/acpi_hpet.c +++ b/sys/dev/acpica/acpi_hpet.c @@ -156,6 +156,8 @@ hpet_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) vdso_th->th_algo = VDSO_TH_ALGO_X86_HPET; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = device_get_unit(sc->dev); + vdso_th->th_x86_pvc_last_systime = 0; + vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (sc->mmap_allow != 0); } @@ -171,6 +173,8 @@ hpet_vdso_timehands32(struct vdso_timehands32 *vdso_th32, vdso_th32->th_algo = VDSO_TH_ALGO_X86_HPET; vdso_th32->th_x86_shift = 0; vdso_th32->th_x86_hpet_idx = device_get_unit(sc->dev); + vdso_th32->th_x86_pvc_last_systime = 0; + vdso_th32->th_x86_pvc_stable_mask = 0; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (sc->mmap_allow != 0); } diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c index c0809813880..11d549dc18d 100644 --- a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c +++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c @@ -128,6 +128,8 @@ 
hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th, vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = 0; + vdso_th->th_x86_pvc_last_systime = 0; + vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } diff --git a/sys/x86/include/pvclock.h b/sys/x86/include/pvclock.h index 399017039dd..023acdb80d9 100644 --- a/sys/x86/include/pvclock.h +++ b/sys/x86/include/pvclock.h @@ -30,7 +30,12 @@ #define X86_PVCLOCK #include + +#ifdef _KERNEL #include +#endif /* _KERNEL */ + +#define PVCLOCK_CDEVNAME "pvclock" struct pvclock_vcpu_time_info { uint32_t version; @@ -46,6 +51,59 @@ struct pvclock_vcpu_time_info { #define PVCLOCK_FLAG_TSC_STABLE 0x01 #define PVCLOCK_FLAG_GUEST_PASUED 0x02 +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline uint64_t +pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) +{ + uint64_t product; + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; +#if defined(__i386__) + { + uint32_t tmp1, tmp2; + + /** + * For i386, the formula looks like: + * + * lower = (mul_frac * (delta & UINT_MAX)) >> 32 + * upper = mul_frac * (delta >> 32) + * product = lower + upper + */ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), + "2" (mul_frac) ); + } +#elif defined(__amd64__) + { + unsigned long tmp; + + __asm__ ( + "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" + : [lo]"=a" (product), [hi]"=d" (tmp) + : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac)); + } +#else +#error "pvclock: unsupported x86 architecture?" 
+#endif + return (product); +} + +#ifdef _KERNEL + typedef struct pvclock_wall_clock *pvclock_get_wallclock_t(void *arg); struct pvclock_wall_clock { @@ -62,7 +120,9 @@ struct pvclock { bool stable_flag_supported; /* Private; initialized by the 'pvclock' API: */ + bool vdso_force_unstable; struct timecounter tc; + struct cdev *cdev; }; /* @@ -81,4 +141,6 @@ void pvclock_init(struct pvclock *pvc, device_t dev, void pvclock_gettime(struct pvclock *pvc, struct timespec *ts); int pvclock_destroy(struct pvclock *pvc); +#endif /* _KERNEL */ + #endif diff --git a/sys/x86/include/vdso.h b/sys/x86/include/vdso.h index 97972c660dd..ace63cbe9f6 100644 --- a/sys/x86/include/vdso.h +++ b/sys/x86/include/vdso.h @@ -37,11 +37,14 @@ #define VDSO_TIMEHANDS_MD \ uint32_t th_x86_shift; \ uint32_t th_x86_hpet_idx; \ - uint32_t th_res[6]; + uint64_t th_x86_pvc_last_systime;\ + uint8_t th_x86_pvc_stable_mask; \ + uint8_t th_res[15]; #define VDSO_TH_ALGO_X86_TSC VDSO_TH_ALGO_1 #define VDSO_TH_ALGO_X86_HPET VDSO_TH_ALGO_2 #define VDSO_TH_ALGO_X86_HVTSC VDSO_TH_ALGO_3 /* Hyper-V ref. 
TSC */ +#define VDSO_TH_ALGO_X86_PVCLK VDSO_TH_ALGO_4 /* KVM/XEN paravirtual clock */ #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 diff --git a/sys/x86/x86/pvclock.c b/sys/x86/x86/pvclock.c index e0ad65d906b..cc2377bdbcf 100644 --- a/sys/x86/x86/pvclock.c +++ b/sys/x86/x86/pvclock.c @@ -31,11 +31,22 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include +#include +#include #include +#include #include +#include +#include +#include + +#include +#include #include +#include #include #include @@ -54,6 +65,22 @@ static void pvclock_read_time_info( static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts); static u_int pvclock_tc_get_timecount(struct timecounter *tc); +static uint32_t pvclock_tc_vdso_timehands( + struct vdso_timehands *vdso_th, struct timecounter *tc); +#ifdef COMPAT_FREEBSD32 +static uint32_t pvclock_tc_vdso_timehands32( + struct vdso_timehands32 *vdso_th, struct timecounter *tc); +#endif + +static d_open_t pvclock_cdev_open; +static d_mmap_t pvclock_cdev_mmap; + +static struct cdevsw pvclock_cdev_cdevsw = { + .d_version = D_VERSION, + .d_name = PVCLOCK_CDEVNAME, + .d_open = pvclock_cdev_open, + .d_mmap = pvclock_cdev_mmap, +}; void pvclock_resume(void) @@ -74,57 +101,6 @@ pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) return (freq); } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. 
- */ -static inline uint64_t -pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) -{ - uint64_t product; - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; -#if defined(__i386__) - { - uint32_t tmp1, tmp2; - - /** - * For i386, the formula looks like: - * - * lower = (mul_frac * (delta & UINT_MAX)) >> 32 - * upper = mul_frac * (delta >> 32) - * product = lower + upper - */ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), - "2" (mul_frac) ); - } -#elif defined(__amd64__) - { - unsigned long tmp; - - __asm__ ( - "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" - : [lo]"=a" (product), [hi]"=d" (tmp) - : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac)); - } -#else -#error "pvclock: unsupported x86 architecture?" -#endif - return (product); -} - static void pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags) @@ -213,6 +189,27 @@ pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) pvclock_read_wall_clock(wc, ts); } +static int +pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + if (oflags & FWRITE) + return (EPERM); + return (0); +} + +static int +pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, + int nprot, vm_memattr_t *memattr) +{ + if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) + return (EINVAL); + if (PROT_EXTRACT(nprot) != PROT_READ) + return (EACCES); + *paddr = vtophys((uintptr_t)dev->si_drv1 + offset); + *memattr = VM_MEMATTR_DEFAULT; + return (0); +} + static u_int pvclock_tc_get_timecount(struct timecounter *tc) { @@ -221,6 +218,42 @@ pvclock_tc_get_timecount(struct timecounter *tc) return (pvclock_getsystime(pvc) & UINT_MAX); } +static uint32_t +pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th, + struct 
timecounter *tc) +{ + struct pvclock *pvc = tc->tc_priv; + + vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; + vdso_th->th_x86_shift = 0; + vdso_th->th_x86_hpet_idx = 0; + vdso_th->th_x86_pvc_last_systime = + atomic_load_acq_64(&pvclock_last_systime); + vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && + pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; + bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); + return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP); +} + +#ifdef COMPAT_FREEBSD32 +static uint32_t +pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th, + struct timecounter *tc) +{ + struct pvclock *pvc = tc->tc_priv; + + vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; + vdso_th->th_x86_shift = 0; + vdso_th->th_x86_hpet_idx = 0; + vdso_th->th_x86_pvc_last_systime = + atomic_load_acq_64(&pvclock_last_systime); + vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && + pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; + bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); + return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP); +} +#endif + void pvclock_gettime(struct pvclock *pvc, struct timespec *ts) { @@ -238,9 +271,19 @@ void pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name, int tc_quality, u_int tc_flags) { + struct make_dev_args mda; + int err; + KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0, ("Specified time info page(s) address is not page-aligned.")); + /* Set up vDSO stable-flag suppression test facility: */ + pvc->vdso_force_unstable = false; + SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), + SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, + "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0, + "Forcibly deassert stable flag in vDSO codepath"); + /* Set up timecounter and timecounter-supporting members: */ pvc->tc.tc_get_timecount = pvclock_tc_get_timecount; pvc->tc.tc_poll_pps = NULL; @@ -250,11 +293,27 @@ pvclock_init(struct pvclock *pvc, device_t dev, const char 
*tc_name, pvc->tc.tc_quality = tc_quality; pvc->tc.tc_flags = tc_flags; pvc->tc.tc_priv = pvc; - pvc->tc.tc_fill_vdso_timehands = NULL; + pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands; #ifdef COMPAT_FREEBSD32 - pvc->tc.tc_fill_vdso_timehands32 = NULL; + pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32; #endif + /* Set up cdev for userspace mmapping of vCPU 0 time info page: */ + make_dev_args_init(&mda); + mda.mda_devsw = &pvclock_cdev_cdevsw; + mda.mda_uid = UID_ROOT; + mda.mda_gid = GID_WHEEL; + mda.mda_mode = 0444; + mda.mda_si_drv1 = pvc->timeinfos; + err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME); + if (err != 0) { + device_printf(dev, "Could not create /dev/%s, error %d. Fast " + "time of day will be unavailable for this timecounter.\n", + PVCLOCK_CDEVNAME, err); + KASSERT(pvc->cdev == NULL, + ("Failed make_dev_s() unexpectedly inited cdev.")); + } + /* Register timecounter: */ tc_init(&pvc->tc); diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index 5ffbb64229e..0ebcea895cd 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -870,6 +870,8 @@ x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th->th_x86_hpet_idx = 0xffffffff; + vdso_th->th_x86_pvc_last_systime = 0; + vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } @@ -883,6 +885,8 @@ x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th32->th_x86_hpet_idx = 0xffffffff; + vdso_th32->th_x86_pvc_last_systime = 0; + vdso_th32->th_x86_pvc_stable_mask = 0; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (1); } -- 2.45.0