]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/x86/x86/pvclock.c
pvclock: Add vDSO support
[FreeBSD/FreeBSD.git] / sys / x86 / x86 / pvclock.c
1 /*-
2  * Copyright (c) 2009 Adrian Chadd
3  * Copyright (c) 2012 Spectra Logic Corporation
4  * Copyright (c) 2014 Bryan Venteicher
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/bus.h>
35 #include <sys/clock.h>
36 #include <sys/conf.h>
37 #include <sys/fcntl.h>
38 #include <sys/limits.h>
39 #include <sys/mman.h>
40 #include <sys/proc.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/vdso.h>
44
45 #include <vm/vm.h>
46 #include <vm/pmap.h>
47
48 #include <machine/atomic.h>
49 #include <machine/cpufunc.h>
50 #include <machine/md_var.h>
51 #include <machine/pvclock.h>
52
53 /*
54  * Last system time. This is used to guarantee a monotonically non-decreasing
55  * clock for the kernel codepath and approximate the same for the vDSO codepath.
56  * In theory, this should be unnecessary absent hypervisor bug(s) and/or what
57  * should be rare cases where TSC jitter may still be visible despite the
58  * hypervisor's best efforts.
59  */
60 static volatile uint64_t pvclock_last_systime;
61
62 static uint64_t          pvclock_getsystime(struct pvclock *pvc);
63 static void              pvclock_read_time_info(
64     struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags);
65 static void              pvclock_read_wall_clock(struct pvclock_wall_clock *wc,
66     struct timespec *ts);
67 static u_int             pvclock_tc_get_timecount(struct timecounter *tc);
68 static uint32_t          pvclock_tc_vdso_timehands(
69     struct vdso_timehands *vdso_th, struct timecounter *tc);
70 #ifdef COMPAT_FREEBSD32
71 static uint32_t          pvclock_tc_vdso_timehands32(
72     struct vdso_timehands32 *vdso_th, struct timecounter *tc);
73 #endif
74
75 static d_open_t          pvclock_cdev_open;
76 static d_mmap_t          pvclock_cdev_mmap;
77
78 static struct cdevsw     pvclock_cdev_cdevsw = {
79         .d_version =    D_VERSION,
80         .d_name =       PVCLOCK_CDEVNAME,
81         .d_open =       pvclock_cdev_open,
82         .d_mmap =       pvclock_cdev_mmap,
83 };
84
85 void
86 pvclock_resume(void)
87 {
88         atomic_store_rel_64(&pvclock_last_systime, 0);
89 }
90
91 uint64_t
92 pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
93 {
94         uint64_t freq;
95
96         freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
97         if (ti->tsc_shift < 0)
98                 freq <<= -ti->tsc_shift;
99         else
100                 freq >>= ti->tsc_shift;
101         return (freq);
102 }
103
104 static void
105 pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
106     uint64_t *ns, uint8_t *flags)
107 {
108         uint64_t delta;
109         uint32_t version;
110
111         do {
112                 version = atomic_load_acq_32(&ti->version);
113                 delta = rdtsc_ordered() - ti->tsc_timestamp;
114                 *ns = ti->system_time + pvclock_scale_delta(delta,
115                     ti->tsc_to_system_mul, ti->tsc_shift);
116                 *flags = ti->flags;
117                 atomic_thread_fence_acq();
118         } while ((ti->version & 1) != 0 || ti->version != version);
119 }
120
121 static void
122 pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts)
123 {
124         uint32_t version;
125
126         do {
127                 version = atomic_load_acq_32(&wc->version);
128                 ts->tv_sec = wc->sec;
129                 ts->tv_nsec = wc->nsec;
130                 atomic_thread_fence_acq();
131         } while ((wc->version & 1) != 0 || wc->version != version);
132 }
133
134 static uint64_t
135 pvclock_getsystime(struct pvclock *pvc)
136 {
137         uint64_t now, last, ret;
138         uint8_t flags;
139
140         critical_enter();
141         pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags);
142         ret = now;
143         if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
144                 last = atomic_load_acq_64(&pvclock_last_systime);
145                 do {
146                         if (last > now) {
147                                 ret = last;
148                                 break;
149                         }
150                 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
151                     now));
152         }
153         critical_exit();
154         return (ret);
155 }
156
157 /*
158  * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
159  * has been migrated to the 'struct pvclock' API.
160  */
161 uint64_t
162 pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
163 {
164         uint64_t now, last, ret;
165         uint8_t flags;
166
167         pvclock_read_time_info(ti, &now, &flags);
168         ret = now;
169         if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
170                 last = atomic_load_acq_64(&pvclock_last_systime);
171                 do {
172                         if (last > now) {
173                                 ret = last;
174                                 break;
175                         }
176                 } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
177                     now));
178         }
179         return (ret);
180 }
181
/*
 * Public wrapper that reads the wall clock 'wc' into 'ts' by delegating to
 * pvclock_read_wall_clock().
 *
 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
 * has been migrated to the 'struct pvclock' API.
 */
void
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
{

        pvclock_read_wall_clock(wc, ts);
}
191
192 static int
193 pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
194 {
195         if (oflags & FWRITE)
196                 return (EPERM);
197         return (0);
198 }
199
200 static int
201 pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
202     int nprot, vm_memattr_t *memattr)
203 {
204         if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info))
205                 return (EINVAL);
206         if (PROT_EXTRACT(nprot) != PROT_READ)
207                 return (EACCES);
208         *paddr = vtophys((uintptr_t)dev->si_drv1 + offset);
209         *memattr = VM_MEMATTR_DEFAULT;
210         return (0);
211 }
212
213 static u_int
214 pvclock_tc_get_timecount(struct timecounter *tc)
215 {
216         struct pvclock *pvc = tc->tc_priv;
217
218         return (pvclock_getsystime(pvc) & UINT_MAX);
219 }
220
221 static uint32_t
222 pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
223     struct timecounter *tc)
224 {
225         struct pvclock *pvc = tc->tc_priv;
226
227         vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
228         vdso_th->th_x86_shift = 0;
229         vdso_th->th_x86_hpet_idx = 0;
230         vdso_th->th_x86_pvc_last_systime =
231             atomic_load_acq_64(&pvclock_last_systime);
232         vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
233             pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
234         bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
235         return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP);
236 }
237
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit compat counterpart of the vDSO timehands fill hook: populates the
 * same pvclock-specific members, but in a 'struct vdso_timehands32'.
 *
 * The stable mask is PVCLOCK_FLAG_TSC_STABLE only when the stable flag is
 * supported and 'vdso_force_unstable' is not set.  Returns 1 when the cdev
 * exists and AMDID_RDTSCP is set in amd_feature, 0 otherwise.
 */
static uint32_t
pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
    struct timecounter *tc)
{
        struct pvclock *pvc;

        pvc = tc->tc_priv;

        vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
        vdso_th->th_x86_shift = 0;
        vdso_th->th_x86_hpet_idx = 0;
        vdso_th->th_x86_pvc_last_systime =
            atomic_load_acq_64(&pvclock_last_systime);
        if (!pvc->vdso_force_unstable && pvc->stable_flag_supported)
                vdso_th->th_x86_pvc_stable_mask = PVCLOCK_FLAG_TSC_STABLE;
        else
                vdso_th->th_x86_pvc_stable_mask = 0;
        bzero(vdso_th->th_res, sizeof(vdso_th->th_res));

        return (pvc->cdev != NULL && (amd_feature & AMDID_RDTSCP) != 0);
}
#endif
256
257 void
258 pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
259 {
260         struct timespec system_ts;
261         uint64_t system_ns;
262
263         pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts);
264         system_ns = pvclock_getsystime(pvc);
265         system_ts.tv_sec = system_ns / 1000000000ULL;
266         system_ts.tv_nsec = system_ns % 1000000000ULL;
267         timespecadd(ts, &system_ts, ts);
268 }
269
270 void
271 pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
272     int tc_quality, u_int tc_flags)
273 {
274         struct make_dev_args mda;
275         int err;
276
277         KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0,
278             ("Specified time info page(s) address is not page-aligned."));
279
280         /* Set up vDSO stable-flag suppression test facility: */
281         pvc->vdso_force_unstable = false;
282         SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
283             SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
284             "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0,
285             "Forcibly deassert stable flag in vDSO codepath");
286
287         /* Set up timecounter and timecounter-supporting members: */
288         pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
289         pvc->tc.tc_poll_pps = NULL;
290         pvc->tc.tc_counter_mask = ~0U;
291         pvc->tc.tc_frequency = 1000000000ULL;
292         pvc->tc.tc_name = tc_name;
293         pvc->tc.tc_quality = tc_quality;
294         pvc->tc.tc_flags = tc_flags;
295         pvc->tc.tc_priv = pvc;
296         pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
297 #ifdef COMPAT_FREEBSD32
298         pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
299 #endif
300
301         /* Set up cdev for userspace mmapping of vCPU 0 time info page: */
302         make_dev_args_init(&mda);
303         mda.mda_devsw = &pvclock_cdev_cdevsw;
304         mda.mda_uid = UID_ROOT;
305         mda.mda_gid = GID_WHEEL;
306         mda.mda_mode = 0444;
307         mda.mda_si_drv1 = pvc->timeinfos;
308         err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
309         if (err != 0) {
310                 device_printf(dev, "Could not create /dev/%s, error %d. Fast "
311                     "time of day will be unavailable for this timecounter.\n",
312                     PVCLOCK_CDEVNAME, err);
313                 KASSERT(pvc->cdev == NULL,
314                     ("Failed make_dev_s() unexpectedly inited cdev."));
315         }
316
317         /* Register timecounter: */
318         tc_init(&pvc->tc);
319
320         /*
321          * Register wallclock:
322          *     The RTC registration API expects a resolution in microseconds;
323          *     pvclock's 1ns resolution is rounded up to 1us.
324          */
325         clock_register(dev, 1);
326 }
327
/*
 * Tear down a pvclock instance.
 *
 * Always fails with EBUSY: teardown is not currently possible because
 * 'tc_init()' has no counterpart for unregistering a timecounter.  'pvc' is
 * not examined.
 */
int
pvclock_destroy(struct pvclock *pvc)
{

        return (EBUSY);
}