]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/vmm/x86.c
Expose the MD_CLEAR capability used by Intel MDS mitigations to guests.
[FreeBSD/FreeBSD.git] / sys / amd64 / vmm / x86.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/sysctl.h>
38
39 #include <machine/clock.h>
40 #include <machine/cpufunc.h>
41 #include <machine/md_var.h>
42 #include <machine/segments.h>
43 #include <machine/specialreg.h>
44
45 #include <machine/vmm.h>
46
47 #include "vmm_host.h"
48 #include "vmm_ktr.h"
49 #include "vmm_util.h"
50 #include "x86.h"
51
/*
 * The _hw_vmm sysctl node is only declared here (it is defined in another
 * file); the "topology" child node beneath it is created by this file.
 */
SYSCTL_DECL(_hw_vmm);
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

/*
 * The only leaf in the 0x40000000 range that x86_emulate_cpuid() answers;
 * higher requests in that range are clamped down to it.
 */
#define CPUID_VM_HIGH           0x40000000

/*
 * Signature returned in %ebx/%ecx/%edx for leaf 0x40000000, four bytes per
 * register.  The string fills all 12 bytes, so no NUL terminator is stored.
 */
static const char bhyve_id[12] = "bhyve bhyve ";

/*
 * Counter bumped (atomically) each time x86_emulate_cpuid() falls through to
 * its pass-through default case.  Exported read-write as a hw.vmm sysctl.
 */
static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");

/*
 * On older kernels, expose the externally-defined threads_per_core and
 * cores_per_package variables under the topology node.
 */
#if __FreeBSD_version < 1200060 /* Remove after 11 EOL helps MFCing */
extern u_int threads_per_core;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
    &threads_per_core, 0, NULL);

extern u_int cores_per_package;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
    &cores_per_package, 0, NULL);
#endif

/*
 * When zero, CPUID leaf 0xB reports all-zero topology information for every
 * sub-leaf on Intel hosts (see the CPUID_0000_000B case below).
 */
static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);
76
77 /*
78  * Round up to the next power of two, if necessary, and then take log2.
79  * Returns -1 if argument is zero.
80  */
81 static __inline int
82 log2(u_int x)
83 {
84
85         return (fls(x << (1 - powerof2(x))) - 1);
86 }
87
88 int
89 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
90                   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
91 {
92         const struct xsave_limits *limits;
93         uint64_t cr4;
94         int error, enable_invpcid, level, width, x2apic_id;
95         unsigned int func, regs[4], logical_cpus;
96         enum x2apic_state x2apic_state;
97         uint16_t cores, maxcpus, sockets, threads;
98
99         VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
100
101         /*
102          * Requests for invalid CPUID levels should map to the highest
103          * available level instead.
104          */
105         if (cpu_exthigh != 0 && *eax >= 0x80000000) {
106                 if (*eax > cpu_exthigh)
107                         *eax = cpu_exthigh;
108         } else if (*eax >= 0x40000000) {
109                 if (*eax > CPUID_VM_HIGH)
110                         *eax = CPUID_VM_HIGH;
111         } else if (*eax > cpu_high) {
112                 *eax = cpu_high;
113         }
114
115         func = *eax;
116
117         /*
118          * In general the approach used for CPU topology is to
119          * advertise a flat topology where all CPUs are packages with
120          * no multi-core or SMT.
121          */
122         switch (func) {
123                 /*
124                  * Pass these through to the guest
125                  */
126                 case CPUID_0000_0000:
127                 case CPUID_0000_0002:
128                 case CPUID_0000_0003:
129                 case CPUID_8000_0000:
130                 case CPUID_8000_0002:
131                 case CPUID_8000_0003:
132                 case CPUID_8000_0004:
133                 case CPUID_8000_0006:
134                         cpuid_count(*eax, *ecx, regs);
135                         break;
136                 case CPUID_8000_0008:
137                         cpuid_count(*eax, *ecx, regs);
138                         if (vmm_is_amd()) {
139                                 /*
140                                  * As on Intel (0000_0007:0, EDX), mask out
141                                  * unsupported or unsafe AMD extended features
142                                  * (8000_0008 EBX).
143                                  */
144                                 regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
145                                     AMDFEID_XSAVEERPTR);
146
147                                 vm_get_topology(vm, &sockets, &cores, &threads,
148                                     &maxcpus);
149                                 /*
150                                  * Here, width is ApicIdCoreIdSize, present on
151                                  * at least Family 15h and newer.  It
152                                  * represents the "number of bits in the
153                                  * initial apicid that indicate thread id
154                                  * within a package."
155                                  *
156                                  * Our topo_probe_amd() uses it for
157                                  * pkg_id_shift and other OSes may rely on it.
158                                  */
159                                 width = MIN(0xF, log2(threads * cores));
160                                 if (width < 0x4)
161                                         width = 0;
162                                 logical_cpus = MIN(0xFF, threads * cores - 1);
163                                 regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
164                         }
165                         break;
166
167                 case CPUID_8000_0001:
168                         cpuid_count(*eax, *ecx, regs);
169
170                         /*
171                          * Hide SVM from guest.
172                          */
173                         regs[2] &= ~AMDID2_SVM;
174
175                         /*
176                          * Don't advertise extended performance counter MSRs
177                          * to the guest.
178                          */
179                         regs[2] &= ~AMDID2_PCXC;
180                         regs[2] &= ~AMDID2_PNXC;
181                         regs[2] &= ~AMDID2_PTSCEL2I;
182
183                         /*
184                          * Don't advertise Instruction Based Sampling feature.
185                          */
186                         regs[2] &= ~AMDID2_IBS;
187
188                         /* NodeID MSR not available */
189                         regs[2] &= ~AMDID2_NODE_ID;
190
191                         /* Don't advertise the OS visible workaround feature */
192                         regs[2] &= ~AMDID2_OSVW;
193
194                         /* Hide mwaitx/monitorx capability from the guest */
195                         regs[2] &= ~AMDID2_MWAITX;
196
197                         /*
198                          * Hide rdtscp/ia32_tsc_aux until we know how
199                          * to deal with them.
200                          */
201                         regs[3] &= ~AMDID_RDTSCP;
202                         break;
203
204                 case CPUID_8000_0007:
205                         /*
206                          * AMD uses this leaf to advertise the processor's
207                          * power monitoring and RAS capabilities. These
208                          * features are hardware-specific and exposing
209                          * them to a guest doesn't make a lot of sense.
210                          *
211                          * Intel uses this leaf only to advertise the
212                          * "Invariant TSC" feature with all other bits
213                          * being reserved (set to zero).
214                          */
215                         regs[0] = 0;
216                         regs[1] = 0;
217                         regs[2] = 0;
218                         regs[3] = 0;
219
220                         /*
221                          * "Invariant TSC" can be advertised to the guest if:
222                          * - host TSC frequency is invariant
223                          * - host TSCs are synchronized across physical cpus
224                          *
225                          * XXX This still falls short because the vcpu
226                          * can observe the TSC moving backwards as it
227                          * migrates across physical cpus. But at least
228                          * it should discourage the guest from using the
229                          * TSC to keep track of time.
230                          */
231                         if (tsc_is_invariant && smp_tsc)
232                                 regs[3] |= AMDPM_TSC_INVARIANT;
233                         break;
234
235                 case CPUID_8000_001D:
236                         /* AMD Cache topology, like 0000_0004 for Intel. */
237                         if (!vmm_is_amd())
238                                 goto default_leaf;
239
240                         /*
241                          * Similar to Intel, generate a ficticious cache
242                          * topology for the guest with L3 shared by the
243                          * package, and L1 and L2 local to a core.
244                          */
245                         vm_get_topology(vm, &sockets, &cores, &threads,
246                             &maxcpus);
247                         switch (*ecx) {
248                         case 0:
249                                 logical_cpus = threads;
250                                 level = 1;
251                                 func = 1;       /* data cache */
252                                 break;
253                         case 1:
254                                 logical_cpus = threads;
255                                 level = 2;
256                                 func = 3;       /* unified cache */
257                                 break;
258                         case 2:
259                                 logical_cpus = threads * cores;
260                                 level = 3;
261                                 func = 3;       /* unified cache */
262                                 break;
263                         default:
264                                 logical_cpus = 0;
265                                 level = 0;
266                                 func = 0;
267                                 break;
268                         }
269
270                         logical_cpus = MIN(0xfff, logical_cpus - 1);
271                         regs[0] = (logical_cpus << 14) | (1 << 8) |
272                             (level << 5) | func;
273                         regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
274                         regs[2] = 0;
275                         regs[3] = 0;
276                         break;
277
278                 case CPUID_8000_001E:
279                         /* AMD Family 16h+ additional identifiers */
280                         if (!vmm_is_amd() || CPUID_TO_FAMILY(cpu_id) < 0x16)
281                                 goto default_leaf;
282
283                         vm_get_topology(vm, &sockets, &cores, &threads,
284                             &maxcpus);
285                         regs[0] = vcpu_id;
286                         threads = MIN(0xFF, threads - 1);
287                         regs[1] = (threads << 8) |
288                             (vcpu_id >> log2(threads + 1));
289                         /*
290                          * XXX Bhyve topology cannot yet represent >1 node per
291                          * processor.
292                          */
293                         regs[2] = 0;
294                         regs[3] = 0;
295                         break;
296
297                 case CPUID_0000_0001:
298                         do_cpuid(1, regs);
299
300                         error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
301                         if (error) {
302                                 panic("x86_emulate_cpuid: error %d "
303                                       "fetching x2apic state", error);
304                         }
305
306                         /*
307                          * Override the APIC ID only in ebx
308                          */
309                         regs[1] &= ~(CPUID_LOCAL_APIC_ID);
310                         regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
311
312                         /*
313                          * Don't expose VMX, SpeedStep, TME or SMX capability.
314                          * Advertise x2APIC capability and Hypervisor guest.
315                          */
316                         regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
317                         regs[2] &= ~(CPUID2_SMX);
318
319                         regs[2] |= CPUID2_HV;
320
321                         if (x2apic_state != X2APIC_DISABLED)
322                                 regs[2] |= CPUID2_X2APIC;
323                         else
324                                 regs[2] &= ~CPUID2_X2APIC;
325
326                         /*
327                          * Only advertise CPUID2_XSAVE in the guest if
328                          * the host is using XSAVE.
329                          */
330                         if (!(regs[2] & CPUID2_OSXSAVE))
331                                 regs[2] &= ~CPUID2_XSAVE;
332
333                         /*
334                          * If CPUID2_XSAVE is being advertised and the
335                          * guest has set CR4_XSAVE, set
336                          * CPUID2_OSXSAVE.
337                          */
338                         regs[2] &= ~CPUID2_OSXSAVE;
339                         if (regs[2] & CPUID2_XSAVE) {
340                                 error = vm_get_register(vm, vcpu_id,
341                                     VM_REG_GUEST_CR4, &cr4);
342                                 if (error)
343                                         panic("x86_emulate_cpuid: error %d "
344                                               "fetching %%cr4", error);
345                                 if (cr4 & CR4_XSAVE)
346                                         regs[2] |= CPUID2_OSXSAVE;
347                         }
348
349                         /*
350                          * Hide monitor/mwait until we know how to deal with
351                          * these instructions.
352                          */
353                         regs[2] &= ~CPUID2_MON;
354
355                         /*
356                          * Hide the performance and debug features.
357                          */
358                         regs[2] &= ~CPUID2_PDCM;
359
360                         /*
361                          * No TSC deadline support in the APIC yet
362                          */
363                         regs[2] &= ~CPUID2_TSCDLT;
364
365                         /*
366                          * Hide thermal monitoring
367                          */
368                         regs[3] &= ~(CPUID_ACPI | CPUID_TM);
369
370                         /*
371                          * Hide the debug store capability.
372                          */
373                         regs[3] &= ~CPUID_DS;
374
375                         /*
376                          * Advertise the Machine Check and MTRR capability.
377                          *
378                          * Some guest OSes (e.g. Windows) will not boot if
379                          * these features are absent.
380                          */
381                         regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
382
383                         vm_get_topology(vm, &sockets, &cores, &threads,
384                             &maxcpus);
385                         logical_cpus = threads * cores;
386                         regs[1] &= ~CPUID_HTT_CORES;
387                         regs[1] |= (logical_cpus & 0xff) << 16;
388                         regs[3] |= CPUID_HTT;
389                         break;
390
391                 case CPUID_0000_0004:
392                         cpuid_count(*eax, *ecx, regs);
393
394                         if (regs[0] || regs[1] || regs[2] || regs[3]) {
395                                 vm_get_topology(vm, &sockets, &cores, &threads,
396                                     &maxcpus);
397                                 regs[0] &= 0x3ff;
398                                 regs[0] |= (cores - 1) << 26;
399                                 /*
400                                  * Cache topology:
401                                  * - L1 and L2 are shared only by the logical
402                                  *   processors in a single core.
403                                  * - L3 and above are shared by all logical
404                                  *   processors in the package.
405                                  */
406                                 logical_cpus = threads;
407                                 level = (regs[0] >> 5) & 0x7;
408                                 if (level >= 3)
409                                         logical_cpus *= cores;
410                                 regs[0] |= (logical_cpus - 1) << 14;
411                         }
412                         break;
413
414                 case CPUID_0000_0007:
415                         regs[0] = 0;
416                         regs[1] = 0;
417                         regs[2] = 0;
418                         regs[3] = 0;
419
420                         /* leaf 0 */
421                         if (*ecx == 0) {
422                                 cpuid_count(*eax, *ecx, regs);
423
424                                 /* Only leaf 0 is supported */
425                                 regs[0] = 0;
426
427                                 /*
428                                  * Expose known-safe features.
429                                  */
430                                 regs[1] &= (CPUID_STDEXT_FSGSBASE |
431                                     CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
432                                     CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
433                                     CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
434                                     CPUID_STDEXT_AVX512F |
435                                     CPUID_STDEXT_RDSEED |
436                                     CPUID_STDEXT_AVX512PF |
437                                     CPUID_STDEXT_AVX512ER |
438                                     CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
439                                 regs[2] = 0;
440                                 regs[3] &= CPUID_STDEXT3_MD_CLEAR;
441
442                                 /* Advertise INVPCID if it is enabled. */
443                                 error = vm_get_capability(vm, vcpu_id,
444                                     VM_CAP_ENABLE_INVPCID, &enable_invpcid);
445                                 if (error == 0 && enable_invpcid)
446                                         regs[1] |= CPUID_STDEXT_INVPCID;
447                         }
448                         break;
449
450                 case CPUID_0000_0006:
451                         regs[0] = CPUTPM1_ARAT;
452                         regs[1] = 0;
453                         regs[2] = 0;
454                         regs[3] = 0;
455                         break;
456
457                 case CPUID_0000_000A:
458                         /*
459                          * Handle the access, but report 0 for
460                          * all options
461                          */
462                         regs[0] = 0;
463                         regs[1] = 0;
464                         regs[2] = 0;
465                         regs[3] = 0;
466                         break;
467
468                 case CPUID_0000_000B:
469                         /*
470                          * Intel processor topology enumeration
471                          */
472                         if (vmm_is_intel()) {
473                                 vm_get_topology(vm, &sockets, &cores, &threads,
474                                     &maxcpus);
475                                 if (*ecx == 0) {
476                                         logical_cpus = threads;
477                                         width = log2(logical_cpus);
478                                         level = CPUID_TYPE_SMT;
479                                         x2apic_id = vcpu_id;
480                                 }
481
482                                 if (*ecx == 1) {
483                                         logical_cpus = threads * cores;
484                                         width = log2(logical_cpus);
485                                         level = CPUID_TYPE_CORE;
486                                         x2apic_id = vcpu_id;
487                                 }
488
489                                 if (!cpuid_leaf_b || *ecx >= 2) {
490                                         width = 0;
491                                         logical_cpus = 0;
492                                         level = 0;
493                                         x2apic_id = 0;
494                                 }
495
496                                 regs[0] = width & 0x1f;
497                                 regs[1] = logical_cpus & 0xffff;
498                                 regs[2] = (level << 8) | (*ecx & 0xff);
499                                 regs[3] = x2apic_id;
500                         } else {
501                                 regs[0] = 0;
502                                 regs[1] = 0;
503                                 regs[2] = 0;
504                                 regs[3] = 0;
505                         }
506                         break;
507
508                 case CPUID_0000_000D:
509                         limits = vmm_get_xsave_limits();
510                         if (!limits->xsave_enabled) {
511                                 regs[0] = 0;
512                                 regs[1] = 0;
513                                 regs[2] = 0;
514                                 regs[3] = 0;
515                                 break;
516                         }
517
518                         cpuid_count(*eax, *ecx, regs);
519                         switch (*ecx) {
520                         case 0:
521                                 /*
522                                  * Only permit the guest to use bits
523                                  * that are active in the host in
524                                  * %xcr0.  Also, claim that the
525                                  * maximum save area size is
526                                  * equivalent to the host's current
527                                  * save area size.  Since this runs
528                                  * "inside" of vmrun(), it runs with
529                                  * the guest's xcr0, so the current
530                                  * save area size is correct as-is.
531                                  */
532                                 regs[0] &= limits->xcr0_allowed;
533                                 regs[2] = limits->xsave_max_size;
534                                 regs[3] &= (limits->xcr0_allowed >> 32);
535                                 break;
536                         case 1:
537                                 /* Only permit XSAVEOPT. */
538                                 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
539                                 regs[1] = 0;
540                                 regs[2] = 0;
541                                 regs[3] = 0;
542                                 break;
543                         default:
544                                 /*
545                                  * If the leaf is for a permitted feature,
546                                  * pass through as-is, otherwise return
547                                  * all zeroes.
548                                  */
549                                 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
550                                         regs[0] = 0;
551                                         regs[1] = 0;
552                                         regs[2] = 0;
553                                         regs[3] = 0;
554                                 }
555                                 break;
556                         }
557                         break;
558
559                 case 0x40000000:
560                         regs[0] = CPUID_VM_HIGH;
561                         bcopy(bhyve_id, &regs[1], 4);
562                         bcopy(bhyve_id + 4, &regs[2], 4);
563                         bcopy(bhyve_id + 8, &regs[3], 4);
564                         break;
565
566                 default:
567 default_leaf:
568                         /*
569                          * The leaf value has already been clamped so
570                          * simply pass this through, keeping count of
571                          * how many unhandled leaf values have been seen.
572                          */
573                         atomic_add_long(&bhyve_xcpuids, 1);
574                         cpuid_count(*eax, *ecx, regs);
575                         break;
576         }
577
578         *eax = regs[0];
579         *ebx = regs[1];
580         *ecx = regs[2];
581         *edx = regs[3];
582
583         return (1);
584 }
585
586 bool
587 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
588 {
589         bool rv;
590
591         KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
592             __func__, cap));
593
594         /*
595          * Simply passthrough the capabilities of the host cpu for now.
596          */
597         rv = false;
598         switch (cap) {
599         case VCC_NO_EXECUTE:
600                 if (amd_feature & AMDID_NX)
601                         rv = true;
602                 break;
603         case VCC_FFXSR:
604                 if (amd_feature & AMDID_FFXSR)
605                         rv = true;
606                 break;
607         case VCC_TCE:
608                 if (amd_feature2 & AMDID2_TCE)
609                         rv = true;
610                 break;
611         default:
612                 panic("%s: unknown vm_cpu_capability %d", __func__, cap);
613         }
614         return (rv);
615 }