]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/vmm/x86.c
libfdt: Update to 1.4.6, switch to using libfdt for overlay support
[FreeBSD/FreeBSD.git] / sys / amd64 / vmm / x86.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/sysctl.h>
38
39 #include <machine/clock.h>
40 #include <machine/cpufunc.h>
41 #include <machine/md_var.h>
42 #include <machine/segments.h>
43 #include <machine/specialreg.h>
44
45 #include <machine/vmm.h>
46
47 #include "vmm_host.h"
48 #include "vmm_ktr.h"
49 #include "vmm_util.h"
50 #include "x86.h"
51
52 SYSCTL_DECL(_hw_vmm);
53 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
54
55 #define CPUID_VM_HIGH           0x40000000
56
57 static const char bhyve_id[12] = "bhyve bhyve ";
58
59 static uint64_t bhyve_xcpuids;
60 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
61     "Number of times an unknown cpuid leaf was accessed");
62
63 /*
64  * The default CPU topology is a single thread per package.
65  */
66 static u_int threads_per_core = 1;
67 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
68     &threads_per_core, 0, NULL);
69
70 static u_int cores_per_package = 1;
71 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
72     &cores_per_package, 0, NULL);
73
74 static int cpuid_leaf_b = 1;
75 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
76     &cpuid_leaf_b, 0, NULL);
77
78 /*
79  * Round up to the next power of two, if necessary, and then take log2.
80  * Returns -1 if argument is zero.
81  */
82 static __inline int
83 log2(u_int x)
84 {
85
86         return (fls(x << (1 - powerof2(x))) - 1);
87 }
88
89 int
90 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
91                   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
92 {
93         const struct xsave_limits *limits;
94         uint64_t cr4;
95         int error, enable_invpcid, level, width, x2apic_id;
96         unsigned int func, regs[4], logical_cpus;
97         enum x2apic_state x2apic_state;
98
99         VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
100
101         /*
102          * Requests for invalid CPUID levels should map to the highest
103          * available level instead.
104          */
105         if (cpu_exthigh != 0 && *eax >= 0x80000000) {
106                 if (*eax > cpu_exthigh)
107                         *eax = cpu_exthigh;
108         } else if (*eax >= 0x40000000) {
109                 if (*eax > CPUID_VM_HIGH)
110                         *eax = CPUID_VM_HIGH;
111         } else if (*eax > cpu_high) {
112                 *eax = cpu_high;
113         }
114
115         func = *eax;
116
117         /*
118          * In general the approach used for CPU topology is to
119          * advertise a flat topology where all CPUs are packages with
120          * no multi-core or SMT.
121          */
122         switch (func) {
123                 /*
124                  * Pass these through to the guest
125                  */
126                 case CPUID_0000_0000:
127                 case CPUID_0000_0002:
128                 case CPUID_0000_0003:
129                 case CPUID_8000_0000:
130                 case CPUID_8000_0002:
131                 case CPUID_8000_0003:
132                 case CPUID_8000_0004:
133                 case CPUID_8000_0006:
134                         cpuid_count(*eax, *ecx, regs);
135                         break;
136                 case CPUID_8000_0008:
137                         cpuid_count(*eax, *ecx, regs);
138                         if (vmm_is_amd()) {
139                                 /*
140                                  * XXX this might appear silly because AMD
141                                  * cpus don't have threads.
142                                  *
143                                  * However this matches the logical cpus as
144                                  * advertised by leaf 0x1 and will work even
145                                  * if the 'threads_per_core' tunable is set
146                                  * incorrectly on an AMD host.
147                                  */
148                                 logical_cpus = threads_per_core *
149                                     cores_per_package;
150                                 regs[2] = logical_cpus - 1;
151                         }
152                         break;
153
154                 case CPUID_8000_0001:
155                         cpuid_count(*eax, *ecx, regs);
156
157                         /*
158                          * Hide SVM and Topology Extension features from guest.
159                          */
160                         regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
161
162                         /*
163                          * Don't advertise extended performance counter MSRs
164                          * to the guest.
165                          */
166                         regs[2] &= ~AMDID2_PCXC;
167                         regs[2] &= ~AMDID2_PNXC;
168                         regs[2] &= ~AMDID2_PTSCEL2I;
169
170                         /*
171                          * Don't advertise Instruction Based Sampling feature.
172                          */
173                         regs[2] &= ~AMDID2_IBS;
174
175                         /* NodeID MSR not available */
176                         regs[2] &= ~AMDID2_NODE_ID;
177
178                         /* Don't advertise the OS visible workaround feature */
179                         regs[2] &= ~AMDID2_OSVW;
180
181                         /* Hide mwaitx/monitorx capability from the guest */
182                         regs[2] &= ~AMDID2_MWAITX;
183
184                         /*
185                          * Hide rdtscp/ia32_tsc_aux until we know how
186                          * to deal with them.
187                          */
188                         regs[3] &= ~AMDID_RDTSCP;
189                         break;
190
191                 case CPUID_8000_0007:
192                         /*
193                          * AMD uses this leaf to advertise the processor's
194                          * power monitoring and RAS capabilities. These
195                          * features are hardware-specific and exposing
196                          * them to a guest doesn't make a lot of sense.
197                          *
198                          * Intel uses this leaf only to advertise the
199                          * "Invariant TSC" feature with all other bits
200                          * being reserved (set to zero).
201                          */
202                         regs[0] = 0;
203                         regs[1] = 0;
204                         regs[2] = 0;
205                         regs[3] = 0;
206
207                         /*
208                          * "Invariant TSC" can be advertised to the guest if:
209                          * - host TSC frequency is invariant
210                          * - host TSCs are synchronized across physical cpus
211                          *
212                          * XXX This still falls short because the vcpu
213                          * can observe the TSC moving backwards as it
214                          * migrates across physical cpus. But at least
215                          * it should discourage the guest from using the
216                          * TSC to keep track of time.
217                          */
218                         if (tsc_is_invariant && smp_tsc)
219                                 regs[3] |= AMDPM_TSC_INVARIANT;
220                         break;
221
222                 case CPUID_0000_0001:
223                         do_cpuid(1, regs);
224
225                         error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
226                         if (error) {
227                                 panic("x86_emulate_cpuid: error %d "
228                                       "fetching x2apic state", error);
229                         }
230
231                         /*
232                          * Override the APIC ID only in ebx
233                          */
234                         regs[1] &= ~(CPUID_LOCAL_APIC_ID);
235                         regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
236
237                         /*
238                          * Don't expose VMX, SpeedStep, TME or SMX capability.
239                          * Advertise x2APIC capability and Hypervisor guest.
240                          */
241                         regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
242                         regs[2] &= ~(CPUID2_SMX);
243
244                         regs[2] |= CPUID2_HV;
245
246                         if (x2apic_state != X2APIC_DISABLED)
247                                 regs[2] |= CPUID2_X2APIC;
248                         else
249                                 regs[2] &= ~CPUID2_X2APIC;
250
251                         /*
252                          * Only advertise CPUID2_XSAVE in the guest if
253                          * the host is using XSAVE.
254                          */
255                         if (!(regs[2] & CPUID2_OSXSAVE))
256                                 regs[2] &= ~CPUID2_XSAVE;
257
258                         /*
259                          * If CPUID2_XSAVE is being advertised and the
260                          * guest has set CR4_XSAVE, set
261                          * CPUID2_OSXSAVE.
262                          */
263                         regs[2] &= ~CPUID2_OSXSAVE;
264                         if (regs[2] & CPUID2_XSAVE) {
265                                 error = vm_get_register(vm, vcpu_id,
266                                     VM_REG_GUEST_CR4, &cr4);
267                                 if (error)
268                                         panic("x86_emulate_cpuid: error %d "
269                                               "fetching %%cr4", error);
270                                 if (cr4 & CR4_XSAVE)
271                                         regs[2] |= CPUID2_OSXSAVE;
272                         }
273
274                         /*
275                          * Hide monitor/mwait until we know how to deal with
276                          * these instructions.
277                          */
278                         regs[2] &= ~CPUID2_MON;
279
280                         /*
281                          * Hide the performance and debug features.
282                          */
283                         regs[2] &= ~CPUID2_PDCM;
284
285                         /*
286                          * No TSC deadline support in the APIC yet
287                          */
288                         regs[2] &= ~CPUID2_TSCDLT;
289
290                         /*
291                          * Hide thermal monitoring
292                          */
293                         regs[3] &= ~(CPUID_ACPI | CPUID_TM);
294
295                         /*
296                          * Hide the debug store capability.
297                          */
298                         regs[3] &= ~CPUID_DS;
299
300                         /*
301                          * Advertise the Machine Check and MTRR capability.
302                          *
303                          * Some guest OSes (e.g. Windows) will not boot if
304                          * these features are absent.
305                          */
306                         regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
307
308                         logical_cpus = threads_per_core * cores_per_package;
309                         regs[1] &= ~CPUID_HTT_CORES;
310                         regs[1] |= (logical_cpus & 0xff) << 16;
311                         regs[3] |= CPUID_HTT;
312                         break;
313
314                 case CPUID_0000_0004:
315                         cpuid_count(*eax, *ecx, regs);
316
317                         if (regs[0] || regs[1] || regs[2] || regs[3]) {
318                                 regs[0] &= 0x3ff;
319                                 regs[0] |= (cores_per_package - 1) << 26;
320                                 /*
321                                  * Cache topology:
322                                  * - L1 and L2 are shared only by the logical
323                                  *   processors in a single core.
324                                  * - L3 and above are shared by all logical
325                                  *   processors in the package.
326                                  */
327                                 logical_cpus = threads_per_core;
328                                 level = (regs[0] >> 5) & 0x7;
329                                 if (level >= 3)
330                                         logical_cpus *= cores_per_package;
331                                 regs[0] |= (logical_cpus - 1) << 14;
332                         }
333                         break;
334
335                 case CPUID_0000_0007:
336                         regs[0] = 0;
337                         regs[1] = 0;
338                         regs[2] = 0;
339                         regs[3] = 0;
340
341                         /* leaf 0 */
342                         if (*ecx == 0) {
343                                 cpuid_count(*eax, *ecx, regs);
344
345                                 /* Only leaf 0 is supported */
346                                 regs[0] = 0;
347
348                                 /*
349                                  * Expose known-safe features.
350                                  */
351                                 regs[1] &= (CPUID_STDEXT_FSGSBASE |
352                                     CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
353                                     CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
354                                     CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
355                                     CPUID_STDEXT_AVX512F |
356                                     CPUID_STDEXT_AVX512PF |
357                                     CPUID_STDEXT_AVX512ER |
358                                     CPUID_STDEXT_AVX512CD);
359                                 regs[2] = 0;
360                                 regs[3] = 0;
361
362                                 /* Advertise INVPCID if it is enabled. */
363                                 error = vm_get_capability(vm, vcpu_id,
364                                     VM_CAP_ENABLE_INVPCID, &enable_invpcid);
365                                 if (error == 0 && enable_invpcid)
366                                         regs[1] |= CPUID_STDEXT_INVPCID;
367                         }
368                         break;
369
370                 case CPUID_0000_0006:
371                         regs[0] = CPUTPM1_ARAT;
372                         regs[1] = 0;
373                         regs[2] = 0;
374                         regs[3] = 0;
375                         break;
376
377                 case CPUID_0000_000A:
378                         /*
379                          * Handle the access, but report 0 for
380                          * all options
381                          */
382                         regs[0] = 0;
383                         regs[1] = 0;
384                         regs[2] = 0;
385                         regs[3] = 0;
386                         break;
387
388                 case CPUID_0000_000B:
389                         /*
390                          * Processor topology enumeration
391                          */
392                         if (*ecx == 0) {
393                                 logical_cpus = threads_per_core;
394                                 width = log2(logical_cpus);
395                                 level = CPUID_TYPE_SMT;
396                                 x2apic_id = vcpu_id;
397                         }
398
399                         if (*ecx == 1) {
400                                 logical_cpus = threads_per_core *
401                                     cores_per_package;
402                                 width = log2(logical_cpus);
403                                 level = CPUID_TYPE_CORE;
404                                 x2apic_id = vcpu_id;
405                         }
406
407                         if (!cpuid_leaf_b || *ecx >= 2) {
408                                 width = 0;
409                                 logical_cpus = 0;
410                                 level = 0;
411                                 x2apic_id = 0;
412                         }
413
414                         regs[0] = width & 0x1f;
415                         regs[1] = logical_cpus & 0xffff;
416                         regs[2] = (level << 8) | (*ecx & 0xff);
417                         regs[3] = x2apic_id;
418                         break;
419
420                 case CPUID_0000_000D:
421                         limits = vmm_get_xsave_limits();
422                         if (!limits->xsave_enabled) {
423                                 regs[0] = 0;
424                                 regs[1] = 0;
425                                 regs[2] = 0;
426                                 regs[3] = 0;
427                                 break;
428                         }
429
430                         cpuid_count(*eax, *ecx, regs);
431                         switch (*ecx) {
432                         case 0:
433                                 /*
434                                  * Only permit the guest to use bits
435                                  * that are active in the host in
436                                  * %xcr0.  Also, claim that the
437                                  * maximum save area size is
438                                  * equivalent to the host's current
439                                  * save area size.  Since this runs
440                                  * "inside" of vmrun(), it runs with
441                                  * the guest's xcr0, so the current
442                                  * save area size is correct as-is.
443                                  */
444                                 regs[0] &= limits->xcr0_allowed;
445                                 regs[2] = limits->xsave_max_size;
446                                 regs[3] &= (limits->xcr0_allowed >> 32);
447                                 break;
448                         case 1:
449                                 /* Only permit XSAVEOPT. */
450                                 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
451                                 regs[1] = 0;
452                                 regs[2] = 0;
453                                 regs[3] = 0;
454                                 break;
455                         default:
456                                 /*
457                                  * If the leaf is for a permitted feature,
458                                  * pass through as-is, otherwise return
459                                  * all zeroes.
460                                  */
461                                 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
462                                         regs[0] = 0;
463                                         regs[1] = 0;
464                                         regs[2] = 0;
465                                         regs[3] = 0;
466                                 }
467                                 break;
468                         }
469                         break;
470
471                 case 0x40000000:
472                         regs[0] = CPUID_VM_HIGH;
473                         bcopy(bhyve_id, &regs[1], 4);
474                         bcopy(bhyve_id + 4, &regs[2], 4);
475                         bcopy(bhyve_id + 8, &regs[3], 4);
476                         break;
477
478                 default:
479                         /*
480                          * The leaf value has already been clamped so
481                          * simply pass this through, keeping count of
482                          * how many unhandled leaf values have been seen.
483                          */
484                         atomic_add_long(&bhyve_xcpuids, 1);
485                         cpuid_count(*eax, *ecx, regs);
486                         break;
487         }
488
489         *eax = regs[0];
490         *ebx = regs[1];
491         *ecx = regs[2];
492         *edx = regs[3];
493
494         return (1);
495 }
496
497 bool
498 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
499 {
500         bool rv;
501
502         KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
503             __func__, cap));
504
505         /*
506          * Simply passthrough the capabilities of the host cpu for now.
507          */
508         rv = false;
509         switch (cap) {
510         case VCC_NO_EXECUTE:
511                 if (amd_feature & AMDID_NX)
512                         rv = true;
513                 break;
514         case VCC_FFXSR:
515                 if (amd_feature & AMDID_FFXSR)
516                         rv = true;
517                 break;
518         case VCC_TCE:
519                 if (amd_feature2 & AMDID2_TCE)
520                         rv = true;
521                 break;
522         default:
523                 panic("%s: unknown vm_cpu_capability %d", __func__, cap);
524         }
525         return (rv);
526 }