]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/vmm/x86.c
MFV r361322:
[FreeBSD/FreeBSD.git] / sys / amd64 / vmm / x86.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/pcpu.h>
36 #include <sys/systm.h>
37 #include <sys/sysctl.h>
38
39 #include <machine/clock.h>
40 #include <machine/cpufunc.h>
41 #include <machine/md_var.h>
42 #include <machine/segments.h>
43 #include <machine/specialreg.h>
44
45 #include <machine/vmm.h>
46
47 #include "vmm_host.h"
48 #include "vmm_ktr.h"
49 #include "vmm_util.h"
50 #include "x86.h"
51
52 SYSCTL_DECL(_hw_vmm);
53 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
54     NULL);
55
56 #define CPUID_VM_HIGH           0x40000000
57
58 static const char bhyve_id[12] = "bhyve bhyve ";
59
60 static uint64_t bhyve_xcpuids;
61 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
62     "Number of times an unknown cpuid leaf was accessed");
63
64 #if __FreeBSD_version < 1200060 /* Remove after 11 EOL helps MFCing */
65 extern u_int threads_per_core;
66 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
67     &threads_per_core, 0, NULL);
68
69 extern u_int cores_per_package;
70 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
71     &cores_per_package, 0, NULL);
72 #endif
73
74 static int cpuid_leaf_b = 1;
75 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
76     &cpuid_leaf_b, 0, NULL);
77
78 /*
79  * Round up to the next power of two, if necessary, and then take log2.
80  * Returns -1 if argument is zero.
81  */
82 static __inline int
83 log2(u_int x)
84 {
85
86         return (fls(x << (1 - powerof2(x))) - 1);
87 }
88
89 int
90 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
91                   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
92 {
93         const struct xsave_limits *limits;
94         uint64_t cr4;
95         int error, enable_invpcid, level, width, x2apic_id;
96         unsigned int func, regs[4], logical_cpus;
97         enum x2apic_state x2apic_state;
98         uint16_t cores, maxcpus, sockets, threads;
99
100         VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
101
102         /*
103          * Requests for invalid CPUID levels should map to the highest
104          * available level instead.
105          */
106         if (cpu_exthigh != 0 && *eax >= 0x80000000) {
107                 if (*eax > cpu_exthigh)
108                         *eax = cpu_exthigh;
109         } else if (*eax >= 0x40000000) {
110                 if (*eax > CPUID_VM_HIGH)
111                         *eax = CPUID_VM_HIGH;
112         } else if (*eax > cpu_high) {
113                 *eax = cpu_high;
114         }
115
116         func = *eax;
117
118         /*
119          * In general the approach used for CPU topology is to
120          * advertise a flat topology where all CPUs are packages with
121          * no multi-core or SMT.
122          */
123         switch (func) {
124                 /*
125                  * Pass these through to the guest
126                  */
127                 case CPUID_0000_0000:
128                 case CPUID_0000_0002:
129                 case CPUID_0000_0003:
130                 case CPUID_8000_0000:
131                 case CPUID_8000_0002:
132                 case CPUID_8000_0003:
133                 case CPUID_8000_0004:
134                 case CPUID_8000_0006:
135                         cpuid_count(*eax, *ecx, regs);
136                         break;
137                 case CPUID_8000_0008:
138                         cpuid_count(*eax, *ecx, regs);
139                         if (vmm_is_svm()) {
140                                 /*
141                                  * As on Intel (0000_0007:0, EDX), mask out
142                                  * unsupported or unsafe AMD extended features
143                                  * (8000_0008 EBX).
144                                  */
145                                 regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF |
146                                     AMDFEID_XSAVEERPTR);
147
148                                 vm_get_topology(vm, &sockets, &cores, &threads,
149                                     &maxcpus);
150                                 /*
151                                  * Here, width is ApicIdCoreIdSize, present on
152                                  * at least Family 15h and newer.  It
153                                  * represents the "number of bits in the
154                                  * initial apicid that indicate thread id
155                                  * within a package."
156                                  *
157                                  * Our topo_probe_amd() uses it for
158                                  * pkg_id_shift and other OSes may rely on it.
159                                  */
160                                 width = MIN(0xF, log2(threads * cores));
161                                 if (width < 0x4)
162                                         width = 0;
163                                 logical_cpus = MIN(0xFF, threads * cores - 1);
164                                 regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus;
165                         }
166                         break;
167
168                 case CPUID_8000_0001:
169                         cpuid_count(*eax, *ecx, regs);
170
171                         /*
172                          * Hide SVM from guest.
173                          */
174                         regs[2] &= ~AMDID2_SVM;
175
176                         /*
177                          * Don't advertise extended performance counter MSRs
178                          * to the guest.
179                          */
180                         regs[2] &= ~AMDID2_PCXC;
181                         regs[2] &= ~AMDID2_PNXC;
182                         regs[2] &= ~AMDID2_PTSCEL2I;
183
184                         /*
185                          * Don't advertise Instruction Based Sampling feature.
186                          */
187                         regs[2] &= ~AMDID2_IBS;
188
189                         /* NodeID MSR not available */
190                         regs[2] &= ~AMDID2_NODE_ID;
191
192                         /* Don't advertise the OS visible workaround feature */
193                         regs[2] &= ~AMDID2_OSVW;
194
195                         /* Hide mwaitx/monitorx capability from the guest */
196                         regs[2] &= ~AMDID2_MWAITX;
197
198                         /*
199                          * Hide rdtscp/ia32_tsc_aux until we know how
200                          * to deal with them.
201                          */
202                         regs[3] &= ~AMDID_RDTSCP;
203                         break;
204
205                 case CPUID_8000_0007:
206                         /*
207                          * AMD uses this leaf to advertise the processor's
208                          * power monitoring and RAS capabilities. These
209                          * features are hardware-specific and exposing
210                          * them to a guest doesn't make a lot of sense.
211                          *
212                          * Intel uses this leaf only to advertise the
213                          * "Invariant TSC" feature with all other bits
214                          * being reserved (set to zero).
215                          */
216                         regs[0] = 0;
217                         regs[1] = 0;
218                         regs[2] = 0;
219                         regs[3] = 0;
220
221                         /*
222                          * "Invariant TSC" can be advertised to the guest if:
223                          * - host TSC frequency is invariant
224                          * - host TSCs are synchronized across physical cpus
225                          *
226                          * XXX This still falls short because the vcpu
227                          * can observe the TSC moving backwards as it
228                          * migrates across physical cpus. But at least
229                          * it should discourage the guest from using the
230                          * TSC to keep track of time.
231                          */
232                         if (tsc_is_invariant && smp_tsc)
233                                 regs[3] |= AMDPM_TSC_INVARIANT;
234                         break;
235
236                 case CPUID_8000_001D:
237                         /* AMD Cache topology, like 0000_0004 for Intel. */
238                         if (!vmm_is_svm())
239                                 goto default_leaf;
240
241                         /*
242                          * Similar to Intel, generate a ficticious cache
243                          * topology for the guest with L3 shared by the
244                          * package, and L1 and L2 local to a core.
245                          */
246                         vm_get_topology(vm, &sockets, &cores, &threads,
247                             &maxcpus);
248                         switch (*ecx) {
249                         case 0:
250                                 logical_cpus = threads;
251                                 level = 1;
252                                 func = 1;       /* data cache */
253                                 break;
254                         case 1:
255                                 logical_cpus = threads;
256                                 level = 2;
257                                 func = 3;       /* unified cache */
258                                 break;
259                         case 2:
260                                 logical_cpus = threads * cores;
261                                 level = 3;
262                                 func = 3;       /* unified cache */
263                                 break;
264                         default:
265                                 logical_cpus = 0;
266                                 level = 0;
267                                 func = 0;
268                                 break;
269                         }
270
271                         logical_cpus = MIN(0xfff, logical_cpus - 1);
272                         regs[0] = (logical_cpus << 14) | (1 << 8) |
273                             (level << 5) | func;
274                         regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0;
275                         regs[2] = 0;
276                         regs[3] = 0;
277                         break;
278
279                 case CPUID_8000_001E:
280                         /*
281                          * AMD Family 16h+ and Hygon Family 18h additional
282                          * identifiers.
283                          */
284                         if (!vmm_is_svm() || CPUID_TO_FAMILY(cpu_id) < 0x16)
285                                 goto default_leaf;
286
287                         vm_get_topology(vm, &sockets, &cores, &threads,
288                             &maxcpus);
289                         regs[0] = vcpu_id;
290                         threads = MIN(0xFF, threads - 1);
291                         regs[1] = (threads << 8) |
292                             (vcpu_id >> log2(threads + 1));
293                         /*
294                          * XXX Bhyve topology cannot yet represent >1 node per
295                          * processor.
296                          */
297                         regs[2] = 0;
298                         regs[3] = 0;
299                         break;
300
301                 case CPUID_0000_0001:
302                         do_cpuid(1, regs);
303
304                         error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
305                         if (error) {
306                                 panic("x86_emulate_cpuid: error %d "
307                                       "fetching x2apic state", error);
308                         }
309
310                         /*
311                          * Override the APIC ID only in ebx
312                          */
313                         regs[1] &= ~(CPUID_LOCAL_APIC_ID);
314                         regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
315
316                         /*
317                          * Don't expose VMX, SpeedStep, TME or SMX capability.
318                          * Advertise x2APIC capability and Hypervisor guest.
319                          */
320                         regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
321                         regs[2] &= ~(CPUID2_SMX);
322
323                         regs[2] |= CPUID2_HV;
324
325                         if (x2apic_state != X2APIC_DISABLED)
326                                 regs[2] |= CPUID2_X2APIC;
327                         else
328                                 regs[2] &= ~CPUID2_X2APIC;
329
330                         /*
331                          * Only advertise CPUID2_XSAVE in the guest if
332                          * the host is using XSAVE.
333                          */
334                         if (!(regs[2] & CPUID2_OSXSAVE))
335                                 regs[2] &= ~CPUID2_XSAVE;
336
337                         /*
338                          * If CPUID2_XSAVE is being advertised and the
339                          * guest has set CR4_XSAVE, set
340                          * CPUID2_OSXSAVE.
341                          */
342                         regs[2] &= ~CPUID2_OSXSAVE;
343                         if (regs[2] & CPUID2_XSAVE) {
344                                 error = vm_get_register(vm, vcpu_id,
345                                     VM_REG_GUEST_CR4, &cr4);
346                                 if (error)
347                                         panic("x86_emulate_cpuid: error %d "
348                                               "fetching %%cr4", error);
349                                 if (cr4 & CR4_XSAVE)
350                                         regs[2] |= CPUID2_OSXSAVE;
351                         }
352
353                         /*
354                          * Hide monitor/mwait until we know how to deal with
355                          * these instructions.
356                          */
357                         regs[2] &= ~CPUID2_MON;
358
359                         /*
360                          * Hide the performance and debug features.
361                          */
362                         regs[2] &= ~CPUID2_PDCM;
363
364                         /*
365                          * No TSC deadline support in the APIC yet
366                          */
367                         regs[2] &= ~CPUID2_TSCDLT;
368
369                         /*
370                          * Hide thermal monitoring
371                          */
372                         regs[3] &= ~(CPUID_ACPI | CPUID_TM);
373
374                         /*
375                          * Hide the debug store capability.
376                          */
377                         regs[3] &= ~CPUID_DS;
378
379                         /*
380                          * Advertise the Machine Check and MTRR capability.
381                          *
382                          * Some guest OSes (e.g. Windows) will not boot if
383                          * these features are absent.
384                          */
385                         regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
386
387                         vm_get_topology(vm, &sockets, &cores, &threads,
388                             &maxcpus);
389                         logical_cpus = threads * cores;
390                         regs[1] &= ~CPUID_HTT_CORES;
391                         regs[1] |= (logical_cpus & 0xff) << 16;
392                         regs[3] |= CPUID_HTT;
393                         break;
394
395                 case CPUID_0000_0004:
396                         cpuid_count(*eax, *ecx, regs);
397
398                         if (regs[0] || regs[1] || regs[2] || regs[3]) {
399                                 vm_get_topology(vm, &sockets, &cores, &threads,
400                                     &maxcpus);
401                                 regs[0] &= 0x3ff;
402                                 regs[0] |= (cores - 1) << 26;
403                                 /*
404                                  * Cache topology:
405                                  * - L1 and L2 are shared only by the logical
406                                  *   processors in a single core.
407                                  * - L3 and above are shared by all logical
408                                  *   processors in the package.
409                                  */
410                                 logical_cpus = threads;
411                                 level = (regs[0] >> 5) & 0x7;
412                                 if (level >= 3)
413                                         logical_cpus *= cores;
414                                 regs[0] |= (logical_cpus - 1) << 14;
415                         }
416                         break;
417
418                 case CPUID_0000_0007:
419                         regs[0] = 0;
420                         regs[1] = 0;
421                         regs[2] = 0;
422                         regs[3] = 0;
423
424                         /* leaf 0 */
425                         if (*ecx == 0) {
426                                 cpuid_count(*eax, *ecx, regs);
427
428                                 /* Only leaf 0 is supported */
429                                 regs[0] = 0;
430
431                                 /*
432                                  * Expose known-safe features.
433                                  */
434                                 regs[1] &= (CPUID_STDEXT_FSGSBASE |
435                                     CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
436                                     CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
437                                     CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
438                                     CPUID_STDEXT_AVX512F |
439                                     CPUID_STDEXT_RDSEED |
440                                     CPUID_STDEXT_AVX512PF |
441                                     CPUID_STDEXT_AVX512ER |
442                                     CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA);
443                                 regs[2] = 0;
444                                 regs[3] &= CPUID_STDEXT3_MD_CLEAR;
445
446                                 /* Advertise INVPCID if it is enabled. */
447                                 error = vm_get_capability(vm, vcpu_id,
448                                     VM_CAP_ENABLE_INVPCID, &enable_invpcid);
449                                 if (error == 0 && enable_invpcid)
450                                         regs[1] |= CPUID_STDEXT_INVPCID;
451                         }
452                         break;
453
454                 case CPUID_0000_0006:
455                         regs[0] = CPUTPM1_ARAT;
456                         regs[1] = 0;
457                         regs[2] = 0;
458                         regs[3] = 0;
459                         break;
460
461                 case CPUID_0000_000A:
462                         /*
463                          * Handle the access, but report 0 for
464                          * all options
465                          */
466                         regs[0] = 0;
467                         regs[1] = 0;
468                         regs[2] = 0;
469                         regs[3] = 0;
470                         break;
471
472                 case CPUID_0000_000B:
473                         /*
474                          * Intel processor topology enumeration
475                          */
476                         if (vmm_is_intel()) {
477                                 vm_get_topology(vm, &sockets, &cores, &threads,
478                                     &maxcpus);
479                                 if (*ecx == 0) {
480                                         logical_cpus = threads;
481                                         width = log2(logical_cpus);
482                                         level = CPUID_TYPE_SMT;
483                                         x2apic_id = vcpu_id;
484                                 }
485
486                                 if (*ecx == 1) {
487                                         logical_cpus = threads * cores;
488                                         width = log2(logical_cpus);
489                                         level = CPUID_TYPE_CORE;
490                                         x2apic_id = vcpu_id;
491                                 }
492
493                                 if (!cpuid_leaf_b || *ecx >= 2) {
494                                         width = 0;
495                                         logical_cpus = 0;
496                                         level = 0;
497                                         x2apic_id = 0;
498                                 }
499
500                                 regs[0] = width & 0x1f;
501                                 regs[1] = logical_cpus & 0xffff;
502                                 regs[2] = (level << 8) | (*ecx & 0xff);
503                                 regs[3] = x2apic_id;
504                         } else {
505                                 regs[0] = 0;
506                                 regs[1] = 0;
507                                 regs[2] = 0;
508                                 regs[3] = 0;
509                         }
510                         break;
511
512                 case CPUID_0000_000D:
513                         limits = vmm_get_xsave_limits();
514                         if (!limits->xsave_enabled) {
515                                 regs[0] = 0;
516                                 regs[1] = 0;
517                                 regs[2] = 0;
518                                 regs[3] = 0;
519                                 break;
520                         }
521
522                         cpuid_count(*eax, *ecx, regs);
523                         switch (*ecx) {
524                         case 0:
525                                 /*
526                                  * Only permit the guest to use bits
527                                  * that are active in the host in
528                                  * %xcr0.  Also, claim that the
529                                  * maximum save area size is
530                                  * equivalent to the host's current
531                                  * save area size.  Since this runs
532                                  * "inside" of vmrun(), it runs with
533                                  * the guest's xcr0, so the current
534                                  * save area size is correct as-is.
535                                  */
536                                 regs[0] &= limits->xcr0_allowed;
537                                 regs[2] = limits->xsave_max_size;
538                                 regs[3] &= (limits->xcr0_allowed >> 32);
539                                 break;
540                         case 1:
541                                 /* Only permit XSAVEOPT. */
542                                 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
543                                 regs[1] = 0;
544                                 regs[2] = 0;
545                                 regs[3] = 0;
546                                 break;
547                         default:
548                                 /*
549                                  * If the leaf is for a permitted feature,
550                                  * pass through as-is, otherwise return
551                                  * all zeroes.
552                                  */
553                                 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
554                                         regs[0] = 0;
555                                         regs[1] = 0;
556                                         regs[2] = 0;
557                                         regs[3] = 0;
558                                 }
559                                 break;
560                         }
561                         break;
562
563                 case CPUID_0000_0015:
564                         /*
565                          * Don't report CPU TSC/Crystal ratio and clock
566                          * values since guests may use these to derive the
567                          * local APIC frequency..
568                          */
569                         regs[0] = 0;
570                         regs[1] = 0;
571                         regs[2] = 0;
572                         regs[3] = 0;
573                         break;
574
575                 case 0x40000000:
576                         regs[0] = CPUID_VM_HIGH;
577                         bcopy(bhyve_id, &regs[1], 4);
578                         bcopy(bhyve_id + 4, &regs[2], 4);
579                         bcopy(bhyve_id + 8, &regs[3], 4);
580                         break;
581
582                 default:
583 default_leaf:
584                         /*
585                          * The leaf value has already been clamped so
586                          * simply pass this through, keeping count of
587                          * how many unhandled leaf values have been seen.
588                          */
589                         atomic_add_long(&bhyve_xcpuids, 1);
590                         cpuid_count(*eax, *ecx, regs);
591                         break;
592         }
593
594         *eax = regs[0];
595         *ebx = regs[1];
596         *ecx = regs[2];
597         *edx = regs[3];
598
599         return (1);
600 }
601
602 bool
603 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
604 {
605         bool rv;
606
607         KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
608             __func__, cap));
609
610         /*
611          * Simply passthrough the capabilities of the host cpu for now.
612          */
613         rv = false;
614         switch (cap) {
615         case VCC_NO_EXECUTE:
616                 if (amd_feature & AMDID_NX)
617                         rv = true;
618                 break;
619         case VCC_FFXSR:
620                 if (amd_feature & AMDID_FFXSR)
621                         rv = true;
622                 break;
623         case VCC_TCE:
624                 if (amd_feature2 & AMDID2_TCE)
625                         rv = true;
626                 break;
627         default:
628                 panic("%s: unknown vm_cpu_capability %d", __func__, cap);
629         }
630         return (rv);
631 }