]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/vmm/x86.c
Merge llvm, clang, lld, lldb, compiler-rt and libc++ r303197, and update
[FreeBSD/FreeBSD.git] / sys / amd64 / vmm / x86.c
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/sysctl.h>
36
37 #include <machine/clock.h>
38 #include <machine/cpufunc.h>
39 #include <machine/md_var.h>
40 #include <machine/segments.h>
41 #include <machine/specialreg.h>
42
43 #include <machine/vmm.h>
44
45 #include "vmm_host.h"
46 #include "vmm_ktr.h"
47 #include "vmm_util.h"
48 #include "x86.h"
49
50 SYSCTL_DECL(_hw_vmm);
51 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
52
53 #define CPUID_VM_HIGH           0x40000000
54
55 static const char bhyve_id[12] = "bhyve bhyve ";
56
57 static uint64_t bhyve_xcpuids;
58 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
59     "Number of times an unknown cpuid leaf was accessed");
60
61 /*
62  * The default CPU topology is a single thread per package.
63  */
64 static u_int threads_per_core = 1;
65 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
66     &threads_per_core, 0, NULL);
67
68 static u_int cores_per_package = 1;
69 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
70     &cores_per_package, 0, NULL);
71
72 static int cpuid_leaf_b = 1;
73 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
74     &cpuid_leaf_b, 0, NULL);
75
76 /*
77  * Round up to the next power of two, if necessary, and then take log2.
78  * Returns -1 if argument is zero.
79  */
80 static __inline int
81 log2(u_int x)
82 {
83
84         return (fls(x << (1 - powerof2(x))) - 1);
85 }
86
87 int
88 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
89                   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
90 {
91         const struct xsave_limits *limits;
92         uint64_t cr4;
93         int error, enable_invpcid, level, width, x2apic_id;
94         unsigned int func, regs[4], logical_cpus;
95         enum x2apic_state x2apic_state;
96
97         VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
98
99         /*
100          * Requests for invalid CPUID levels should map to the highest
101          * available level instead.
102          */
103         if (cpu_exthigh != 0 && *eax >= 0x80000000) {
104                 if (*eax > cpu_exthigh)
105                         *eax = cpu_exthigh;
106         } else if (*eax >= 0x40000000) {
107                 if (*eax > CPUID_VM_HIGH)
108                         *eax = CPUID_VM_HIGH;
109         } else if (*eax > cpu_high) {
110                 *eax = cpu_high;
111         }
112
113         func = *eax;
114
115         /*
116          * In general the approach used for CPU topology is to
117          * advertise a flat topology where all CPUs are packages with
118          * no multi-core or SMT.
119          */
120         switch (func) {
121                 /*
122                  * Pass these through to the guest
123                  */
124                 case CPUID_0000_0000:
125                 case CPUID_0000_0002:
126                 case CPUID_0000_0003:
127                 case CPUID_8000_0000:
128                 case CPUID_8000_0002:
129                 case CPUID_8000_0003:
130                 case CPUID_8000_0004:
131                 case CPUID_8000_0006:
132                         cpuid_count(*eax, *ecx, regs);
133                         break;
134                 case CPUID_8000_0008:
135                         cpuid_count(*eax, *ecx, regs);
136                         if (vmm_is_amd()) {
137                                 /*
138                                  * XXX this might appear silly because AMD
139                                  * cpus don't have threads.
140                                  *
141                                  * However this matches the logical cpus as
142                                  * advertised by leaf 0x1 and will work even
143                                  * if the 'threads_per_core' tunable is set
144                                  * incorrectly on an AMD host.
145                                  */
146                                 logical_cpus = threads_per_core *
147                                     cores_per_package;
148                                 regs[2] = logical_cpus - 1;
149                         }
150                         break;
151
152                 case CPUID_8000_0001:
153                         cpuid_count(*eax, *ecx, regs);
154
155                         /*
156                          * Hide SVM and Topology Extension features from guest.
157                          */
158                         regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
159
160                         /*
161                          * Don't advertise extended performance counter MSRs
162                          * to the guest.
163                          */
164                         regs[2] &= ~AMDID2_PCXC;
165                         regs[2] &= ~AMDID2_PNXC;
166                         regs[2] &= ~AMDID2_PTSCEL2I;
167
168                         /*
169                          * Don't advertise Instruction Based Sampling feature.
170                          */
171                         regs[2] &= ~AMDID2_IBS;
172
173                         /* NodeID MSR not available */
174                         regs[2] &= ~AMDID2_NODE_ID;
175
176                         /* Don't advertise the OS visible workaround feature */
177                         regs[2] &= ~AMDID2_OSVW;
178
179                         /* Hide mwaitx/monitorx capability from the guest */
180                         regs[2] &= ~AMDID2_MWAITX;
181
182                         /*
183                          * Hide rdtscp/ia32_tsc_aux until we know how
184                          * to deal with them.
185                          */
186                         regs[3] &= ~AMDID_RDTSCP;
187                         break;
188
189                 case CPUID_8000_0007:
190                         /*
191                          * AMD uses this leaf to advertise the processor's
192                          * power monitoring and RAS capabilities. These
193                          * features are hardware-specific and exposing
194                          * them to a guest doesn't make a lot of sense.
195                          *
196                          * Intel uses this leaf only to advertise the
197                          * "Invariant TSC" feature with all other bits
198                          * being reserved (set to zero).
199                          */
200                         regs[0] = 0;
201                         regs[1] = 0;
202                         regs[2] = 0;
203                         regs[3] = 0;
204
205                         /*
206                          * "Invariant TSC" can be advertised to the guest if:
207                          * - host TSC frequency is invariant
208                          * - host TSCs are synchronized across physical cpus
209                          *
210                          * XXX This still falls short because the vcpu
211                          * can observe the TSC moving backwards as it
212                          * migrates across physical cpus. But at least
213                          * it should discourage the guest from using the
214                          * TSC to keep track of time.
215                          */
216                         if (tsc_is_invariant && smp_tsc)
217                                 regs[3] |= AMDPM_TSC_INVARIANT;
218                         break;
219
220                 case CPUID_0000_0001:
221                         do_cpuid(1, regs);
222
223                         error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
224                         if (error) {
225                                 panic("x86_emulate_cpuid: error %d "
226                                       "fetching x2apic state", error);
227                         }
228
229                         /*
230                          * Override the APIC ID only in ebx
231                          */
232                         regs[1] &= ~(CPUID_LOCAL_APIC_ID);
233                         regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
234
235                         /*
236                          * Don't expose VMX, SpeedStep, TME or SMX capability.
237                          * Advertise x2APIC capability and Hypervisor guest.
238                          */
239                         regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
240                         regs[2] &= ~(CPUID2_SMX);
241
242                         regs[2] |= CPUID2_HV;
243
244                         if (x2apic_state != X2APIC_DISABLED)
245                                 regs[2] |= CPUID2_X2APIC;
246                         else
247                                 regs[2] &= ~CPUID2_X2APIC;
248
249                         /*
250                          * Only advertise CPUID2_XSAVE in the guest if
251                          * the host is using XSAVE.
252                          */
253                         if (!(regs[2] & CPUID2_OSXSAVE))
254                                 regs[2] &= ~CPUID2_XSAVE;
255
256                         /*
257                          * If CPUID2_XSAVE is being advertised and the
258                          * guest has set CR4_XSAVE, set
259                          * CPUID2_OSXSAVE.
260                          */
261                         regs[2] &= ~CPUID2_OSXSAVE;
262                         if (regs[2] & CPUID2_XSAVE) {
263                                 error = vm_get_register(vm, vcpu_id,
264                                     VM_REG_GUEST_CR4, &cr4);
265                                 if (error)
266                                         panic("x86_emulate_cpuid: error %d "
267                                               "fetching %%cr4", error);
268                                 if (cr4 & CR4_XSAVE)
269                                         regs[2] |= CPUID2_OSXSAVE;
270                         }
271
272                         /*
273                          * Hide monitor/mwait until we know how to deal with
274                          * these instructions.
275                          */
276                         regs[2] &= ~CPUID2_MON;
277
278                         /*
279                          * Hide the performance and debug features.
280                          */
281                         regs[2] &= ~CPUID2_PDCM;
282
283                         /*
284                          * No TSC deadline support in the APIC yet
285                          */
286                         regs[2] &= ~CPUID2_TSCDLT;
287
288                         /*
289                          * Hide thermal monitoring
290                          */
291                         regs[3] &= ~(CPUID_ACPI | CPUID_TM);
292
293                         /*
294                          * Hide the debug store capability.
295                          */
296                         regs[3] &= ~CPUID_DS;
297
298                         /*
299                          * Advertise the Machine Check and MTRR capability.
300                          *
301                          * Some guest OSes (e.g. Windows) will not boot if
302                          * these features are absent.
303                          */
304                         regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
305
306                         logical_cpus = threads_per_core * cores_per_package;
307                         regs[1] &= ~CPUID_HTT_CORES;
308                         regs[1] |= (logical_cpus & 0xff) << 16;
309                         regs[3] |= CPUID_HTT;
310                         break;
311
312                 case CPUID_0000_0004:
313                         cpuid_count(*eax, *ecx, regs);
314
315                         if (regs[0] || regs[1] || regs[2] || regs[3]) {
316                                 regs[0] &= 0x3ff;
317                                 regs[0] |= (cores_per_package - 1) << 26;
318                                 /*
319                                  * Cache topology:
320                                  * - L1 and L2 are shared only by the logical
321                                  *   processors in a single core.
322                                  * - L3 and above are shared by all logical
323                                  *   processors in the package.
324                                  */
325                                 logical_cpus = threads_per_core;
326                                 level = (regs[0] >> 5) & 0x7;
327                                 if (level >= 3)
328                                         logical_cpus *= cores_per_package;
329                                 regs[0] |= (logical_cpus - 1) << 14;
330                         }
331                         break;
332
333                 case CPUID_0000_0007:
334                         regs[0] = 0;
335                         regs[1] = 0;
336                         regs[2] = 0;
337                         regs[3] = 0;
338
339                         /* leaf 0 */
340                         if (*ecx == 0) {
341                                 cpuid_count(*eax, *ecx, regs);
342
343                                 /* Only leaf 0 is supported */
344                                 regs[0] = 0;
345
346                                 /*
347                                  * Expose known-safe features.
348                                  */
349                                 regs[1] &= (CPUID_STDEXT_FSGSBASE |
350                                     CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
351                                     CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
352                                     CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
353                                     CPUID_STDEXT_AVX512F |
354                                     CPUID_STDEXT_AVX512PF |
355                                     CPUID_STDEXT_AVX512ER |
356                                     CPUID_STDEXT_AVX512CD);
357                                 regs[2] = 0;
358                                 regs[3] = 0;
359
360                                 /* Advertise INVPCID if it is enabled. */
361                                 error = vm_get_capability(vm, vcpu_id,
362                                     VM_CAP_ENABLE_INVPCID, &enable_invpcid);
363                                 if (error == 0 && enable_invpcid)
364                                         regs[1] |= CPUID_STDEXT_INVPCID;
365                         }
366                         break;
367
368                 case CPUID_0000_0006:
369                         regs[0] = CPUTPM1_ARAT;
370                         regs[1] = 0;
371                         regs[2] = 0;
372                         regs[3] = 0;
373                         break;
374
375                 case CPUID_0000_000A:
376                         /*
377                          * Handle the access, but report 0 for
378                          * all options
379                          */
380                         regs[0] = 0;
381                         regs[1] = 0;
382                         regs[2] = 0;
383                         regs[3] = 0;
384                         break;
385
386                 case CPUID_0000_000B:
387                         /*
388                          * Processor topology enumeration
389                          */
390                         if (*ecx == 0) {
391                                 logical_cpus = threads_per_core;
392                                 width = log2(logical_cpus);
393                                 level = CPUID_TYPE_SMT;
394                                 x2apic_id = vcpu_id;
395                         }
396
397                         if (*ecx == 1) {
398                                 logical_cpus = threads_per_core *
399                                     cores_per_package;
400                                 width = log2(logical_cpus);
401                                 level = CPUID_TYPE_CORE;
402                                 x2apic_id = vcpu_id;
403                         }
404
405                         if (!cpuid_leaf_b || *ecx >= 2) {
406                                 width = 0;
407                                 logical_cpus = 0;
408                                 level = 0;
409                                 x2apic_id = 0;
410                         }
411
412                         regs[0] = width & 0x1f;
413                         regs[1] = logical_cpus & 0xffff;
414                         regs[2] = (level << 8) | (*ecx & 0xff);
415                         regs[3] = x2apic_id;
416                         break;
417
418                 case CPUID_0000_000D:
419                         limits = vmm_get_xsave_limits();
420                         if (!limits->xsave_enabled) {
421                                 regs[0] = 0;
422                                 regs[1] = 0;
423                                 regs[2] = 0;
424                                 regs[3] = 0;
425                                 break;
426                         }
427
428                         cpuid_count(*eax, *ecx, regs);
429                         switch (*ecx) {
430                         case 0:
431                                 /*
432                                  * Only permit the guest to use bits
433                                  * that are active in the host in
434                                  * %xcr0.  Also, claim that the
435                                  * maximum save area size is
436                                  * equivalent to the host's current
437                                  * save area size.  Since this runs
438                                  * "inside" of vmrun(), it runs with
439                                  * the guest's xcr0, so the current
440                                  * save area size is correct as-is.
441                                  */
442                                 regs[0] &= limits->xcr0_allowed;
443                                 regs[2] = limits->xsave_max_size;
444                                 regs[3] &= (limits->xcr0_allowed >> 32);
445                                 break;
446                         case 1:
447                                 /* Only permit XSAVEOPT. */
448                                 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
449                                 regs[1] = 0;
450                                 regs[2] = 0;
451                                 regs[3] = 0;
452                                 break;
453                         default:
454                                 /*
455                                  * If the leaf is for a permitted feature,
456                                  * pass through as-is, otherwise return
457                                  * all zeroes.
458                                  */
459                                 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
460                                         regs[0] = 0;
461                                         regs[1] = 0;
462                                         regs[2] = 0;
463                                         regs[3] = 0;
464                                 }
465                                 break;
466                         }
467                         break;
468
469                 case 0x40000000:
470                         regs[0] = CPUID_VM_HIGH;
471                         bcopy(bhyve_id, &regs[1], 4);
472                         bcopy(bhyve_id + 4, &regs[2], 4);
473                         bcopy(bhyve_id + 8, &regs[3], 4);
474                         break;
475
476                 default:
477                         /*
478                          * The leaf value has already been clamped so
479                          * simply pass this through, keeping count of
480                          * how many unhandled leaf values have been seen.
481                          */
482                         atomic_add_long(&bhyve_xcpuids, 1);
483                         cpuid_count(*eax, *ecx, regs);
484                         break;
485         }
486
487         *eax = regs[0];
488         *ebx = regs[1];
489         *ecx = regs[2];
490         *edx = regs[3];
491
492         return (1);
493 }
494
495 bool
496 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
497 {
498         bool rv;
499
500         KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
501             __func__, cap));
502
503         /*
504          * Simply passthrough the capabilities of the host cpu for now.
505          */
506         rv = false;
507         switch (cap) {
508         case VCC_NO_EXECUTE:
509                 if (amd_feature & AMDID_NX)
510                         rv = true;
511                 break;
512         case VCC_FFXSR:
513                 if (amd_feature & AMDID_FFXSR)
514                         rv = true;
515                 break;
516         case VCC_TCE:
517                 if (amd_feature2 & AMDID2_TCE)
518                         rv = true;
519                 break;
520         default:
521                 panic("%s: unknown vm_cpu_capability %d", __func__, cap);
522         }
523         return (rv);
524 }