/* FreeBSD: sys/amd64/vmm/x86.c (MFH: r278968-r280640) */
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/pcpu.h>
34 #include <sys/systm.h>
35 #include <sys/cpuset.h>
36 #include <sys/sysctl.h>
37
38 #include <machine/clock.h>
39 #include <machine/cpufunc.h>
40 #include <machine/md_var.h>
41 #include <machine/segments.h>
42 #include <machine/specialreg.h>
43
44 #include <machine/vmm.h>
45
46 #include "vmm_host.h"
47 #include "vmm_ktr.h"
48 #include "vmm_util.h"
49 #include "x86.h"
50
SYSCTL_DECL(_hw_vmm);
/* Container node for the CPU topology tunables below (hw.vmm.topology.*). */
static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);

/* Highest hypervisor ("VM") CPUID leaf emulated; see leaf 0x40000000 below. */
#define CPUID_VM_HIGH           0x40000000

/* Hypervisor signature returned in ebx:ecx:edx of leaf 0x40000000. */
static const char bhyve_id[12] = "bhyve bhyve ";

/* Count of cpuid leaves that fell through to the default pass-through case. */
static uint64_t bhyve_xcpuids;
SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
    "Number of times an unknown cpuid leaf was accessed");

/*
 * The default CPU topology advertised to the guest is a single thread
 * per package (no multi-core, no SMT).  Both values are boot-time
 * tunables (CTLFLAG_RDTUN): read-only via sysctl, settable from the
 * loader.
 */
static u_int threads_per_core = 1;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
    &threads_per_core, 0, NULL);

static u_int cores_per_package = 1;
SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
    &cores_per_package, 0, NULL);

/* Non-zero to emulate CPUID leaf 0xB (extended topology enumeration). */
static int cpuid_leaf_b = 1;
SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
    &cpuid_leaf_b, 0, NULL);
76
/*
 * Round up to the next power of two, if necessary, and then take log2,
 * i.e. return ceil(log2(x)).
 * Returns -1 if argument is zero.
 *
 * The previous implementation, fls(x << (1 - powerof2(x))) - 1, shifted
 * the most significant bit out when x > 2^31 and x was not a power of
 * two, producing a wrong (too small) result.  The loop form below avoids
 * that overflow and has no BSD-only dependencies.
 */
static __inline int
log2(u_int x)
{
        int lg;
        u_int v;

        if (x == 0)
                return (-1);

        /* lg = floor(log2(x)) */
        lg = 0;
        for (v = x; v > 1; v >>= 1)
                lg++;

        /* Round up when x is not a power of two. */
        if ((x & (x - 1)) != 0)
                lg++;

        return (lg);
}
87
/*
 * Emulate the CPUID instruction for guest vcpu 'vcpu_id' of 'vm'.
 *
 * On entry *eax and *ecx hold the leaf and sub-leaf requested by the
 * guest; on return *eax/*ebx/*ecx/*edx hold the register values to be
 * reflected back to the guest.  Host cpuid results are filtered so that
 * the guest sees a flat CPU topology (per the hw.vmm.topology tunables)
 * and only features that vmm knows how to virtualize.
 *
 * Always returns 1; the caller treats the instruction as handled.
 */
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
                  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
        const struct xsave_limits *limits;
        uint64_t cr4;
        int error, enable_invpcid, level, width, x2apic_id;
        unsigned int func, regs[4], logical_cpus;
        enum x2apic_state x2apic_state;

        VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);

        /*
         * Requests for invalid CPUID levels should map to the highest
         * available level instead.  The three ranges are: extended
         * leaves (0x80000000+), hypervisor leaves (0x40000000+), and
         * standard leaves, each clamped to its own maximum.
         */
        if (cpu_exthigh != 0 && *eax >= 0x80000000) {
                if (*eax > cpu_exthigh)
                        *eax = cpu_exthigh;
        } else if (*eax >= 0x40000000) {
                if (*eax > CPUID_VM_HIGH)
                        *eax = CPUID_VM_HIGH;
        } else if (*eax > cpu_high) {
                *eax = cpu_high;
        }

        func = *eax;

        /*
         * In general the approach used for CPU topology is to
         * advertise a flat topology where all CPUs are packages with
         * no multi-core or SMT.
         */
        switch (func) {
                /*
                 * Pass these through to the guest
                 */
                case CPUID_0000_0000:
                case CPUID_0000_0002:
                case CPUID_0000_0003:
                case CPUID_8000_0000:
                case CPUID_8000_0002:
                case CPUID_8000_0003:
                case CPUID_8000_0004:
                case CPUID_8000_0006:
                        cpuid_count(*eax, *ecx, regs);
                        break;
                case CPUID_8000_0008:
                        /*
                         * Pass through, but on AMD override ecx[7:0]
                         * (core count minus one) to match the topology
                         * advertised elsewhere.
                         */
                        cpuid_count(*eax, *ecx, regs);
                        if (vmm_is_amd()) {
                                /*
                                 * XXX this might appear silly because AMD
                                 * cpus don't have threads.
                                 *
                                 * However this matches the logical cpus as
                                 * advertised by leaf 0x1 and will work even
                                 * if the 'threads_per_core' tunable is set
                                 * incorrectly on an AMD host.
                                 */
                                logical_cpus = threads_per_core *
                                    cores_per_package;
                                regs[2] = logical_cpus - 1;
                        }
                        break;

                case CPUID_8000_0001:
                        cpuid_count(*eax, *ecx, regs);

                        /*
                         * Hide SVM and Topology Extension features from guest.
                         */
                        regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);

                        /*
                         * Don't advertise extended performance counter MSRs
                         * to the guest.
                         */
                        regs[2] &= ~AMDID2_PCXC;
                        regs[2] &= ~AMDID2_PNXC;
                        regs[2] &= ~AMDID2_PTSCEL2I;

                        /*
                         * Don't advertise Instruction Based Sampling feature.
                         */
                        regs[2] &= ~AMDID2_IBS;

                        /* NodeID MSR not available */
                        regs[2] &= ~AMDID2_NODE_ID;

                        /* Don't advertise the OS visible workaround feature */
                        regs[2] &= ~AMDID2_OSVW;

                        /*
                         * Hide rdtscp/ia32_tsc_aux until we know how
                         * to deal with them.
                         */
                        regs[3] &= ~AMDID_RDTSCP;
                        break;

                case CPUID_8000_0007:
                        /*
                         * AMD uses this leaf to advertise the processor's
                         * power monitoring and RAS capabilities. These
                         * features are hardware-specific and exposing
                         * them to a guest doesn't make a lot of sense.
                         *
                         * Intel uses this leaf only to advertise the
                         * "Invariant TSC" feature with all other bits
                         * being reserved (set to zero).
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;

                        /*
                         * "Invariant TSC" can be advertised to the guest if:
                         * - host TSC frequency is invariant
                         * - host TSCs are synchronized across physical cpus
                         *
                         * XXX This still falls short because the vcpu
                         * can observe the TSC moving backwards as it
                         * migrates across physical cpus. But at least
                         * it should discourage the guest from using the
                         * TSC to keep track of time.
                         */
                        if (tsc_is_invariant && smp_tsc)
                                regs[3] |= AMDPM_TSC_INVARIANT;
                        break;

                case CPUID_0000_0001:
                        do_cpuid(1, regs);

                        error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
                        if (error) {
                                panic("x86_emulate_cpuid: error %d "
                                      "fetching x2apic state", error);
                        }

                        /*
                         * Override the APIC ID only in ebx
                         */
                        regs[1] &= ~(CPUID_LOCAL_APIC_ID);
                        regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

                        /*
                         * Don't expose VMX, SpeedStep or TME capability.
                         * Advertise x2APIC capability and Hypervisor guest.
                         */
                        regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

                        regs[2] |= CPUID2_HV;

                        if (x2apic_state != X2APIC_DISABLED)
                                regs[2] |= CPUID2_X2APIC;
                        else
                                regs[2] &= ~CPUID2_X2APIC;

                        /*
                         * Only advertise CPUID2_XSAVE in the guest if
                         * the host is using XSAVE.  (do_cpuid(1) above
                         * returned host values, so OSXSAVE here reflects
                         * the host's CR4.OSXSAVE setting.)
                         */
                        if (!(regs[2] & CPUID2_OSXSAVE))
                                regs[2] &= ~CPUID2_XSAVE;

                        /*
                         * If CPUID2_XSAVE is being advertised and the
                         * guest has set CR4_XSAVE, set
                         * CPUID2_OSXSAVE.
                         */
                        regs[2] &= ~CPUID2_OSXSAVE;
                        if (regs[2] & CPUID2_XSAVE) {
                                error = vm_get_register(vm, vcpu_id,
                                    VM_REG_GUEST_CR4, &cr4);
                                if (error)
                                        panic("x86_emulate_cpuid: error %d "
                                              "fetching %%cr4", error);
                                if (cr4 & CR4_XSAVE)
                                        regs[2] |= CPUID2_OSXSAVE;
                        }

                        /*
                         * Hide monitor/mwait until we know how to deal with
                         * these instructions.
                         */
                        regs[2] &= ~CPUID2_MON;

                        /*
                         * Hide the performance and debug features.
                         */
                        regs[2] &= ~CPUID2_PDCM;

                        /*
                         * No TSC deadline support in the APIC yet
                         */
                        regs[2] &= ~CPUID2_TSCDLT;

                        /*
                         * Hide thermal monitoring
                         */
                        regs[3] &= ~(CPUID_ACPI | CPUID_TM);

                        /*
                         * Machine check handling is done in the host.
                         * Hide MTRR capability.
                         */
                        regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

                        /*
                         * Hide the debug store capability.
                         */
                        regs[3] &= ~CPUID_DS;

                        /*
                         * Advertise the configured logical cpu count in
                         * ebx[23:16] and set HTT so the guest consults it.
                         */
                        logical_cpus = threads_per_core * cores_per_package;
                        regs[1] &= ~CPUID_HTT_CORES;
                        regs[1] |= (logical_cpus & 0xff) << 16;
                        regs[3] |= CPUID_HTT;
                        break;

                case CPUID_0000_0004:
                        cpuid_count(*eax, *ecx, regs);

                        /* Only rewrite valid (non-zero) cache descriptors. */
                        if (regs[0] || regs[1] || regs[2] || regs[3]) {
                                /* eax[31:26] = cores per package - 1 */
                                regs[0] &= 0x3ff;
                                regs[0] |= (cores_per_package - 1) << 26;
                                /*
                                 * Cache topology:
                                 * - L1 and L2 are shared only by the logical
                                 *   processors in a single core.
                                 * - L3 and above are shared by all logical
                                 *   processors in the package.
                                 */
                                logical_cpus = threads_per_core;
                                level = (regs[0] >> 5) & 0x7;
                                if (level >= 3)
                                        logical_cpus *= cores_per_package;
                                regs[0] |= (logical_cpus - 1) << 14;
                        }
                        break;

                case CPUID_0000_0007:
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;

                        /* leaf 0 */
                        if (*ecx == 0) {
                                cpuid_count(*eax, *ecx, regs);

                                /* Only leaf 0 is supported */
                                regs[0] = 0;

                                /*
                                 * Expose known-safe features.
                                 */
                                regs[1] &= (CPUID_STDEXT_FSGSBASE |
                                    CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
                                    CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
                                    CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
                                    CPUID_STDEXT_AVX512F |
                                    CPUID_STDEXT_AVX512PF |
                                    CPUID_STDEXT_AVX512ER |
                                    CPUID_STDEXT_AVX512CD);
                                regs[2] = 0;
                                regs[3] = 0;

                                /* Advertise INVPCID if it is enabled. */
                                error = vm_get_capability(vm, vcpu_id,
                                    VM_CAP_ENABLE_INVPCID, &enable_invpcid);
                                if (error == 0 && enable_invpcid)
                                        regs[1] |= CPUID_STDEXT_INVPCID;
                        }
                        break;

                case CPUID_0000_0006:
                        /*
                         * Advertise only "Always Running APIC Timer";
                         * hide all other power-management features.
                         */
                        regs[0] = CPUTPM1_ARAT;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_000A:
                        /*
                         * Handle the access, but report 0 for
                         * all options
                         */
                        regs[0] = 0;
                        regs[1] = 0;
                        regs[2] = 0;
                        regs[3] = 0;
                        break;

                case CPUID_0000_000B:
                        /*
                         * Processor topology enumeration.  Sub-leaf 0 is
                         * the SMT level, sub-leaf 1 the core level; all
                         * higher sub-leaves (or the leaf as a whole, if
                         * the cpuid_leaf_b tunable is off) report zeroes.
                         */
                        if (*ecx == 0) {
                                logical_cpus = threads_per_core;
                                width = log2(logical_cpus);
                                level = CPUID_TYPE_SMT;
                                x2apic_id = vcpu_id;
                        }

                        if (*ecx == 1) {
                                logical_cpus = threads_per_core *
                                    cores_per_package;
                                width = log2(logical_cpus);
                                level = CPUID_TYPE_CORE;
                                x2apic_id = vcpu_id;
                        }

                        if (!cpuid_leaf_b || *ecx >= 2) {
                                width = 0;
                                logical_cpus = 0;
                                level = 0;
                                x2apic_id = 0;
                        }

                        regs[0] = width & 0x1f;
                        regs[1] = logical_cpus & 0xffff;
                        regs[2] = (level << 8) | (*ecx & 0xff);
                        regs[3] = x2apic_id;
                        break;

                case CPUID_0000_000D:
                        /*
                         * Extended state (XSAVE) enumeration; hide it
                         * entirely unless the host has XSAVE enabled.
                         */
                        limits = vmm_get_xsave_limits();
                        if (!limits->xsave_enabled) {
                                regs[0] = 0;
                                regs[1] = 0;
                                regs[2] = 0;
                                regs[3] = 0;
                                break;
                        }

                        cpuid_count(*eax, *ecx, regs);
                        switch (*ecx) {
                        case 0:
                                /*
                                 * Only permit the guest to use bits
                                 * that are active in the host in
                                 * %xcr0.  Also, claim that the
                                 * maximum save area size is
                                 * equivalent to the host's current
                                 * save area size.  Since this runs
                                 * "inside" of vmrun(), it runs with
                                 * the guest's xcr0, so the current
                                 * save area size is correct as-is.
                                 */
                                regs[0] &= limits->xcr0_allowed;
                                regs[2] = limits->xsave_max_size;
                                regs[3] &= (limits->xcr0_allowed >> 32);
                                break;
                        case 1:
                                /* Only permit XSAVEOPT. */
                                regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
                                regs[1] = 0;
                                regs[2] = 0;
                                regs[3] = 0;
                                break;
                        default:
                                /*
                                 * If the leaf is for a permitted feature,
                                 * pass through as-is, otherwise return
                                 * all zeroes.
                                 */
                                if (!(limits->xcr0_allowed & (1ul << *ecx))) {
                                        regs[0] = 0;
                                        regs[1] = 0;
                                        regs[2] = 0;
                                        regs[3] = 0;
                                }
                                break;
                        }
                        break;

                case 0x40000000:
                        /*
                         * Hypervisor vendor leaf: maximum hypervisor
                         * leaf in eax, "bhyve bhyve " signature split
                         * across ebx:ecx:edx.
                         */
                        regs[0] = CPUID_VM_HIGH;
                        bcopy(bhyve_id, &regs[1], 4);
                        bcopy(bhyve_id + 4, &regs[2], 4);
                        bcopy(bhyve_id + 8, &regs[3], 4);
                        break;

                default:
                        /*
                         * The leaf value has already been clamped so
                         * simply pass this through, keeping count of
                         * how many unhandled leaf values have been seen.
                         */
                        atomic_add_long(&bhyve_xcpuids, 1);
                        cpuid_count(*eax, *ecx, regs);
                        break;
        }

        *eax = regs[0];
        *ebx = regs[1];
        *ecx = regs[2];
        *edx = regs[3];

        /* CPUID is always considered handled. */
        return (1);
}