1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  * Copyright (c) 2018 Joyent, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include "opt_bhyve_snapshot.h"
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/smp.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/pcpu.h>
43 #include <sys/proc.h>
44 #include <sys/reg.h>
45 #include <sys/smr.h>
46 #include <sys/sysctl.h>
47
48 #include <vm/vm.h>
49 #include <vm/pmap.h>
50
51 #include <machine/psl.h>
52 #include <machine/cpufunc.h>
53 #include <machine/md_var.h>
54 #include <machine/segments.h>
55 #include <machine/smp.h>
56 #include <machine/specialreg.h>
57 #include <machine/vmparam.h>
58
59 #include <machine/vmm.h>
60 #include <machine/vmm_dev.h>
61 #include <machine/vmm_instruction_emul.h>
62 #include <machine/vmm_snapshot.h>
63
64 #include "vmm_lapic.h"
65 #include "vmm_host.h"
66 #include "vmm_ioport.h"
67 #include "vmm_ktr.h"
68 #include "vmm_stat.h"
69 #include "vatpic.h"
70 #include "vlapic.h"
71 #include "vlapic_priv.h"
72
73 #include "ept.h"
74 #include "vmx_cpufunc.h"
75 #include "vmx.h"
76 #include "vmx_msr.h"
77 #include "x86.h"
78 #include "vmx_controls.h"
79
80 #define PINBASED_CTLS_ONE_SETTING                                       \
81         (PINBASED_EXTINT_EXITING        |                               \
82          PINBASED_NMI_EXITING           |                               \
83          PINBASED_VIRTUAL_NMI)
84 #define PINBASED_CTLS_ZERO_SETTING      0
85
86 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
87         (PROCBASED_INT_WINDOW_EXITING   |                               \
88          PROCBASED_NMI_WINDOW_EXITING)
89
90 #define PROCBASED_CTLS_ONE_SETTING                                      \
91         (PROCBASED_SECONDARY_CONTROLS   |                               \
92          PROCBASED_MWAIT_EXITING        |                               \
93          PROCBASED_MONITOR_EXITING      |                               \
94          PROCBASED_IO_EXITING           |                               \
95          PROCBASED_MSR_BITMAPS          |                               \
96          PROCBASED_CTLS_WINDOW_SETTING  |                               \
97          PROCBASED_CR8_LOAD_EXITING     |                               \
98          PROCBASED_CR8_STORE_EXITING)
99 #define PROCBASED_CTLS_ZERO_SETTING     \
100         (PROCBASED_CR3_LOAD_EXITING |   \
101         PROCBASED_CR3_STORE_EXITING |   \
102         PROCBASED_IO_BITMAPS)
103
104 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
105 #define PROCBASED_CTLS2_ZERO_SETTING    0
106
107 #define VM_EXIT_CTLS_ONE_SETTING                                        \
108         (VM_EXIT_SAVE_DEBUG_CONTROLS            |                       \
109         VM_EXIT_HOST_LMA                        |                       \
110         VM_EXIT_SAVE_EFER                       |                       \
111         VM_EXIT_LOAD_EFER                       |                       \
112         VM_EXIT_ACKNOWLEDGE_INTERRUPT)
113
114 #define VM_EXIT_CTLS_ZERO_SETTING       0
115
116 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
117         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
118         VM_ENTRY_LOAD_EFER)
119
120 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
121         (VM_ENTRY_INTO_SMM                      |                       \
122         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
123
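/*
 * Each of the ONE_SETTING/ZERO_SETTING pairs above names the control
 * bits that must read back as 1 (respectively 0) for bhyve to operate.
 * vmx_modinit() reconciles these requirements against the VMX
 * capability MSRs, along the lines of:
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
 *	    MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING,
 *	    PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
 *
 * and refuses to load the module if a required setting is unsupported.
 */
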
124 #define HANDLED         1
125 #define UNHANDLED       0
126
127 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
128 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
129
130 bool vmx_have_msr_tsc_aux;
131
132 SYSCTL_DECL(_hw_vmm);
133 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
134     NULL);
135
136 int vmxon_enabled[MAXCPU];
137 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
138
139 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
140 static uint32_t exit_ctls, entry_ctls;
141
142 static uint64_t cr0_ones_mask, cr0_zeros_mask;
143 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
144              &cr0_ones_mask, 0, NULL);
145 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
146              &cr0_zeros_mask, 0, NULL);
147
148 static uint64_t cr4_ones_mask, cr4_zeros_mask;
149 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
150              &cr4_ones_mask, 0, NULL);
151 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
152              &cr4_zeros_mask, 0, NULL);
153
154 static int vmx_initialized;
155 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
156            &vmx_initialized, 0, "Intel VMX initialized");
157
158 /*
159  * Optional capabilities
160  */
161 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap,
162     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
163     NULL);
164
165 static int cap_halt_exit;
166 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
167     "HLT triggers a VM-exit");
168
169 static int cap_pause_exit;
170 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
171     0, "PAUSE triggers a VM-exit");
172
173 static int cap_wbinvd_exit;
174 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, wbinvd_exit, CTLFLAG_RD, &cap_wbinvd_exit,
175     0, "WBINVD triggers a VM-exit");
176
177 static int cap_rdpid;
178 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0,
179     "Guests are allowed to use RDPID");
180
181 static int cap_rdtscp;
182 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0,
183     "Guests are allowed to use RDTSCP");
184
185 static int cap_unrestricted_guest;
186 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
187     &cap_unrestricted_guest, 0, "Unrestricted guests");
188
189 static int cap_monitor_trap;
190 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
191     &cap_monitor_trap, 0, "Monitor trap flag");
192
193 static int cap_invpcid;
194 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
195     0, "Guests are allowed to use INVPCID");
196
197 static int tpr_shadowing;
198 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
199     &tpr_shadowing, 0, "TPR shadowing support");
200
201 static int virtual_interrupt_delivery;
202 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
203     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
204
205 static int posted_interrupts;
206 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
207     &posted_interrupts, 0, "APICv posted interrupt support");
208
209 static int pirvec = -1;
210 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
211     &pirvec, 0, "APICv posted interrupt vector");
212
213 static struct unrhdr *vpid_unr;
214 static u_int vpid_alloc_failed;
215 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
216             &vpid_alloc_failed, 0, NULL);
217
218 int guest_l1d_flush;
219 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
220     &guest_l1d_flush, 0, NULL);
221 int guest_l1d_flush_sw;
222 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
223     &guest_l1d_flush_sw, 0, NULL);
224
225 static struct msr_entry msr_load_list[1] __aligned(16);
226
227 /*
228  * The definitions of SDT probes for VMX.
229  */
230
231 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
232     "struct vmx *", "int", "struct vm_exit *");
233
234 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
235     "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
236
237 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
238     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
239
240 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
241     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
242
243 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
244     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
245
246 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
247     "struct vmx *", "int", "struct vm_exit *");
248
249 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
250     "struct vmx *", "int", "struct vm_exit *");
251
252 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
253     "struct vmx *", "int", "struct vm_exit *");
254
255 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
256     "struct vmx *", "int", "struct vm_exit *");
257
258 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
259     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
260
261 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
262     "struct vmx *", "int", "struct vm_exit *");
263
264 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
265     "struct vmx *", "int", "struct vm_exit *");
266
267 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
268     "struct vmx *", "int", "struct vm_exit *");
269
270 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
271     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
272
273 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
274     "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
275
276 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
277     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
278
279 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
280     "struct vmx *", "int", "struct vm_exit *");
281
282 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
283     "struct vmx *", "int", "struct vm_exit *");
284
285 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
286     "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
287
288 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
289     "struct vmx *", "int", "struct vm_exit *");
290
291 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
292     "struct vmx *", "int", "struct vm_exit *");
293
294 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
295     "struct vmx *", "int", "struct vm_exit *");
296
297 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
298     "struct vmx *", "int", "struct vm_exit *");
299
300 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
301     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
302
303 SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
304     "struct vmx *", "int", "struct vm_exit *", "int");
305
306 /*
307  * Use the last page below 4GB as the APIC access address. This address is
308  * occupied by the boot firmware so it is guaranteed that it will not conflict
309  * with a page in system memory.
310  */
311 #define APIC_ACCESS_ADDRESS     0xFFFFF000
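/*
 * When virtual interrupt delivery is in use, vmx_init() maps the
 * guest's default APIC base page onto this address with vm_map_mmio()
 * and programs it into the VMCS APIC-access address field
 * (VMCS_APIC_ACCESS) for every vcpu.
 */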
312
313 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
314 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
315 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
316 static void vmx_inject_pir(struct vlapic *vlapic);
317 #ifdef BHYVE_SNAPSHOT
318 static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now);
319 #endif
320
321 static inline bool
322 host_has_rdpid(void)
323 {
324         return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0);
325 }
326
327 static inline bool
328 host_has_rdtscp(void)
329 {
330         return ((amd_feature & AMDID_RDTSCP) != 0);
331 }
332
333 #ifdef KTR
334 static const char *
335 exit_reason_to_str(int reason)
336 {
337         static char reasonbuf[32];
338
339         switch (reason) {
340         case EXIT_REASON_EXCEPTION:
341                 return "exception";
342         case EXIT_REASON_EXT_INTR:
343                 return "extint";
344         case EXIT_REASON_TRIPLE_FAULT:
345                 return "triplefault";
346         case EXIT_REASON_INIT:
347                 return "init";
348         case EXIT_REASON_SIPI:
349                 return "sipi";
350         case EXIT_REASON_IO_SMI:
351                 return "iosmi";
352         case EXIT_REASON_SMI:
353                 return "smi";
354         case EXIT_REASON_INTR_WINDOW:
355                 return "intrwindow";
356         case EXIT_REASON_NMI_WINDOW:
357                 return "nmiwindow";
358         case EXIT_REASON_TASK_SWITCH:
359                 return "taskswitch";
360         case EXIT_REASON_CPUID:
361                 return "cpuid";
362         case EXIT_REASON_GETSEC:
363                 return "getsec";
364         case EXIT_REASON_HLT:
365                 return "hlt";
366         case EXIT_REASON_INVD:
367                 return "invd";
368         case EXIT_REASON_INVLPG:
369                 return "invlpg";
370         case EXIT_REASON_RDPMC:
371                 return "rdpmc";
372         case EXIT_REASON_RDTSC:
373                 return "rdtsc";
374         case EXIT_REASON_RSM:
375                 return "rsm";
376         case EXIT_REASON_VMCALL:
377                 return "vmcall";
378         case EXIT_REASON_VMCLEAR:
379                 return "vmclear";
380         case EXIT_REASON_VMLAUNCH:
381                 return "vmlaunch";
382         case EXIT_REASON_VMPTRLD:
383                 return "vmptrld";
384         case EXIT_REASON_VMPTRST:
385                 return "vmptrst";
386         case EXIT_REASON_VMREAD:
387                 return "vmread";
388         case EXIT_REASON_VMRESUME:
389                 return "vmresume";
390         case EXIT_REASON_VMWRITE:
391                 return "vmwrite";
392         case EXIT_REASON_VMXOFF:
393                 return "vmxoff";
394         case EXIT_REASON_VMXON:
395                 return "vmxon";
396         case EXIT_REASON_CR_ACCESS:
397                 return "craccess";
398         case EXIT_REASON_DR_ACCESS:
399                 return "draccess";
400         case EXIT_REASON_INOUT:
401                 return "inout";
402         case EXIT_REASON_RDMSR:
403                 return "rdmsr";
404         case EXIT_REASON_WRMSR:
405                 return "wrmsr";
406         case EXIT_REASON_INVAL_VMCS:
407                 return "invalvmcs";
408         case EXIT_REASON_INVAL_MSR:
409                 return "invalmsr";
410         case EXIT_REASON_MWAIT:
411                 return "mwait";
412         case EXIT_REASON_MTF:
413                 return "mtf";
414         case EXIT_REASON_MONITOR:
415                 return "monitor";
416         case EXIT_REASON_PAUSE:
417                 return "pause";
418         case EXIT_REASON_MCE_DURING_ENTRY:
419                 return "mce-during-entry";
420         case EXIT_REASON_TPR:
421                 return "tpr";
422         case EXIT_REASON_APIC_ACCESS:
423                 return "apic-access";
424         case EXIT_REASON_GDTR_IDTR:
425                 return "gdtridtr";
426         case EXIT_REASON_LDTR_TR:
427                 return "ldtrtr";
428         case EXIT_REASON_EPT_FAULT:
429                 return "eptfault";
430         case EXIT_REASON_EPT_MISCONFIG:
431                 return "eptmisconfig";
432         case EXIT_REASON_INVEPT:
433                 return "invept";
434         case EXIT_REASON_RDTSCP:
435                 return "rdtscp";
436         case EXIT_REASON_VMX_PREEMPT:
437                 return "vmxpreempt";
438         case EXIT_REASON_INVVPID:
439                 return "invvpid";
440         case EXIT_REASON_WBINVD:
441                 return "wbinvd";
442         case EXIT_REASON_XSETBV:
443                 return "xsetbv";
444         case EXIT_REASON_APIC_WRITE:
445                 return "apic-write";
446         default:
447                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
448                 return (reasonbuf);
449         }
450 }
451 #endif  /* KTR */
452
453 static int
454 vmx_allow_x2apic_msrs(struct vmx *vmx)
455 {
456         int i, error;
457
458         error = 0;
459
460         /*
461          * Allow readonly access to the following x2APIC MSRs from the guest.
462          */
463         error += guest_msr_ro(vmx, MSR_APIC_ID);
464         error += guest_msr_ro(vmx, MSR_APIC_VERSION);
465         error += guest_msr_ro(vmx, MSR_APIC_LDR);
466         error += guest_msr_ro(vmx, MSR_APIC_SVR);
467
468         for (i = 0; i < 8; i++)
469                 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
470
471         for (i = 0; i < 8; i++)
472                 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
473
474         for (i = 0; i < 8; i++)
475                 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
476
477         error += guest_msr_ro(vmx, MSR_APIC_ESR);
478         error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
479         error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
480         error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
481         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
482         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
483         error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
484         error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
485         error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
486         error += guest_msr_ro(vmx, MSR_APIC_ICR);
487
488         /*
489          * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
490          *
491          * These registers get special treatment described in the section
492          * "Virtualizing MSR-Based APIC Accesses".
493          */
494         error += guest_msr_rw(vmx, MSR_APIC_TPR);
495         error += guest_msr_rw(vmx, MSR_APIC_EOI);
496         error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
497
498         return (error);
499 }
500
501 u_long
502 vmx_fix_cr0(u_long cr0)
503 {
504
505         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
506 }
507
508 u_long
509 vmx_fix_cr4(u_long cr4)
510 {
511
512         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
513 }
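
/*
 * The ones/zeros masks consumed above are computed in vmx_modinit()
 * from the MSR_VMX_CR0_FIXED{0,1} and MSR_VMX_CR4_FIXED{0,1} MSRs,
 * e.g. for CR0:
 *
 *	cr0_ones_mask  = fixed0 & fixed1;	// bits that must be 1
 *	cr0_zeros_mask = ~fixed0 & ~fixed1;	// bits that must be 0
 *
 * so vmx_fix_cr0() and vmx_fix_cr4() return values that satisfy the
 * host CPU's VMX fixed-bit requirements.
 */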
514
515 static void
516 vpid_free(int vpid)
517 {
518         if (vpid < 0 || vpid > 0xffff)
519                 panic("vpid_free: invalid vpid %d", vpid);
520
521         /*
522          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
523          * the unit number allocator.
524          */
525
526         if (vpid > VM_MAXCPU)
527                 free_unr(vpid_unr, vpid);
528 }
529
530 static void
531 vpid_alloc(uint16_t *vpid, int num)
532 {
533         int i, x;
534
535         if (num <= 0 || num > VM_MAXCPU)
536                 panic("invalid number of vpids requested: %d", num);
537
538         /*
539          * If the "enable vpid" execution control is not enabled then the
540          * VPID is required to be 0 for all vcpus.
541          */
542         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
543                 for (i = 0; i < num; i++)
544                         vpid[i] = 0;
545                 return;
546         }
547
548         /*
549          * Allocate a unique VPID for each vcpu from the unit number allocator.
550          */
551         for (i = 0; i < num; i++) {
552                 x = alloc_unr(vpid_unr);
553                 if (x == -1)
554                         break;
555                 else
556                         vpid[i] = x;
557         }
558
559         if (i < num) {
560                 atomic_add_int(&vpid_alloc_failed, 1);
561
562                 /*
563                  * If the unit number allocator does not have enough unique
564                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
565                  *
566                  * These VPIDs are not unique across VMs but this does not
567                  * affect correctness because the combined mappings are also
568                  * tagged with the EP4TA which is unique for each VM.
569                  *
570                  * It is still sub-optimal because the invvpid will invalidate
571                  * combined mappings for a particular VPID across all EP4TAs.
572                  */
573                 while (i-- > 0)
574                         vpid_free(vpid[i]);
575
576                 for (i = 0; i < num; i++)
577                         vpid[i] = i + 1;
578         }
579 }
580
581 static void
582 vpid_init(void)
583 {
584         /*
585          * VPID 0 is required when the "enable VPID" execution control is
586          * disabled.
587          *
588          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
589          * unit number allocator does not have sufficient unique VPIDs to
590          * satisfy the allocation.
591          *
592          * The remaining VPIDs are managed by the unit number allocator.
593          */
594         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
595 }
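
/*
 * Resulting VPID namespace:
 *
 *	0			vcpus of VMs run without "enable VPID"
 *	[1, VM_MAXCPU]		overflow namespace, possibly shared across VMs
 *	[VM_MAXCPU + 1, 0xffff]	unique VPIDs handed out by vpid_unr
 */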
596
597 static void
598 vmx_disable(void *arg __unused)
599 {
600         struct invvpid_desc invvpid_desc = { 0 };
601         struct invept_desc invept_desc = { 0 };
602
603         if (vmxon_enabled[curcpu]) {
604                 /*
605                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
606                  *
607                  * VMXON and VMXOFF are not required to invalidate any TLB
608                  * caching structures, so invalidate them explicitly here to
609                  * prevent retaining cached information across distinct VMX episodes.
610                  */
611                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
612                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
613                 vmxoff();
614         }
615         load_cr4(rcr4() & ~CR4_VMXE);
616 }
617
618 static int
619 vmx_modcleanup(void)
620 {
621
622         if (pirvec >= 0)
623                 lapic_ipi_free(pirvec);
624
625         if (vpid_unr != NULL) {
626                 delete_unrhdr(vpid_unr);
627                 vpid_unr = NULL;
628         }
629
630         if (nmi_flush_l1d_sw == 1)
631                 nmi_flush_l1d_sw = 0;
632
633         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
634
635         return (0);
636 }
637
638 static void
639 vmx_enable(void *arg __unused)
640 {
641         int error;
642         uint64_t feature_control;
643
644         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
645         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
646             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
647                 wrmsr(MSR_IA32_FEATURE_CONTROL,
648                     feature_control | IA32_FEATURE_CONTROL_VMX_EN |
649                     IA32_FEATURE_CONTROL_LOCK);
650         }
651
652         load_cr4(rcr4() | CR4_VMXE);
653
654         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
655         error = vmxon(vmxon_region[curcpu]);
656         if (error == 0)
657                 vmxon_enabled[curcpu] = 1;
658 }
659
660 static void
661 vmx_modresume(void)
662 {
663
664         if (vmxon_enabled[curcpu])
665                 vmxon(vmxon_region[curcpu]);
666 }
667
668 static int
669 vmx_modinit(int ipinum)
670 {
671         int error;
672         uint64_t basic, fixed0, fixed1, feature_control;
673         uint32_t tmp, procbased2_vid_bits;
674
675         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
676         if (!(cpu_feature2 & CPUID2_VMX)) {
677                 printf("vmx_modinit: processor does not support VMX "
678                     "operation\n");
679                 return (ENXIO);
680         }
681
682         /*
683          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
684          * are set (bits 0 and 2 respectively).
685          */
686         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
687         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
688             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
689                 printf("vmx_modinit: VMX operation disabled by BIOS\n");
690                 return (ENXIO);
691         }
692
693         /*
694          * Verify capabilities MSR_VMX_BASIC:
695          * - bit 54 indicates support for INS/OUTS decoding
696          */
697         basic = rdmsr(MSR_VMX_BASIC);
698         if ((basic & (1UL << 54)) == 0) {
699                 printf("vmx_modinit: processor does not support desired basic "
700                     "capabilities\n");
701                 return (EINVAL);
702         }
703
704         /* Check support for primary processor-based VM-execution controls */
705         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
706                                MSR_VMX_TRUE_PROCBASED_CTLS,
707                                PROCBASED_CTLS_ONE_SETTING,
708                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
709         if (error) {
710                 printf("vmx_modinit: processor does not support desired "
711                     "primary processor-based controls\n");
712                 return (error);
713         }
714
715         /* Clear the processor-based ctl bits that are set on demand */
716         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
717
718         /* Check support for secondary processor-based VM-execution controls */
719         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
720                                MSR_VMX_PROCBASED_CTLS2,
721                                PROCBASED_CTLS2_ONE_SETTING,
722                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
723         if (error) {
724                 printf("vmx_modinit: processor does not support desired "
725                     "secondary processor-based controls\n");
726                 return (error);
727         }
728
729         /* Check support for VPID */
730         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
731                                PROCBASED2_ENABLE_VPID, 0, &tmp);
732         if (error == 0)
733                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
734
735         /* Check support for pin-based VM-execution controls */
736         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
737                                MSR_VMX_TRUE_PINBASED_CTLS,
738                                PINBASED_CTLS_ONE_SETTING,
739                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
740         if (error) {
741                 printf("vmx_modinit: processor does not support desired "
742                     "pin-based controls\n");
743                 return (error);
744         }
745
746         /* Check support for VM-exit controls */
747         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
748                                VM_EXIT_CTLS_ONE_SETTING,
749                                VM_EXIT_CTLS_ZERO_SETTING,
750                                &exit_ctls);
751         if (error) {
752                 printf("vmx_modinit: processor does not support desired "
753                     "exit controls\n");
754                 return (error);
755         }
756
757         /* Check support for VM-entry controls */
758         error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
759             VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
760             &entry_ctls);
761         if (error) {
762                 printf("vmx_modinit: processor does not support desired "
763                     "entry controls\n");
764                 return (error);
765         }
766
767         /*
768          * Check support for optional features by testing them
769          * as individual bits
770          */
771         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
772                                         MSR_VMX_TRUE_PROCBASED_CTLS,
773                                         PROCBASED_HLT_EXITING, 0,
774                                         &tmp) == 0);
775
776         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
777                                         MSR_VMX_PROCBASED_CTLS,
778                                         PROCBASED_MTF, 0,
779                                         &tmp) == 0);
780
781         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
782                                          MSR_VMX_TRUE_PROCBASED_CTLS,
783                                          PROCBASED_PAUSE_EXITING, 0,
784                                          &tmp) == 0);
785
786         cap_wbinvd_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
787                                         MSR_VMX_PROCBASED_CTLS2,
788                                         PROCBASED2_WBINVD_EXITING,
789                                         0,
790                                         &tmp) == 0);
791
792         /*
793          * Check support for RDPID and/or RDTSCP.
794          *
795          * Support a pass-through-based implementation of these via the
796          * "enable RDTSCP" VM-execution control and the "RDTSC exiting"
797          * VM-execution control.
798          *
799          * The "enable RDTSCP" VM-execution control applies to both RDPID
800          * and RDTSCP (see SDM volume 3, section 25.3, "Changes to
801          * Instruction Behavior in VMX Non-root operation"); this is why
802          * only this VM-execution control needs to be enabled in order to
803          * enable passing through whichever of RDPID and/or RDTSCP are
804          * supported by the host.
805          *
806          * The "RDTSC exiting" VM-execution control applies to both RDTSC
807          * and RDTSCP (again, per SDM volume 3, section 25.3), and is
808          * already set up for RDTSC and RDTSCP pass-through by the current
809          * implementation of RDTSC.
810          *
811          * Although RDPID and RDTSCP are optional capabilities, since there
812          * does not currently seem to be a use case for enabling/disabling
813          * these via libvmmapi, choose not to support this and, instead,
814          * just statically always enable or always disable this support
815          * across all vCPUs on all VMs. (Note that there may be some
816          * complications to providing this functionality, e.g., the MSR
817          * bitmap is currently per-VM rather than per-vCPU while the
818          * capability API wants to be able to control capabilities on a
819          * per-vCPU basis).
820          */
821         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
822                                MSR_VMX_PROCBASED_CTLS2,
823                                PROCBASED2_ENABLE_RDTSCP, 0, &tmp);
824         cap_rdpid = error == 0 && host_has_rdpid();
825         cap_rdtscp = error == 0 && host_has_rdtscp();
826         if (cap_rdpid || cap_rdtscp) {
827                 procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP;
828                 vmx_have_msr_tsc_aux = true;
829         }
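
        /*
         * When either capability is present, vmx_init() additionally
         * gives the guest read-only access to MSR_TSC_AUX (see the
         * guest_msr_ro() call there), and vmx_have_msr_tsc_aux records
         * that the host provides the TSC_AUX MSR.
         */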
830
831         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
832                                         MSR_VMX_PROCBASED_CTLS2,
833                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
834                                         &tmp) == 0);
835
836         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
837             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
838             &tmp) == 0);
839
840         /*
841          * Check support for TPR shadow.
842          */
843         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
844             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
845             &tmp);
846         if (error == 0) {
847                 tpr_shadowing = 1;
848                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
849                     &tpr_shadowing);
850         }
851
852         if (tpr_shadowing) {
853                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
854                 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
855                 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
856         }
857
858         /*
859          * Check support for virtual interrupt delivery.
860          */
861         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
862             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
863             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
864             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
865
866         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
867             procbased2_vid_bits, 0, &tmp);
868         if (error == 0 && tpr_shadowing) {
869                 virtual_interrupt_delivery = 1;
870                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
871                     &virtual_interrupt_delivery);
872         }
873
874         if (virtual_interrupt_delivery) {
875                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
876                 procbased_ctls2 |= procbased2_vid_bits;
877                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
878
879                 /*
880                  * Check for Posted Interrupts only if Virtual Interrupt
881                  * Delivery is enabled.
882                  */
883                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
884                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
885                     &tmp);
886                 if (error == 0) {
887                         pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
888                             &IDTVEC(justreturn));
889                         if (pirvec < 0) {
890                                 if (bootverbose) {
891                                         printf("vmx_modinit: unable to "
892                                             "allocate posted interrupt "
893                                             "vector\n");
894                                 }
895                         } else {
896                                 posted_interrupts = 1;
897                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
898                                     &posted_interrupts);
899                         }
900                 }
901         }
902
903         if (posted_interrupts)
904                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
905
906         /* Initialize EPT */
907         error = ept_init(ipinum);
908         if (error) {
909                 printf("vmx_modinit: ept initialization failed (%d)\n", error);
910                 return (error);
911         }
912
913         guest_l1d_flush = (cpu_ia32_arch_caps &
914             IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
915         TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
916
917         /*
918          * If the L1D cache flush is enabled, use the IA32_FLUSH_CMD MSR
919          * when available.  Otherwise fall back to the software flush
920          * method which loads enough data from the kernel text to
921          * flush existing L1D content, both on VMX entry and on NMI
922          * return.
923          */
924         if (guest_l1d_flush) {
925                 if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
926                         guest_l1d_flush_sw = 1;
927                         TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
928                             &guest_l1d_flush_sw);
929                 }
930                 if (guest_l1d_flush_sw) {
931                         if (nmi_flush_l1d_sw <= 1)
932                                 nmi_flush_l1d_sw = 1;
933                 } else {
934                         msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
935                         msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
936                 }
937         }
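
        /*
         * Both variants are wired up elsewhere: vmx_init() points the
         * VM-entry MSR-load area at msr_load_list[] so the CPU loads
         * MSR_IA32_FLUSH_CMD on every entry, while the software variant
         * reuses the nmi_flush_l1d_sw machinery mentioned above.
         */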
938
939         /*
940          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
941          */
942         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
943         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
944         cr0_ones_mask = fixed0 & fixed1;
945         cr0_zeros_mask = ~fixed0 & ~fixed1;
946
947         /*
948          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
949          * if unrestricted guest execution is allowed.
950          */
951         if (cap_unrestricted_guest)
952                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
953
954         /*
955          * Do not allow the guest to set CR0_NW or CR0_CD.
956          */
957         cr0_zeros_mask |= (CR0_NW | CR0_CD);
958
959         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
960         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
961         cr4_ones_mask = fixed0 & fixed1;
962         cr4_zeros_mask = ~fixed0 & ~fixed1;
963
964         vpid_init();
965
966         vmx_msr_init();
967
968         /* enable VMX operation */
969         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
970
971         vmx_initialized = 1;
972
973         return (0);
974 }
975
976 static void
977 vmx_trigger_hostintr(int vector)
978 {
979         uintptr_t func;
980         struct gate_descriptor *gd;
981
982         gd = &idt[vector];
983
984         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
985             "invalid vector %d", vector));
986         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
987             vector));
988         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
989             "has invalid type %d", vector, gd->gd_type));
990         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
991             "has invalid dpl %d", vector, gd->gd_dpl));
992         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
993             "for vector %d has invalid selector %d", vector, gd->gd_selector));
994         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
995             "IST %d", vector, gd->gd_ist));
996
997         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
998         vmx_call_isr(func);
999 }
1000
1001 static int
1002 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
1003 {
1004         int error, mask_ident, shadow_ident;
1005         uint64_t mask_value;
1006
1007         if (which != 0 && which != 4)
1008                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
1009
1010         if (which == 0) {
1011                 mask_ident = VMCS_CR0_MASK;
1012                 mask_value = cr0_ones_mask | cr0_zeros_mask;
1013                 shadow_ident = VMCS_CR0_SHADOW;
1014         } else {
1015                 mask_ident = VMCS_CR4_MASK;
1016                 mask_value = cr4_ones_mask | cr4_zeros_mask;
1017                 shadow_ident = VMCS_CR4_SHADOW;
1018         }
1019
1020         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
1021         if (error)
1022                 return (error);
1023
1024         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
1025         if (error)
1026                 return (error);
1027
1028         return (0);
1029 }
1030 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
1031 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
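
/*
 * vmx_init() uses these helpers to initialize the CR0/CR4 read shadows
 * to the architectural power-on values (CR0 = 0x60000010, CR4 = 0);
 * guest reads of the bits covered by the mask observe the shadow value
 * rather than the real control register.
 */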
1032
1033 static void *
1034 vmx_init(struct vm *vm, pmap_t pmap)
1035 {
1036         int i, error;
1037         struct vmx *vmx;
1038         struct vmcs *vmcs;
1039         struct vmx_vcpu *vcpu;
1040         uint32_t exc_bitmap;
1041         uint16_t maxcpus = vm_get_maxcpus(vm);
1042         uint16_t vpid[maxcpus];
1043
1044         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
1045         vmx->vm = vm;
1046
1047         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));
1048
1049         /*
1050          * Clean up EPTP-tagged guest physical and combined mappings
1051          *
1052          * VMX transitions are not required to invalidate any guest physical
1053          * mappings. So, it may be possible for stale guest physical mappings
1054          * to be present in the processor TLBs.
1055          *
1056          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
1057          */
1058         ept_invalidate_mappings(vmx->eptp);
1059
1060         vmx->msr_bitmap = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX,
1061             M_WAITOK | M_ZERO);
1062         msr_bitmap_initialize(vmx->msr_bitmap);
1063
1064         /*
1065          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
1066          * The guest FSBASE and GSBASE are saved and restored during
1067          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
1068          * always restored from the vmcs host state area on vm-exit.
1069          *
1070          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
1071          * how they are saved/restored so can be directly accessed by the
1072          * guest.
1073          *
1074          * MSR_EFER is saved and restored in the guest VMCS area on a
1075          * VM exit and entry respectively. It is also restored from the
1076          * host VMCS area on a VM exit.
1077          *
1078          * The TSC MSR is exposed read-only. Writes are disallowed as
1079          * they would impact the host TSC.  If the guest does a write,
1080          * the "use TSC offsetting" execution control is enabled and the
1081          * difference between the host TSC and the guest TSC is written
1082          * into the TSC offset in the VMCS.
1083          *
1084          * Guest TSC_AUX support is enabled if any of guest RDPID and/or
1085          * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM
1086          * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are
1087          * supported). If guest TSC_AUX support is enabled, TSC_AUX is
1088          * exposed read-only so that the VMM can do one fewer MSR read per
1089          * exit than if this register were exposed read-write; the guest
1090          * restore value can be updated during guest writes (expected to be
1091          * rare) instead of during all exits (common).
1092          */
1093         if (guest_msr_rw(vmx, MSR_GSBASE) ||
1094             guest_msr_rw(vmx, MSR_FSBASE) ||
1095             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
1096             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
1097             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
1098             guest_msr_rw(vmx, MSR_EFER) ||
1099             guest_msr_ro(vmx, MSR_TSC) ||
1100             ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX)))
1101                 panic("vmx_init: error setting guest msr access");
1102
1103         vpid_alloc(vpid, maxcpus);
1104
1105         if (virtual_interrupt_delivery) {
1106                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
1107                     APIC_ACCESS_ADDRESS);
1108                 /* XXX this should really return an error to the caller */
1109                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
1110         }
1111
1112         for (i = 0; i < maxcpus; i++) {
1113                 vcpu = &vmx->vcpus[i];
1114
1115                 vcpu->vmcs = malloc_aligned(sizeof(*vmcs), PAGE_SIZE, M_VMX,
1116                     M_WAITOK | M_ZERO);
1117                 vcpu->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_VMX,
1118                     M_WAITOK | M_ZERO);
1119                 vcpu->pir_desc = malloc_aligned(sizeof(*vcpu->pir_desc), 64,
1120                     M_VMX, M_WAITOK | M_ZERO);
1121
1122                 vmcs = vcpu->vmcs;
1123                 vmcs->identifier = vmx_revision();
1124                 error = vmclear(vmcs);
1125                 if (error != 0) {
1126                         panic("vmx_init: vmclear error %d on vcpu %d\n",
1127                               error, i);
1128                 }
1129
1130                 vmx_msr_guest_init(vmx, i);
1131
1132                 error = vmcs_init(vmcs);
1133                 KASSERT(error == 0, ("vmcs_init error %d", error));
1134
1135                 VMPTRLD(vmcs);
1136                 error = 0;
1137                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vcpu->ctx);
1138                 error += vmwrite(VMCS_EPTP, vmx->eptp);
1139                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
1140                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
1141                 if (vcpu_trap_wbinvd(vm, i)) {
1142                         KASSERT(cap_wbinvd_exit, ("WBINVD trap not available"));
1143                         procbased_ctls2 |= PROCBASED2_WBINVD_EXITING;
1144                 }
1145                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
1146                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
1147                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
1148                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
1149                 error += vmwrite(VMCS_VPID, vpid[i]);
1150
1151                 if (guest_l1d_flush && !guest_l1d_flush_sw) {
1152                         vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
1153                             (vm_offset_t)&msr_load_list[0]));
1154                         vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
1155                             nitems(msr_load_list));
1156                         vmcs_write(VMCS_EXIT_MSR_STORE, 0);
1157                         vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
1158                 }
1159
1160                 /* exception bitmap */
1161                 if (vcpu_trace_exceptions(vm, i))
1162                         exc_bitmap = 0xffffffff;
1163                 else
1164                         exc_bitmap = 1 << IDT_MC;
1165                 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
1166
1167                 vcpu->ctx.guest_dr6 = DBREG_DR6_RESERVED1;
1168                 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
1169
1170                 if (tpr_shadowing) {
1171                         error += vmwrite(VMCS_VIRTUAL_APIC,
1172                             vtophys(vcpu->apic_page));
1173                 }
1174
1175                 if (virtual_interrupt_delivery) {
1176                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
1177                         error += vmwrite(VMCS_EOI_EXIT0, 0);
1178                         error += vmwrite(VMCS_EOI_EXIT1, 0);
1179                         error += vmwrite(VMCS_EOI_EXIT2, 0);
1180                         error += vmwrite(VMCS_EOI_EXIT3, 0);
1181                 }
1182                 if (posted_interrupts) {
1183                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
1184                         error += vmwrite(VMCS_PIR_DESC,
1185                             vtophys(vcpu->pir_desc));
1186                 }
1187                 VMCLEAR(vmcs);
1188                 KASSERT(error == 0, ("vmx_init: error customizing the vmcs"));
1189
1190                 vcpu->cap.set = 0;
1191                 vcpu->cap.set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0;
1192                 vcpu->cap.set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0;
1193                 vcpu->cap.proc_ctls = procbased_ctls;
1194                 vcpu->cap.proc_ctls2 = procbased_ctls2;
1195                 vcpu->cap.exc_bitmap = exc_bitmap;
1196
1197                 vcpu->state.nextrip = ~0;
1198                 vcpu->state.lastcpu = NOCPU;
1199                 vcpu->state.vpid = vpid[i];
1200
1201                 /*
1202                  * Set up the CR0/4 shadows, and init the read shadow
1203                  * to the power-on register value from the Intel Sys Arch.
1204                  *  CR0 - 0x60000010
1205                  *  CR4 - 0
1206                  */
1207                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
1208                 if (error != 0)
1209                         panic("vmx_setup_cr0_shadow %d", error);
1210
1211                 error = vmx_setup_cr4_shadow(vmcs, 0);
1212                 if (error != 0)
1213                         panic("vmx_setup_cr4_shadow %d", error);
1214
1215                 vcpu->ctx.pmap = pmap;
1216         }
1217
1218         return (vmx);
1219 }
1220
1221 static int
1222 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
1223 {
1224         int handled;
1225
1226         handled = x86_emulate_cpuid(vm, vcpu, (uint64_t *)&vmxctx->guest_rax,
1227             (uint64_t *)&vmxctx->guest_rbx, (uint64_t *)&vmxctx->guest_rcx,
1228             (uint64_t *)&vmxctx->guest_rdx);
1229         return (handled);
1230 }
1231
1232 static __inline void
1233 vmx_run_trace(struct vmx *vmx, int vcpu)
1234 {
1235 #ifdef KTR
1236         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
1237 #endif
1238 }
1239
1240 static __inline void
1241 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
1242                int handled)
1243 {
1244 #ifdef KTR
1245         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
1246                  handled ? "handled" : "unhandled",
1247                  exit_reason_to_str(exit_reason), rip);
1248 #endif
1249 }
1250
1251 static __inline void
1252 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
1253 {
1254 #ifdef KTR
1255         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1256 #endif
1257 }
1258
1259 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1260 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
1261
1262 /*
1263  * Invalidate guest mappings identified by its vpid from the TLB.
1264  */
1265 static __inline void
1266 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
1267 {
1268         struct vmxstate *vmxstate;
1269         struct invvpid_desc invvpid_desc;
1270
1271         vmxstate = &vmx->vcpus[vcpu].state;
1272         if (vmxstate->vpid == 0)
1273                 return;
1274
1275         if (!running) {
1276                 /*
1277                  * Set the 'lastcpu' to an invalid host cpu.
1278                  *
1279                  * This will invalidate TLB entries tagged with the vcpu's
1280                  * vpid the next time it runs via vmx_set_pcpu_defaults().
1281                  */
1282                 vmxstate->lastcpu = NOCPU;
1283                 return;
1284         }
1285
1286         KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
1287             "critical section", __func__, vcpu));
1288
1289         /*
1290          * Invalidate all mappings tagged with 'vpid'
1291          *
1292          * We do this because this vcpu was executing on a different host
1293          * cpu when it last ran. We do not track whether it invalidated
1294          * mappings associated with its 'vpid' during that run. So we must
1295          * assume that the mappings associated with 'vpid' on 'curcpu' are
1296          * stale and invalidate them.
1297          *
1298          * Note that we incur this penalty only when the scheduler chooses to
1299          * move the thread associated with this vcpu between host cpus.
1300          *
1301          * Note also that this will invalidate mappings tagged with 'vpid'
1302          * for "all" EP4TAs.
1303          */
1304         if (atomic_load_long(&pmap->pm_eptgen) == vmx->eptgen[curcpu]) {
1305                 invvpid_desc._res1 = 0;
1306                 invvpid_desc._res2 = 0;
1307                 invvpid_desc.vpid = vmxstate->vpid;
1308                 invvpid_desc.linear_addr = 0;
1309                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1310                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
1311         } else {
1312                 /*
1313                  * The invvpid can be skipped if an invept is going to
1314                  * be performed before entering the guest. The invept
1315                  * will invalidate combined mappings tagged with
1316                  * 'vmx->eptp' for all vpids.
1317                  */
1318                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1319         }
1320 }
1321
1322 static void
1323 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1324 {
1325         struct vmxstate *vmxstate;
1326
1327         vmxstate = &vmx->vcpus[vcpu].state;
1328         if (vmxstate->lastcpu == curcpu)
1329                 return;
1330
1331         vmxstate->lastcpu = curcpu;
1332
1333         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1334
1335         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1336         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1337         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1338         vmx_invvpid(vmx, vcpu, pmap, 1);
1339 }
1340
1341 /*
1342  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1343  */
1344 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1345
1346 static void __inline
1347 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1348 {
1349         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1350
1351         if ((vmx_vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1352                 vmx_vcpu->cap.proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1353                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx_vcpu->cap.proc_ctls);
1354                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1355         }
1356 }
1357
1358 static void __inline
1359 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1360 {
1361         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1362
1363         KASSERT((vmx_vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1364             ("intr_window_exiting not set: %#x", vmx_vcpu->cap.proc_ctls));
1365         vmx_vcpu->cap.proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1366         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx_vcpu->cap.proc_ctls);
1367         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1368 }
1369
1370 static void __inline
1371 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1372 {
1373         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1374
1375         if ((vmx_vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1376                 vmx_vcpu->cap.proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1377                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx_vcpu->cap.proc_ctls);
1378                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1379         }
1380 }
1381
1382 static void __inline
1383 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1384 {
1385         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1386
1387         KASSERT((vmx_vcpu->cap.proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1388             ("nmi_window_exiting not set %#x", vmx_vcpu->cap.proc_ctls));
1389         vmx_vcpu->cap.proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1390         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx_vcpu->cap.proc_ctls);
1391         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1392 }
1393
1394 int
1395 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
1396 {
1397         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1398         int error;
1399
1400         if ((vmx_vcpu->cap.proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
1401                 vmx_vcpu->cap.proc_ctls |= PROCBASED_TSC_OFFSET;
1402                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx_vcpu->cap.proc_ctls);
1403                 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
1404         }
1405
1406         error = vmwrite(VMCS_TSC_OFFSET, offset);
1407 #ifdef BHYVE_SNAPSHOT
1408         if (error == 0)
1409                 error = vm_set_tsc_offset(vmx->vm, vcpu, offset);
1410 #endif
1411         return (error);
1412 }
1413
1414 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1415                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1416 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1417                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1418
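     /*
      * Inject an NMI by writing the VM-entry interruption-information field.
      * Per the Intel SDM the field encodes the vector in bits 7:0, the event
      * type in bits 10:8, "deliver error code" in bit 11 and a valid bit in
      * bit 31; VMCS_INTR_T_NMI and VMCS_INTR_VALID below supply the type and
      * valid bits, with IDT_NMI (vector 2) as the vector.
      */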
1419 static void
1420 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1421 {
1422         uint32_t gi __diagused, info;
1423
1424         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1425         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1426             "interruptibility-state %#x", gi));
1427
1428         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1429         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1430             "VM-entry interruption information %#x", info));
1431
1432         /*
1433          * Inject the virtual NMI. The vector must be the NMI IDT entry
1434          * or the VMCS entry check will fail.
1435          */
1436         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1437         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1438
1439         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1440
1441         /* Clear the request */
1442         vm_nmi_clear(vmx->vm, vcpu);
1443 }
1444
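     /*
      * Attempt event injection in priority order on the way back into the
      * guest: first any exit-time event that must be re-injected (from
      * vm_entry_intinfo()), then a pending NMI, then a pending ExtINT or
      * local APIC vector.  Whenever an event cannot be injected immediately
      * the matching "window exiting" control is enabled so injection can be
      * retried as soon as the blocking condition clears.
      */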
1445 static void
1446 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
1447     uint64_t guestrip)
1448 {
1449         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
1450         int vector, need_nmi_exiting, extint_pending;
1451         uint64_t rflags, entryinfo;
1452         uint32_t gi, info;
1453
1454         if (vmx_vcpu->state.nextrip != guestrip) {
1455                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1456                 if (gi & HWINTR_BLOCKING) {
1457                         VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
1458                             "cleared due to rip change: %#lx/%#lx",
1459                             vmx_vcpu->state.nextrip, guestrip);
1460                         gi &= ~HWINTR_BLOCKING;
1461                         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1462                 }
1463         }
1464
1465         if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
1466                 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
1467                     "intinfo is not valid: %#lx", __func__, entryinfo));
1468
1469                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1470                 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1471                      "pending exception: %#lx/%#x", __func__, entryinfo, info));
1472
1473                 info = entryinfo;
1474                 vector = info & 0xff;
1475                 if (vector == IDT_BP || vector == IDT_OF) {
1476                         /*
1477                          * VT-x requires #BP and #OF to be injected as software
1478                          * exceptions.
1479                          */
1480                         info &= ~VMCS_INTR_T_MASK;
1481                         info |= VMCS_INTR_T_SWEXCEPTION;
1482                 }
1483
1484                 if (info & VMCS_INTR_DEL_ERRCODE)
1485                         vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
1486
1487                 vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1488         }
1489
1490         if (vm_nmi_pending(vmx->vm, vcpu)) {
1491                 /*
1492                  * If there are no conditions blocking NMI injection then
1493                  * inject it directly here; otherwise enable "NMI window
1494                  * exiting" to inject it as soon as we can.
1495                  *
1496                  * We also check for STI_BLOCKING because some implementations
1497                  * don't allow NMI injection in this case. If we are running
1498                  * on a processor that doesn't have this restriction it will
1499                  * immediately exit and the NMI will be injected in the
1500                  * "NMI window exiting" handler.
1501                  */
1502                 need_nmi_exiting = 1;
1503                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1504                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1505                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1506                         if ((info & VMCS_INTR_VALID) == 0) {
1507                                 vmx_inject_nmi(vmx, vcpu);
1508                                 need_nmi_exiting = 0;
1509                         } else {
1510                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1511                                     "due to VM-entry intr info %#x", info);
1512                         }
1513                 } else {
1514                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1515                             "Guest Interruptibility-state %#x", gi);
1516                 }
1517
1518                 if (need_nmi_exiting)
1519                         vmx_set_nmi_window_exiting(vmx, vcpu);
1520         }
1521
1522         extint_pending = vm_extint_pending(vmx->vm, vcpu);
1523
1524         if (!extint_pending && virtual_interrupt_delivery) {
1525                 vmx_inject_pir(vlapic);
1526                 return;
1527         }
1528
1529         /*
1530          * If interrupt-window exiting is already in effect then don't bother
1531          * checking for pending interrupts. This is just an optimization and
1532          * not needed for correctness.
1533          */
1534         if ((vmx_vcpu->cap.proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1535                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1536                     "pending int_window_exiting");
1537                 return;
1538         }
1539
1540         if (!extint_pending) {
1541                 /* Ask the local apic for a vector to inject */
1542                 if (!vlapic_pending_intr(vlapic, &vector))
1543                         return;
1544
1545                 /*
1546                  * From the Intel SDM, Volume 3, Section "Maskable
1547                  * Hardware Interrupts":
1548                  * - maskable interrupt vectors [16,255] can be delivered
1549                  *   through the local APIC.
1550                  */
1551                 KASSERT(vector >= 16 && vector <= 255,
1552                     ("invalid vector %d from local APIC", vector));
1553         } else {
1554                 /* Ask the legacy pic for a vector to inject */
1555                 vatpic_pending_intr(vmx->vm, &vector);
1556
1557                 /*
1558                  * From the Intel SDM, Volume 3, Section "Maskable
1559                  * Hardware Interrupts":
1560                  * - maskable interrupt vectors [0,255] can be delivered
1561                  *   through the INTR pin.
1562                  */
1563                 KASSERT(vector >= 0 && vector <= 255,
1564                     ("invalid vector %d from INTR", vector));
1565         }
1566
1567         /* Check RFLAGS.IF and the interruptibility state of the guest */
1568         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1569         if ((rflags & PSL_I) == 0) {
1570                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1571                     "rflags %#lx", vector, rflags);
1572                 goto cantinject;
1573         }
1574
1575         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1576         if (gi & HWINTR_BLOCKING) {
1577                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1578                     "Guest Interruptibility-state %#x", vector, gi);
1579                 goto cantinject;
1580         }
1581
1582         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1583         if (info & VMCS_INTR_VALID) {
1584                 /*
1585                  * This is expected and could happen for multiple reasons:
1586                  * - A vectoring VM-entry was aborted due to astpending
1587                  * - A VM-exit happened during event injection.
1588                  * - An exception was injected above.
1589                  * - An NMI was injected above or after "NMI window exiting"
1590                  */
1591                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1592                     "VM-entry intr info %#x", vector, info);
1593                 goto cantinject;
1594         }
1595
1596         /* Inject the interrupt */
1597         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1598         info |= vector;
1599         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1600
1601         if (!extint_pending) {
1602                 /* Update the Local APIC ISR */
1603                 vlapic_intr_accepted(vlapic, vector);
1604         } else {
1605                 vm_extint_clear(vmx->vm, vcpu);
1606                 vatpic_intr_accepted(vmx->vm, vector);
1607
1608                 /*
1609                  * After we accepted the current ExtINT the PIC may
1610                  * have posted another one.  If that is the case, set
1611                  * the Interrupt Window Exiting execution control so
1612                  * we can inject that one too.
1613                  *
1614                  * Also, interrupt window exiting allows us to inject any
1615                  * pending APIC vector that was preempted by the ExtINT
1616                  * as soon as possible. This applies to both the software
1617                  * emulated vlapic and the hardware assisted virtual APIC.
1618                  */
1619                 vmx_set_int_window_exiting(vmx, vcpu);
1620         }
1621
1622         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1623
1624         return;
1625
1626 cantinject:
1627         /*
1628          * Set the Interrupt Window Exiting execution control so we can inject
1629          * the interrupt as soon as the blocking condition goes away.
1630          */
1631         vmx_set_int_window_exiting(vmx, vcpu);
1632 }
1633
1634 /*
1635  * If the Virtual NMIs execution control is '1' then the logical processor
1636  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1637  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1638  * virtual-NMI blocking.
1639  *
1640  * This unblocking occurs even if the IRET causes a fault. In this case the
1641  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1642  */
1643 static void
1644 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1645 {
1646         uint32_t gi;
1647
1648         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1649         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1650         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1651         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1652 }
1653
1654 static void
1655 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1656 {
1657         uint32_t gi;
1658
1659         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1660         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1661         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1662         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1663 }
1664
1665 static void
1666 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
1667 {
1668         uint32_t gi __diagused;
1669
1670         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1671         KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
1672             ("NMI blocking is not in effect %#x", gi));
1673 }
1674
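     /*
      * Emulate XSETBV on behalf of the guest.  The requested xcr0 value is
      * assembled from the guest's %edx:%eax pair and validated against the
      * host's XSAVE limits and the architectural consistency rules before
      * being loaded.  For example, a guest writing %edx = 0, %eax = 0x7
      * requests x87 | SSE | AVX state, which is allowed only if those bits
      * are set in xcr0_allowed.
      */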
1675 static int
1676 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1677 {
1678         struct vmxctx *vmxctx;
1679         uint64_t xcrval;
1680         const struct xsave_limits *limits;
1681
1682         vmxctx = &vmx->vcpus[vcpu].ctx;
1683         limits = vmm_get_xsave_limits();
1684
1685         /*
1686          * Note that the processor raises a GP# fault on its own if
1687          * xsetbv is executed for CPL != 0, so we do not have to
1688          * emulate that fault here.
1689          */
1690
1691         /* Only xcr0 is supported. */
1692         if (vmxctx->guest_rcx != 0) {
1693                 vm_inject_gp(vmx->vm, vcpu);
1694                 return (HANDLED);
1695         }
1696
1697         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1698         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1699                 vm_inject_ud(vmx->vm, vcpu);
1700                 return (HANDLED);
1701         }
1702
1703         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1704         if ((xcrval & ~limits->xcr0_allowed) != 0) {
1705                 vm_inject_gp(vmx->vm, vcpu);
1706                 return (HANDLED);
1707         }
1708
1709         if (!(xcrval & XFEATURE_ENABLED_X87)) {
1710                 vm_inject_gp(vmx->vm, vcpu);
1711                 return (HANDLED);
1712         }
1713
1714         /* AVX (YMM_Hi128) requires SSE. */
1715         if (xcrval & XFEATURE_ENABLED_AVX &&
1716             (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1717                 vm_inject_gp(vmx->vm, vcpu);
1718                 return (HANDLED);
1719         }
1720
1721         /*
1722          * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1723          * ZMM_Hi256, and Hi16_ZMM.
1724          */
1725         if (xcrval & XFEATURE_AVX512 &&
1726             (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1727             (XFEATURE_AVX512 | XFEATURE_AVX)) {
1728                 vm_inject_gp(vmx->vm, vcpu);
1729                 return (HANDLED);
1730         }
1731
1732         /*
1733          * Intel MPX requires both bound register state flags to be
1734          * set.
1735          */
1736         if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1737             ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1738                 vm_inject_gp(vmx->vm, vcpu);
1739                 return (HANDLED);
1740         }
1741
1742         /*
1743          * This runs "inside" vmrun() with the guest's FPU state, so
1744          * modifying xcr0 directly modifies the guest's xcr0, not the
1745          * host's.
1746          */
1747         load_xcr(0, xcrval);
1748         return (HANDLED);
1749 }
1750
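     /*
      * Translate the 4-bit register encoding used in VM-exit qualifications
      * and instruction-information fields (0 = %rax ... 15 = %r15) into the
      * saved guest register.  %rsp (encoding 4) is special: it is kept in
      * the VMCS guest-state area rather than in the software-maintained
      * vmxctx.
      */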
1751 static uint64_t
1752 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
1753 {
1754         const struct vmxctx *vmxctx;
1755
1756         vmxctx = &vmx->vcpus[vcpu].ctx;
1757
1758         switch (ident) {
1759         case 0:
1760                 return (vmxctx->guest_rax);
1761         case 1:
1762                 return (vmxctx->guest_rcx);
1763         case 2:
1764                 return (vmxctx->guest_rdx);
1765         case 3:
1766                 return (vmxctx->guest_rbx);
1767         case 4:
1768                 return (vmcs_read(VMCS_GUEST_RSP));
1769         case 5:
1770                 return (vmxctx->guest_rbp);
1771         case 6:
1772                 return (vmxctx->guest_rsi);
1773         case 7:
1774                 return (vmxctx->guest_rdi);
1775         case 8:
1776                 return (vmxctx->guest_r8);
1777         case 9:
1778                 return (vmxctx->guest_r9);
1779         case 10:
1780                 return (vmxctx->guest_r10);
1781         case 11:
1782                 return (vmxctx->guest_r11);
1783         case 12:
1784                 return (vmxctx->guest_r12);
1785         case 13:
1786                 return (vmxctx->guest_r13);
1787         case 14:
1788                 return (vmxctx->guest_r14);
1789         case 15:
1790                 return (vmxctx->guest_r15);
1791         default:
1792                 panic("invalid vmx register %d", ident);
1793         }
1794 }
1795
1796 static void
1797 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
1798 {
1799         struct vmxctx *vmxctx;
1800
1801         vmxctx = &vmx->vcpus[vcpu].ctx;
1802
1803         switch (ident) {
1804         case 0:
1805                 vmxctx->guest_rax = regval;
1806                 break;
1807         case 1:
1808                 vmxctx->guest_rcx = regval;
1809                 break;
1810         case 2:
1811                 vmxctx->guest_rdx = regval;
1812                 break;
1813         case 3:
1814                 vmxctx->guest_rbx = regval;
1815                 break;
1816         case 4:
1817                 vmcs_write(VMCS_GUEST_RSP, regval);
1818                 break;
1819         case 5:
1820                 vmxctx->guest_rbp = regval;
1821                 break;
1822         case 6:
1823                 vmxctx->guest_rsi = regval;
1824                 break;
1825         case 7:
1826                 vmxctx->guest_rdi = regval;
1827                 break;
1828         case 8:
1829                 vmxctx->guest_r8 = regval;
1830                 break;
1831         case 9:
1832                 vmxctx->guest_r9 = regval;
1833                 break;
1834         case 10:
1835                 vmxctx->guest_r10 = regval;
1836                 break;
1837         case 11:
1838                 vmxctx->guest_r11 = regval;
1839                 break;
1840         case 12:
1841                 vmxctx->guest_r12 = regval;
1842                 break;
1843         case 13:
1844                 vmxctx->guest_r13 = regval;
1845                 break;
1846         case 14:
1847                 vmxctx->guest_r14 = regval;
1848                 break;
1849         case 15:
1850                 vmxctx->guest_r15 = regval;
1851                 break;
1852         default:
1853                 panic("invalid vmx register %d", ident);
1854         }
1855 }
1856
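     /*
      * Exit qualification layout for control-register accesses (Intel SDM):
      * bits 3:0 give the control register, bits 5:4 the access type
      * (0 = MOV to CR, 1 = MOV from CR) and bits 11:8 the GPR involved.
      * For CR0/CR4 the value written by the guest goes into the read shadow,
      * while the real guest register is forced to honor the ones/zeros
      * masks:
      *
      *      CR0 = (regval | cr0_ones_mask) & ~cr0_zeros_mask
      */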
1857 static int
1858 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1859 {
1860         uint64_t crval, regval;
1861
1862         /* We only handle mov to %cr0 at this time */
1863         if ((exitqual & 0xf0) != 0x00)
1864                 return (UNHANDLED);
1865
1866         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1867
1868         vmcs_write(VMCS_CR0_SHADOW, regval);
1869
1870         crval = regval | cr0_ones_mask;
1871         crval &= ~cr0_zeros_mask;
1872         vmcs_write(VMCS_GUEST_CR0, crval);
1873
1874         if (regval & CR0_PG) {
1875                 uint64_t efer, entry_ctls;
1876
1877                 /*
1878                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1879                  * the "IA-32e mode guest" bit in VM-entry control must be
1880                  * the "IA-32e mode guest" VM-entry control must be
1881                  */
1882                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1883                 if (efer & EFER_LME) {
1884                         efer |= EFER_LMA;
1885                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1886                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1887                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1888                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1889                 }
1890         }
1891
1892         return (HANDLED);
1893 }
1894
1895 static int
1896 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1897 {
1898         uint64_t crval, regval;
1899
1900         /* We only handle mov to %cr4 at this time */
1901         if ((exitqual & 0xf0) != 0x00)
1902                 return (UNHANDLED);
1903
1904         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1905
1906         vmcs_write(VMCS_CR4_SHADOW, regval);
1907
1908         crval = regval | cr4_ones_mask;
1909         crval &= ~cr4_zeros_mask;
1910         vmcs_write(VMCS_GUEST_CR4, crval);
1911
1912         return (HANDLED);
1913 }
1914
1915 static int
1916 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1917 {
1918         struct vlapic *vlapic;
1919         uint64_t cr8;
1920         int regnum;
1921
1922         /* We only handle mov %cr8 to/from a register at this time. */
1923         if ((exitqual & 0xe0) != 0x00) {
1924                 return (UNHANDLED);
1925         }
1926
1927         vlapic = vm_lapic(vmx->vm, vcpu);
1928         regnum = (exitqual >> 8) & 0xf;
1929         if (exitqual & 0x10) {
1930                 cr8 = vlapic_get_cr8(vlapic);
1931                 vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
1932         } else {
1933                 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
1934                 vlapic_set_cr8(vlapic, cr8);
1935         }
1936
1937         return (HANDLED);
1938 }
1939
1940 /*
1941  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1942  */
1943 static int
1944 vmx_cpl(void)
1945 {
1946         uint32_t ssar;
1947
1948         ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
1949         return ((ssar >> 5) & 0x3);
1950 }
1951
1952 static enum vm_cpu_mode
1953 vmx_cpu_mode(void)
1954 {
1955         uint32_t csar;
1956
1957         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
1958                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
1959                 if (csar & 0x2000)
1960                         return (CPU_MODE_64BIT);        /* CS.L = 1 */
1961                 else
1962                         return (CPU_MODE_COMPATIBILITY);
1963         } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
1964                 return (CPU_MODE_PROTECTED);
1965         } else {
1966                 return (CPU_MODE_REAL);
1967         }
1968 }
1969
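     /*
      * Derive the guest paging mode from CR0.PG, CR4.PAE, EFER.LME and
      * CR4.LA57: paging disabled => flat, non-PAE => 32-bit, PAE without
      * long mode => PAE, and long mode => 4-level or 5-level (LA57) paging.
      */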
1970 static enum vm_paging_mode
1971 vmx_paging_mode(void)
1972 {
1973         uint64_t cr4;
1974
1975         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1976                 return (PAGING_MODE_FLAT);
1977         cr4 = vmcs_read(VMCS_GUEST_CR4);
1978         if (!(cr4 & CR4_PAE))
1979                 return (PAGING_MODE_32);
1980         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) {
1981                 if (!(cr4 & CR4_LA57))
1982                         return (PAGING_MODE_64);
1983                 return (PAGING_MODE_64_LA57);
1984         } else
1985                 return (PAGING_MODE_PAE);
1986 }
1987
1988 static uint64_t
1989 inout_str_index(struct vmx *vmx, int vcpuid, int in)
1990 {
1991         uint64_t val;
1992         int error __diagused;
1993         enum vm_reg_name reg;
1994
1995         reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
1996         error = vmx_getreg(vmx, vcpuid, reg, &val);
1997         KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
1998         return (val);
1999 }
2000
2001 static uint64_t
2002 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
2003 {
2004         uint64_t val;
2005         int error __diagused;
2006
2007         if (rep) {
2008                 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
2009                 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
2010         } else {
2011                 val = 1;
2012         }
2013         return (val);
2014 }
2015
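     /*
      * For INS/OUTS exits the VM-exit instruction-information field encodes
      * the address size in bits 9:7 (0/1/2 => 16/32/64-bit, hence the
      * ">> 7" below) and the segment register in bits 17:15, which is
      * consumed by inout_str_seginfo() for the OUTS case.
      */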
2016 static int
2017 inout_str_addrsize(uint32_t inst_info)
2018 {
2019         uint32_t size;
2020
2021         size = (inst_info >> 7) & 0x7;
2022         switch (size) {
2023         case 0:
2024                 return (2);     /* 16 bit */
2025         case 1:
2026                 return (4);     /* 32 bit */
2027         case 2:
2028                 return (8);     /* 64 bit */
2029         default:
2030                 panic("%s: invalid size encoding %d", __func__, size);
2031         }
2032 }
2033
2034 static void
2035 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
2036     struct vm_inout_str *vis)
2037 {
2038         int error __diagused, s;
2039
2040         if (in) {
2041                 vis->seg_name = VM_REG_GUEST_ES;
2042         } else {
2043                 s = (inst_info >> 15) & 0x7;
2044                 vis->seg_name = vm_segment_name(s);
2045         }
2046
2047         error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
2048         KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
2049 }
2050
2051 static void
2052 vmx_paging_info(struct vm_guest_paging *paging)
2053 {
2054         paging->cr3 = vmcs_guest_cr3();
2055         paging->cpl = vmx_cpl();
2056         paging->cpu_mode = vmx_cpu_mode();
2057         paging->paging_mode = vmx_paging_mode();
2058 }
2059
2060 static void
2061 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
2062 {
2063         struct vm_guest_paging *paging;
2064         uint32_t csar;
2065
2066         paging = &vmexit->u.inst_emul.paging;
2067
2068         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
2069         vmexit->inst_length = 0;
2070         vmexit->u.inst_emul.gpa = gpa;
2071         vmexit->u.inst_emul.gla = gla;
2072         vmx_paging_info(paging);
2073         switch (paging->cpu_mode) {
2074         case CPU_MODE_REAL:
2075                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2076                 vmexit->u.inst_emul.cs_d = 0;
2077                 break;
2078         case CPU_MODE_PROTECTED:
2079         case CPU_MODE_COMPATIBILITY:
2080                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
2081                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
2082                 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
2083                 break;
2084         default:
2085                 vmexit->u.inst_emul.cs_base = 0;
2086                 vmexit->u.inst_emul.cs_d = 0;
2087                 break;
2088         }
2089         vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
2090 }
2091
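     /*
      * The EPT-violation exit qualification carries separate bits for data
      * read, data write and instruction fetch, plus flags indicating whether
      * a guest-linear address was involved in the access.  ept_fault_type()
      * maps the access bits to a VM_PROT_* value, and ept_emulation_fault()
      * uses the linear-address flags to tell MMIO accesses that need
      * instruction emulation apart from ordinary nested page faults.
      */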
2092 static int
2093 ept_fault_type(uint64_t ept_qual)
2094 {
2095         int fault_type;
2096
2097         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
2098                 fault_type = VM_PROT_WRITE;
2099         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
2100                 fault_type = VM_PROT_EXECUTE;
2101         else
2102                 fault_type = VM_PROT_READ;
2103
2104         return (fault_type);
2105 }
2106
2107 static bool
2108 ept_emulation_fault(uint64_t ept_qual)
2109 {
2110         int read, write;
2111
2112         /* EPT fault on an instruction fetch doesn't make sense here */
2113         if (ept_qual & EPT_VIOLATION_INST_FETCH)
2114                 return (false);
2115
2116         /* EPT fault must be a read fault or a write fault */
2117         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
2118         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
2119         if ((read | write) == 0)
2120                 return (false);
2121
2122         /*
2123          * The EPT violation must have been caused by accessing a
2124          * guest-physical address that is a translation of a guest-linear
2125          * address.
2126          */
2127         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
2128             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
2129                 return (false);
2130         }
2131
2132         return (true);
2133 }
2134
2135 static __inline int
2136 apic_access_virtualization(struct vmx *vmx, int vcpuid)
2137 {
2138         uint32_t proc_ctls2;
2139
2140         proc_ctls2 = vmx->vcpus[vcpuid].cap.proc_ctls2;
2141         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
2142 }
2143
2144 static __inline int
2145 x2apic_virtualization(struct vmx *vmx, int vcpuid)
2146 {
2147         uint32_t proc_ctls2;
2148
2149         proc_ctls2 = vmx->vcpus[vcpuid].cap.proc_ctls2;
2150         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
2151 }
2152
2153 static int
2154 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
2155     uint64_t qual)
2156 {
2157         int error, handled, offset;
2158         uint32_t *apic_regs, vector;
2159         bool retu;
2160
2161         handled = HANDLED;
2162         offset = APIC_WRITE_OFFSET(qual);
2163
2164         if (!apic_access_virtualization(vmx, vcpuid)) {
2165                 /*
2166                  * In general there should not be any APIC write VM-exits
2167                  * unless APIC-access virtualization is enabled.
2168                  *
2169                  * However, self-IPI virtualization can legitimately trigger
2170                  * an APIC-write VM-exit so treat it specially.
2171                  */
2172                 if (x2apic_virtualization(vmx, vcpuid) &&
2173                     offset == APIC_OFFSET_SELF_IPI) {
2174                         apic_regs = (uint32_t *)(vlapic->apic_page);
2175                         vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
2176                         vlapic_self_ipi_handler(vlapic, vector);
2177                         return (HANDLED);
2178                 } else
2179                         return (UNHANDLED);
2180         }
2181
2182         switch (offset) {
2183         case APIC_OFFSET_ID:
2184                 vlapic_id_write_handler(vlapic);
2185                 break;
2186         case APIC_OFFSET_LDR:
2187                 vlapic_ldr_write_handler(vlapic);
2188                 break;
2189         case APIC_OFFSET_DFR:
2190                 vlapic_dfr_write_handler(vlapic);
2191                 break;
2192         case APIC_OFFSET_SVR:
2193                 vlapic_svr_write_handler(vlapic);
2194                 break;
2195         case APIC_OFFSET_ESR:
2196                 vlapic_esr_write_handler(vlapic);
2197                 break;
2198         case APIC_OFFSET_ICR_LOW:
2199                 retu = false;
2200                 error = vlapic_icrlo_write_handler(vlapic, &retu);
2201                 if (error != 0 || retu)
2202                         handled = UNHANDLED;
2203                 break;
2204         case APIC_OFFSET_CMCI_LVT:
2205         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
2206                 vlapic_lvt_write_handler(vlapic, offset);
2207                 break;
2208         case APIC_OFFSET_TIMER_ICR:
2209                 vlapic_icrtmr_write_handler(vlapic);
2210                 break;
2211         case APIC_OFFSET_TIMER_DCR:
2212                 vlapic_dcr_write_handler(vlapic);
2213                 break;
2214         default:
2215                 handled = UNHANDLED;
2216                 break;
2217         }
2218         return (handled);
2219 }
2220
2221 static bool
2222 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
2223 {
2224
2225         if (apic_access_virtualization(vmx, vcpuid) &&
2226             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
2227                 return (true);
2228         else
2229                 return (false);
2230 }
2231
2232 static int
2233 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2234 {
2235         uint64_t qual;
2236         int access_type, offset, allowed;
2237
2238         if (!apic_access_virtualization(vmx, vcpuid))
2239                 return (UNHANDLED);
2240
2241         qual = vmexit->u.vmx.exit_qualification;
2242         access_type = APIC_ACCESS_TYPE(qual);
2243         offset = APIC_ACCESS_OFFSET(qual);
2244
2245         allowed = 0;
2246         if (access_type == 0) {
2247                 /*
2248                  * Read data access to the following registers is expected.
2249                  */
2250                 switch (offset) {
2251                 case APIC_OFFSET_APR:
2252                 case APIC_OFFSET_PPR:
2253                 case APIC_OFFSET_RRR:
2254                 case APIC_OFFSET_CMCI_LVT:
2255                 case APIC_OFFSET_TIMER_CCR:
2256                         allowed = 1;
2257                         break;
2258                 default:
2259                         break;
2260                 }
2261         } else if (access_type == 1) {
2262                 /*
2263                  * Write data access to the following registers is expected.
2264                  */
2265                 switch (offset) {
2266                 case APIC_OFFSET_VER:
2267                 case APIC_OFFSET_APR:
2268                 case APIC_OFFSET_PPR:
2269                 case APIC_OFFSET_RRR:
2270                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
2271                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
2272                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
2273                 case APIC_OFFSET_CMCI_LVT:
2274                 case APIC_OFFSET_TIMER_CCR:
2275                         allowed = 1;
2276                         break;
2277                 default:
2278                         break;
2279                 }
2280         }
2281
2282         if (allowed) {
2283                 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
2284                     VIE_INVALID_GLA);
2285         }
2286
2287         /*
2288          * Regardless of whether the APIC-access is allowed this handler
2289          * always returns UNHANDLED:
2290          * - if the access is allowed then it is handled by emulating the
2291          *   instruction that caused the VM-exit (outside the critical section)
2292          * - if the access is not allowed then it will be converted to an
2293          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
2294          */
2295         return (UNHANDLED);
2296 }
2297
2298 static enum task_switch_reason
2299 vmx_task_switch_reason(uint64_t qual)
2300 {
2301         int reason;
2302
2303         reason = (qual >> 30) & 0x3;
2304         switch (reason) {
2305         case 0:
2306                 return (TSR_CALL);
2307         case 1:
2308                 return (TSR_IRET);
2309         case 2:
2310                 return (TSR_JMP);
2311         case 3:
2312                 return (TSR_IDT_GATE);
2313         default:
2314                 panic("%s: invalid reason %d", __func__, reason);
2315         }
2316 }
2317
2318 static int
2319 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
2320 {
2321         int error;
2322
2323         if (lapic_msr(num))
2324                 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
2325         else
2326                 error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
2327
2328         return (error);
2329 }
2330
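     /*
      * RDMSR returns its result to the guest in %edx:%eax, so the 64-bit MSR
      * value is split into 32-bit halves and written back into the guest
      * register context on success.
      */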
2331 static int
2332 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
2333 {
2334         struct vmxctx *vmxctx;
2335         uint64_t result;
2336         uint32_t eax, edx;
2337         int error;
2338
2339         if (lapic_msr(num))
2340                 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
2341         else
2342                 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
2343
2344         if (error == 0) {
2345                 eax = result;
2346                 vmxctx = &vmx->vcpus[vcpuid].ctx;
2347                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
2348                 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
2349
2350                 edx = result >> 32;
2351                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
2352                 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
2353         }
2354
2355         return (error);
2356 }
2357
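     /*
      * Main VM-exit dispatcher.  A return value of 1 means the exit was
      * fully handled in the kernel and the vcpu can be resumed immediately;
      * otherwise 'vmexit' is filled in with a specific exit code for further
      * handling by the VM layer or userland.
      */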
2358 static int
2359 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2360 {
2361         int error, errcode, errcode_valid, handled, in;
2362         struct vmx_vcpu *vmx_vcpu;
2363         struct vmxctx *vmxctx;
2364         struct vlapic *vlapic;
2365         struct vm_inout_str *vis;
2366         struct vm_task_switch *ts;
2367         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
2368         uint32_t intr_type, intr_vec, reason;
2369         uint64_t exitintinfo, qual, gpa;
2370         bool retu;
2371
2372         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
2373         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
2374
2375         handled = UNHANDLED;
2376         vmx_vcpu = &vmx->vcpus[vcpu];
2377         vmxctx = &vmx_vcpu->ctx;
2378
2379         qual = vmexit->u.vmx.exit_qualification;
2380         reason = vmexit->u.vmx.exit_reason;
2381         vmexit->exitcode = VM_EXITCODE_BOGUS;
2382
2383         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
2384         SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit);
2385
2386         /*
2387          * VM-entry failures during or after loading guest state.
2388          *
2389          * These VM-exits are uncommon but must be handled specially
2390          * as most VM-exit fields are not populated as usual.
2391          */
2392         if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
2393                 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
2394                 __asm __volatile("int $18");
2395                 return (1);
2396         }
2397
2398         /*
2399          * VM exits that can be triggered during event delivery need to
2400          * be handled specially by re-injecting the event if the IDT
2401          * vectoring information field's valid bit is set.
2402          *
2403          * See "Information for VM Exits During Event Delivery" in Intel SDM
2404          * for details.
2405          */
2406         idtvec_info = vmcs_idt_vectoring_info();
2407         if (idtvec_info & VMCS_IDT_VEC_VALID) {
2408                 idtvec_info &= ~(1 << 12); /* clear undefined bit */
2409                 exitintinfo = idtvec_info;
2410                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2411                         idtvec_err = vmcs_idt_vectoring_err();
2412                         exitintinfo |= (uint64_t)idtvec_err << 32;
2413                 }
2414                 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
2415                 KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
2416                     __func__, error));
2417
2418                 /*
2419                  * If 'virtual NMIs' are being used and the VM-exit
2420                  * happened while injecting an NMI during the previous
2421                  * VM-entry, then clear "blocking by NMI" in the
2422                  * Guest Interruptibility-State so the NMI can be
2423                  * reinjected on the subsequent VM-entry.
2424                  *
2425                  * However, if the NMI was being delivered through a task
2426                  * gate, then the new task must start execution with NMIs
2427                  * blocked so don't clear NMI blocking in this case.
2428                  */
2429                 intr_type = idtvec_info & VMCS_INTR_T_MASK;
2430                 if (intr_type == VMCS_INTR_T_NMI) {
2431                         if (reason != EXIT_REASON_TASK_SWITCH)
2432                                 vmx_clear_nmi_blocking(vmx, vcpu);
2433                         else
2434                                 vmx_assert_nmi_blocking(vmx, vcpu);
2435                 }
2436
2437                 /*
2438                  * Update VM-entry instruction length if the event being
2439                  * delivered was a software interrupt or software exception.
2440                  */
2441                 if (intr_type == VMCS_INTR_T_SWINTR ||
2442                     intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
2443                     intr_type == VMCS_INTR_T_SWEXCEPTION) {
2444                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2445                 }
2446         }
2447
2448         switch (reason) {
2449         case EXIT_REASON_TASK_SWITCH:
2450                 ts = &vmexit->u.task_switch;
2451                 ts->tsssel = qual & 0xffff;
2452                 ts->reason = vmx_task_switch_reason(qual);
2453                 ts->ext = 0;
2454                 ts->errcode_valid = 0;
2455                 vmx_paging_info(&ts->paging);
2456                 /*
2457                  * If the task switch was due to a CALL, JMP, IRET, software
2458                  * interrupt (INT n) or software exception (INT3, INTO),
2459                  * then the saved %rip references the instruction that caused
2460                  * the task switch. The instruction length field in the VMCS
2461                  * is valid in this case.
2462                  *
2463                  * In all other cases (e.g., NMI, hardware exception) the
2464                  * saved %rip is one that would have been saved in the old TSS
2465                  * had the task switch completed normally so the instruction
2466                  * length field is not needed in this case and is explicitly
2467                  * set to 0.
2468                  */
2469                 if (ts->reason == TSR_IDT_GATE) {
2470                         KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
2471                             ("invalid idtvec_info %#x for IDT task switch",
2472                             idtvec_info));
2473                         intr_type = idtvec_info & VMCS_INTR_T_MASK;
2474                         if (intr_type != VMCS_INTR_T_SWINTR &&
2475                             intr_type != VMCS_INTR_T_SWEXCEPTION &&
2476                             intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
2477                                 /* Task switch triggered by external event */
2478                                 ts->ext = 1;
2479                                 vmexit->inst_length = 0;
2480                                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2481                                         ts->errcode_valid = 1;
2482                                         ts->errcode = vmcs_idt_vectoring_err();
2483                                 }
2484                         }
2485                 }
2486                 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
2487                 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts);
2488                 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
2489                     "%s errcode 0x%016lx", ts->reason, ts->tsssel,
2490                     ts->ext ? "external" : "internal",
2491                     ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
2492                 break;
2493         case EXIT_REASON_CR_ACCESS:
2494                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
2495                 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual);
2496                 switch (qual & 0xf) {
2497                 case 0:
2498                         handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
2499                         break;
2500                 case 4:
2501                         handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
2502                         break;
2503                 case 8:
2504                         handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
2505                         break;
2506                 }
2507                 break;
2508         case EXIT_REASON_RDMSR:
2509                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
2510                 retu = false;
2511                 ecx = vmxctx->guest_rcx;
2512                 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
2513                 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx);
2514                 error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
2515                 if (error) {
2516                         vmexit->exitcode = VM_EXITCODE_RDMSR;
2517                         vmexit->u.msr.code = ecx;
2518                 } else if (!retu) {
2519                         handled = HANDLED;
2520                 } else {
2521                         /* Return to userspace with a valid exitcode */
2522                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2523                             ("emulate_rdmsr retu with bogus exitcode"));
2524                 }
2525                 break;
2526         case EXIT_REASON_WRMSR:
2527                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
2528                 retu = false;
2529                 eax = vmxctx->guest_rax;
2530                 ecx = vmxctx->guest_rcx;
2531                 edx = vmxctx->guest_rdx;
2532                 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
2533                     ecx, (uint64_t)edx << 32 | eax);
2534                 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx,
2535                     (uint64_t)edx << 32 | eax);
2536                 error = emulate_wrmsr(vmx, vcpu, ecx,
2537                     (uint64_t)edx << 32 | eax, &retu);
2538                 if (error) {
2539                         vmexit->exitcode = VM_EXITCODE_WRMSR;
2540                         vmexit->u.msr.code = ecx;
2541                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
2542                 } else if (!retu) {
2543                         handled = HANDLED;
2544                 } else {
2545                         /* Return to userspace with a valid exitcode */
2546                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2547                             ("emulate_wrmsr retu with bogus exitcode"));
2548                 }
2549                 break;
2550         case EXIT_REASON_HLT:
2551                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
2552                 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
2553                 vmexit->exitcode = VM_EXITCODE_HLT;
2554                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2555                 if (virtual_interrupt_delivery)
2556                         vmexit->u.hlt.intr_status =
2557                             vmcs_read(VMCS_GUEST_INTR_STATUS);
2558                 else
2559                         vmexit->u.hlt.intr_status = 0;
2560                 break;
2561         case EXIT_REASON_MTF:
2562                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
2563                 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit);
2564                 vmexit->exitcode = VM_EXITCODE_MTRAP;
2565                 vmexit->inst_length = 0;
2566                 break;
2567         case EXIT_REASON_PAUSE:
2568                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
2569                 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit);
2570                 vmexit->exitcode = VM_EXITCODE_PAUSE;
2571                 break;
2572         case EXIT_REASON_INTR_WINDOW:
2573                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
2574                 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit);
2575                 vmx_clear_int_window_exiting(vmx, vcpu);
2576                 return (1);
2577         case EXIT_REASON_EXT_INTR:
2578                 /*
2579                  * External interrupts serve only to cause VM exits and allow
2580                  * the host interrupt handler to run.
2581                  *
2582                  * If this external interrupt triggers a virtual interrupt
2583                  * to a VM, then that state will be recorded by the
2584                  * host interrupt handler in the VM's softc. We will inject
2585                  * this virtual interrupt during the subsequent VM enter.
2586                  */
2587                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2588                 SDT_PROBE4(vmm, vmx, exit, interrupt,
2589                     vmx, vcpu, vmexit, intr_info);
2590
2591                 /*
2592                  * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
2593                  * This appears to be a bug in VMware Fusion?
2594                  * This may be a bug in VMware Fusion.
2595                 if (!(intr_info & VMCS_INTR_VALID))
2596                         return (1);
2597                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
2598                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
2599                     ("VM exit interruption info invalid: %#x", intr_info));
2600                 vmx_trigger_hostintr(intr_info & 0xff);
2601
2602                 /*
2603                  * This is special. We want to treat this as a 'handled'
2604                  * VM-exit but not increment the instruction pointer.
2605                  */
2606                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
2607                 return (1);
2608         case EXIT_REASON_NMI_WINDOW:
2609                 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit);
2610                 /* Exit to allow the pending virtual NMI to be injected */
2611                 if (vm_nmi_pending(vmx->vm, vcpu))
2612                         vmx_inject_nmi(vmx, vcpu);
2613                 vmx_clear_nmi_window_exiting(vmx, vcpu);
2614                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
2615                 return (1);
2616         case EXIT_REASON_INOUT:
2617                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
2618                 vmexit->exitcode = VM_EXITCODE_INOUT;
2619                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
2620                 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
2621                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
2622                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
2623                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
2624                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
2625                 if (vmexit->u.inout.string) {
2626                         inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
2627                         vmexit->exitcode = VM_EXITCODE_INOUT_STR;
2628                         vis = &vmexit->u.inout_str;
2629                         vmx_paging_info(&vis->paging);
2630                         vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2631                         vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
2632                         vis->index = inout_str_index(vmx, vcpu, in);
2633                         vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
2634                         vis->addrsize = inout_str_addrsize(inst_info);
2635                         inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
2636                 }
2637                 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit);
2638                 break;
2639         case EXIT_REASON_CPUID:
2640                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
2641                 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit);
2642                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
2643                 break;
2644         case EXIT_REASON_EXCEPTION:
2645                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
2646                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2647                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2648                     ("VM exit interruption info invalid: %#x", intr_info));
2649
2650                 intr_vec = intr_info & 0xff;
2651                 intr_type = intr_info & VMCS_INTR_T_MASK;
2652
2653                 /*
2654                  * If Virtual NMIs control is 1 and the VM-exit is due to a
2655                  * fault encountered during the execution of IRET then we must
2656                  * restore the state of "virtual-NMI blocking" before resuming
2657                  * the guest.
2658                  *
2659                  * See "Resuming Guest Software after Handling an Exception".
2660                  * See "Information for VM Exits Due to Vectored Events".
2661                  */
2662                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2663                     (intr_vec != IDT_DF) &&
2664                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
2665                         vmx_restore_nmi_blocking(vmx, vcpu);
2666
2667                 /*
2668                  * The NMI has already been handled in vmx_exit_handle_nmi().
2669                  */
2670                 if (intr_type == VMCS_INTR_T_NMI)
2671                         return (1);
2672
2673                 /*
2674                  * Call the machine check handler by hand. Also don't reflect
2675                  * the machine check back into the guest.
2676                  */
2677                 if (intr_vec == IDT_MC) {
2678                         VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
2679                         __asm __volatile("int $18");
2680                         return (1);
2681                 }
2682
2683                 /*
2684                  * If the hypervisor has requested user exits for
2685                  * debug exceptions, bounce them out to userland.
2686                  */
2687                 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP &&
2688                     (vmx_vcpu->cap.set & (1 << VM_CAP_BPT_EXIT))) {
2689                         vmexit->exitcode = VM_EXITCODE_BPT;
2690                         vmexit->u.bpt.inst_length = vmexit->inst_length;
2691                         vmexit->inst_length = 0;
2692                         break;
2693                 }
2694
2695                 if (intr_vec == IDT_PF) {
2696                         error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
2697                         KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
2698                             __func__, error));
2699                 }
2700
2701                 /*
2702                  * Software exceptions exhibit trap-like behavior. This in
2703                  * turn requires populating the VM-entry instruction length
2704                  * so that the %rip in the trap frame is past the INT3/INTO
2705                  * instruction.
2706                  */
2707                 if (intr_type == VMCS_INTR_T_SWEXCEPTION)
2708                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2709
2710                 /* Reflect all other exceptions back into the guest */
2711                 errcode_valid = errcode = 0;
2712                 if (intr_info & VMCS_INTR_DEL_ERRCODE) {
2713                         errcode_valid = 1;
2714                         errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
2715                 }
2716                 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
2717                     "the guest", intr_vec, errcode);
2718                 SDT_PROBE5(vmm, vmx, exit, exception,
2719                     vmx, vcpu, vmexit, intr_vec, errcode);
2720                 error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
2721                     errcode_valid, errcode, 0);
2722                 KASSERT(error == 0, ("%s: vm_inject_exception error %d",
2723                     __func__, error));
2724                 return (1);
2725
2726         case EXIT_REASON_EPT_FAULT:
2727                 /*
2728                  * memory then this must be a nested page fault; otherwise
2729                  * memory then this must be a nested page fault otherwise
2730                  * this must be an instruction that accesses MMIO space.
2731                  */
2732                 gpa = vmcs_gpa();
2733                 if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
2734                     apic_access_fault(vmx, vcpu, gpa)) {
2735                         vmexit->exitcode = VM_EXITCODE_PAGING;
2736                         vmexit->inst_length = 0;
2737                         vmexit->u.paging.gpa = gpa;
2738                         vmexit->u.paging.fault_type = ept_fault_type(qual);
2739                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
2740                         SDT_PROBE5(vmm, vmx, exit, nestedfault,
2741                             vmx, vcpu, vmexit, gpa, qual);
2742                 } else if (ept_emulation_fault(qual)) {
2743                         vmexit_inst_emul(vmexit, gpa, vmcs_gla());
2744                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
2745                         SDT_PROBE4(vmm, vmx, exit, mmiofault,
2746                             vmx, vcpu, vmexit, gpa);
2747                 }
2748                 /*
2749                  * If Virtual NMIs control is 1 and the VM-exit is due to an
2750                  * EPT fault during the execution of IRET then we must restore
2751                  * the state of "virtual-NMI blocking" before resuming.
2752                  *
2753                  * See description of "NMI unblocking due to IRET" in
2754                  * "Exit Qualification for EPT Violations".
2755                  */
2756                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2757                     (qual & EXIT_QUAL_NMIUDTI) != 0)
2758                         vmx_restore_nmi_blocking(vmx, vcpu);
2759                 break;
2760         case EXIT_REASON_VIRTUALIZED_EOI:
2761                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2762                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2763                 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit);
2764                 vmexit->inst_length = 0;        /* trap-like */
2765                 break;
2766         case EXIT_REASON_APIC_ACCESS:
2767                 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit);
2768                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
2769                 break;
2770         case EXIT_REASON_APIC_WRITE:
2771                 /*
2772                  * APIC-write VM exit is trap-like so the %rip is already
2773                  * pointing to the next instruction.
2774                  */
2775                 vmexit->inst_length = 0;
2776                 vlapic = vm_lapic(vmx->vm, vcpu);
2777                 SDT_PROBE4(vmm, vmx, exit, apicwrite,
2778                     vmx, vcpu, vmexit, vlapic);
2779                 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
2780                 break;
2781         case EXIT_REASON_XSETBV:
2782                 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit);
2783                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2784                 break;
2785         case EXIT_REASON_MONITOR:
2786                 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit);
2787                 vmexit->exitcode = VM_EXITCODE_MONITOR;
2788                 break;
2789         case EXIT_REASON_MWAIT:
2790                 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit);
2791                 vmexit->exitcode = VM_EXITCODE_MWAIT;
2792                 break;
2793         case EXIT_REASON_TPR:
2794                 vlapic = vm_lapic(vmx->vm, vcpu);
2795                 vlapic_sync_tpr(vlapic);
2796                 vmexit->inst_length = 0;
2797                 handled = HANDLED;
2798                 break;
2799         case EXIT_REASON_VMCALL:
2800         case EXIT_REASON_VMCLEAR:
2801         case EXIT_REASON_VMLAUNCH:
2802         case EXIT_REASON_VMPTRLD:
2803         case EXIT_REASON_VMPTRST:
2804         case EXIT_REASON_VMREAD:
2805         case EXIT_REASON_VMRESUME:
2806         case EXIT_REASON_VMWRITE:
2807         case EXIT_REASON_VMXOFF:
2808         case EXIT_REASON_VMXON:
2809                 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit);
2810                 vmexit->exitcode = VM_EXITCODE_VMINSN;
2811                 break;
2812         case EXIT_REASON_INVD:
2813         case EXIT_REASON_WBINVD:
2814                 /* ignore exit */
2815                 handled = HANDLED;
2816                 break;
2817         default:
2818                 SDT_PROBE4(vmm, vmx, exit, unknown,
2819                     vmx, vcpu, vmexit, reason);
2820                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2821                 break;
2822         }
2823
2824         if (handled) {
2825                 /*
2826                  * It is possible that control is returned to userland
2827                  * even though we were able to handle the VM exit in the
2828                  * kernel.
2829                  *
2830                  * In such a case we want to make sure that the userland
2831                  * restarts guest execution at the instruction *after*
2832                  * the one we just processed. Therefore we update the
2833                  * guest rip in the VMCS and in 'vmexit'.
2834                  */
2835                 vmexit->rip += vmexit->inst_length;
2836                 vmexit->inst_length = 0;
2837                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2838         } else {
2839                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2840                         /*
2841                          * If this VM exit was not claimed by anybody then
2842                          * treat it as a generic VMX exit.
2843                          */
2844                         vmexit->exitcode = VM_EXITCODE_VMX;
2845                         vmexit->u.vmx.status = VM_SUCCESS;
2846                         vmexit->u.vmx.inst_type = 0;
2847                         vmexit->u.vmx.inst_error = 0;
2848                 } else {
2849                         /*
2850                          * The exitcode and collateral have been populated.
2851                          * The VM exit will be processed further in userland.
2852                          */
2853                 }
2854         }
2855
2856         SDT_PROBE4(vmm, vmx, exit, return,
2857             vmx, vcpu, vmexit, handled);
2858         return (handled);
2859 }
2860
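/*
 * A failed VMLAUNCH or VMRESUME never entered the guest, so there is no
 * architectural exit reason to report.  Convert the failure into a
 * VM_EXITCODE_VMX exit that carries the VM-instruction error number read
 * from the VMCS so that userland can report why the entry failed.
 */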
2861 static __inline void
2862 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2863 {
2864
2865         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2866             ("vmx_exit_inst_error: invalid inst_fail_status %d",
2867             vmxctx->inst_fail_status));
2868
2869         vmexit->inst_length = 0;
2870         vmexit->exitcode = VM_EXITCODE_VMX;
2871         vmexit->u.vmx.status = vmxctx->inst_fail_status;
2872         vmexit->u.vmx.inst_error = vmcs_instruction_error();
2873         vmexit->u.vmx.exit_reason = ~0;
2874         vmexit->u.vmx.exit_qualification = ~0;
2875
2876         switch (rc) {
2877         case VMX_VMRESUME_ERROR:
2878         case VMX_VMLAUNCH_ERROR:
2879                 vmexit->u.vmx.inst_type = rc;
2880                 break;
2881         default:
2882                 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2883         }
2884 }
2885
2886 /*
2887  * If the NMI-exiting VM execution control is set to '1' then an NMI in
2888  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2889  * sufficient to simply vector to the NMI handler via a software interrupt.
2890  * However, this must be done before maskable interrupts are enabled;
2891  * otherwise the "iret" issued by an interrupt handler will incorrectly
2892  * clear NMI blocking.
2893  */
2894 static __inline void
2895 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2896 {
2897         uint32_t intr_info;
2898
2899         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2900
2901         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2902                 return;
2903
2904         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2905         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2906             ("VM exit interruption info invalid: %#x", intr_info));
2907
2908         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2909                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2910                     "to NMI has invalid vector: %#x", intr_info));
2911                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
2912                 __asm __volatile("int $2");
2913         }
2914 }
2915
2916 static __inline void
2917 vmx_dr_enter_guest(struct vmxctx *vmxctx)
2918 {
2919         register_t rflags;
2920
2921         /* Save host control debug registers. */
2922         vmxctx->host_dr7 = rdr7();
2923         vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
2924
2925         /*
2926          * Disable debugging in DR7 and DEBUGCTL to avoid triggering
2927          * exceptions in the host based on the guest DRx values.  The
2928          * guest DR7 and DEBUGCTL are saved/restored in the VMCS.
2929          */
2930         load_dr7(0);
2931         wrmsr(MSR_DEBUGCTLMSR, 0);
2932
2933         /*
2934          * Disable single stepping the kernel to avoid corrupting the
2935          * guest DR6.  A debugger might still be able to corrupt the
2936          * guest DR6 by setting a breakpoint after this point and then
2937          * single stepping.
2938          */
2939         rflags = read_rflags();
2940         vmxctx->host_tf = rflags & PSL_T;
2941         write_rflags(rflags & ~PSL_T);
2942
2943         /* Save host debug registers. */
2944         vmxctx->host_dr0 = rdr0();
2945         vmxctx->host_dr1 = rdr1();
2946         vmxctx->host_dr2 = rdr2();
2947         vmxctx->host_dr3 = rdr3();
2948         vmxctx->host_dr6 = rdr6();
2949
2950         /* Restore guest debug registers. */
2951         load_dr0(vmxctx->guest_dr0);
2952         load_dr1(vmxctx->guest_dr1);
2953         load_dr2(vmxctx->guest_dr2);
2954         load_dr3(vmxctx->guest_dr3);
2955         load_dr6(vmxctx->guest_dr6);
2956 }
2957
2958 static __inline void
2959 vmx_dr_leave_guest(struct vmxctx *vmxctx)
2960 {
2961
2962         /* Save guest debug registers. */
2963         vmxctx->guest_dr0 = rdr0();
2964         vmxctx->guest_dr1 = rdr1();
2965         vmxctx->guest_dr2 = rdr2();
2966         vmxctx->guest_dr3 = rdr3();
2967         vmxctx->guest_dr6 = rdr6();
2968
2969         /*
2970          * Restore host debug registers.  Restore DR7, DEBUGCTL, and
2971          * PSL_T last.
2972          */
2973         load_dr0(vmxctx->host_dr0);
2974         load_dr1(vmxctx->host_dr1);
2975         load_dr2(vmxctx->host_dr2);
2976         load_dr3(vmxctx->host_dr3);
2977         load_dr6(vmxctx->host_dr6);
2978         wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
2979         load_dr7(vmxctx->host_dr7);
2980         write_rflags(read_rflags() | vmxctx->host_tf);
2981 }
2982
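/*
 * Mark the nested pmap as active on the current host CPU and, if the EPT
 * generation count has advanced since this CPU last ran the guest,
 * invalidate the EPTP-tagged TLB entries with a single-context INVEPT.
 * The SMR read section prevents the EPT paging structures from being
 * freed while the guest may still be using them on this CPU.
 */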
2983 static __inline void
2984 vmx_pmap_activate(struct vmx *vmx, pmap_t pmap)
2985 {
2986         long eptgen;
2987         int cpu;
2988
2989         cpu = curcpu;
2990
2991         CPU_SET_ATOMIC(cpu, &pmap->pm_active);
2992         smr_enter(pmap->pm_eptsmr);
2993         eptgen = atomic_load_long(&pmap->pm_eptgen);
2994         if (eptgen != vmx->eptgen[cpu]) {
2995                 vmx->eptgen[cpu] = eptgen;
2996                 invept(INVEPT_TYPE_SINGLE_CONTEXT,
2997                     (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 });
2998         }
2999 }
3000
3001 static __inline void
3002 vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap)
3003 {
3004         smr_exit(pmap->pm_eptsmr);
3005         CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
3006 }
3007
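/*
 * Run the vcpu in a loop: inject pending events, check for suspend,
 * rendezvous, reqidle, AST and debug requests with interrupts disabled,
 * switch MSR/debug-register/descriptor-table state to the guest, enter
 * non-root operation and process the resulting VM exit.  The loop
 * continues for as long as the exit was handled entirely in the kernel.
 */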
3008 static int
3009 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
3010     struct vm_eventinfo *evinfo)
3011 {
3012         int rc, handled, launched;
3013         struct vmx *vmx;
3014         struct vmx_vcpu *vmx_vcpu;
3015         struct vm *vm;
3016         struct vmxctx *vmxctx;
3017         struct vmcs *vmcs;
3018         struct vm_exit *vmexit;
3019         struct vlapic *vlapic;
3020         uint32_t exit_reason;
3021         struct region_descriptor gdtr, idtr;
3022         uint16_t ldt_sel;
3023
3024         vmx = arg;
3025         vm = vmx->vm;
3026         vmx_vcpu = &vmx->vcpus[vcpu];
3027         vmcs = vmx_vcpu->vmcs;
3028         vmxctx = &vmx_vcpu->ctx;
3029         vlapic = vm_lapic(vm, vcpu);
3030         vmexit = vm_exitinfo(vm, vcpu);
3031         launched = 0;
3032
3033         KASSERT(vmxctx->pmap == pmap,
3034             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
3035
3036         vmx_msr_guest_enter(vmx, vcpu);
3037
3038         VMPTRLD(vmcs);
3039
3040         /*
3041          * XXX
3042          * We do this every time because we may setup the virtual machine
3043          * from a different process than the one that actually runs it.
3044          *
3045          * If the life of a virtual machine was spent entirely in the context
3046          * of a single process we could do this once in vmx_init().
3047          */
3048         vmcs_write(VMCS_HOST_CR3, rcr3());
3049
3050         vmcs_write(VMCS_GUEST_RIP, rip);
3051         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
3052         do {
3053                 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
3054                     "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
3055
3056                 handled = UNHANDLED;
3057                 /*
3058                  * Interrupts are disabled from this point on until the
3059                  * guest starts executing. This is done for the following
3060                  * reasons:
3061                  *
3062                  * If an AST is asserted on this thread after the check below,
3063                  * then the IPI_AST notification will not be lost, because it
3064                  * will cause a VM exit due to external interrupt as soon as
3065                  * the guest state is loaded.
3066                  *
3067                  * A posted interrupt after 'vmx_inject_interrupts()' will
3068                  * not be "lost" because it will be held pending in the host
3069                  * APIC because interrupts are disabled. The pending interrupt
3070                  * will be recognized as soon as the guest state is loaded.
3071                  *
3072                  * The same reasoning applies to the IPI generated by
3073                  * pmap_invalidate_ept().
3074                  */
3075                 disable_intr();
3076                 vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
3077
3078                 /*
3079                  * Check for vcpu suspension after injecting events because
3080                  * vmx_inject_interrupts() can suspend the vcpu due to a
3081                  * triple fault.
3082                  */
3083                 if (vcpu_suspended(evinfo)) {
3084                         enable_intr();
3085                         vm_exit_suspended(vmx->vm, vcpu, rip);
3086                         break;
3087                 }
3088
3089                 if (vcpu_rendezvous_pending(evinfo)) {
3090                         enable_intr();
3091                         vm_exit_rendezvous(vmx->vm, vcpu, rip);
3092                         break;
3093                 }
3094
3095                 if (vcpu_reqidle(evinfo)) {
3096                         enable_intr();
3097                         vm_exit_reqidle(vmx->vm, vcpu, rip);
3098                         break;
3099                 }
3100
3101                 if (vcpu_should_yield(vm, vcpu)) {
3102                         enable_intr();
3103                         vm_exit_astpending(vmx->vm, vcpu, rip);
3104                         vmx_astpending_trace(vmx, vcpu, rip);
3105                         handled = HANDLED;
3106                         break;
3107                 }
3108
3109                 if (vcpu_debugged(vm, vcpu)) {
3110                         enable_intr();
3111                         vm_exit_debug(vmx->vm, vcpu, rip);
3112                         break;
3113                 }
3114
3115                 /*
3116                  * If TPR Shadowing is enabled, the TPR Threshold
3117                  * must be updated right before entering the guest.
3118                  */
3119                 if (tpr_shadowing && !virtual_interrupt_delivery) {
3120                         if ((vmx_vcpu->cap.proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) {
3121                                 vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic));
3122                         }
3123                 }
3124
3125                 /*
3126                  * VM exits restore the base address but not the
3127                  * limits of GDTR and IDTR.  The VMCS only stores the
3128                  * base address, so VM exits set the limits to 0xffff.
3129                  * Save and restore the full GDTR and IDTR to restore
3130                  * the limits.
3131                  *
3132                  * The VMCS does not save the LDTR at all, and VM
3133                  * exits clear LDTR as if a NULL selector were loaded.
3134                  * The userspace hypervisor probably doesn't use an
3135                  * LDT, but save and restore it to be safe.
3136                  */
3137                 sgdt(&gdtr);
3138                 sidt(&idtr);
3139                 ldt_sel = sldt();
3140
3141                 /*
3142                  * The TSC_AUX MSR must be saved/restored while interrupts
3143                  * are disabled so that it is not possible for the guest
3144                  * TSC_AUX MSR value to be overwritten by the resume
3145                  * portion of the IPI_SUSPEND codepath. This is why the
3146                  * transition of this MSR is handled separately from those
3147                  * handled by vmx_msr_guest_{enter,exit}(), which are ok to
3148                  * be transitioned with preemption disabled but interrupts
3149                  * enabled.
3150                  *
3151                  * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be
3152                  * anywhere in this loop so long as they happen with
3153                  * interrupts disabled. This location is chosen for
3154                  * simplicity.
3155                  */
3156                 vmx_msr_guest_enter_tsc_aux(vmx, vcpu);
3157
3158                 vmx_dr_enter_guest(vmxctx);
3159
3160                 /*
3161                  * Mark the EPT as active on this host CPU and invalidate
3162                  * EPTP-tagged TLB entries if required.
3163                  */
3164                 vmx_pmap_activate(vmx, pmap);
3165
3166                 vmx_run_trace(vmx, vcpu);
3167                 rc = vmx_enter_guest(vmxctx, vmx, launched);
3168
3169                 vmx_pmap_deactivate(vmx, pmap);
3170                 vmx_dr_leave_guest(vmxctx);
3171                 vmx_msr_guest_exit_tsc_aux(vmx, vcpu);
3172
3173                 bare_lgdt(&gdtr);
3174                 lidt(&idtr);
3175                 lldt(ldt_sel);
3176
3177                 /* Collect some information for VM exit processing */
3178                 vmexit->rip = rip = vmcs_guest_rip();
3179                 vmexit->inst_length = vmexit_instruction_length();
3180                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
3181                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
3182
3183                 /* Update 'nextrip' */
3184                 vmx_vcpu->state.nextrip = rip;
3185
3186                 if (rc == VMX_GUEST_VMEXIT) {
3187                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
3188                         enable_intr();
3189                         handled = vmx_exit_process(vmx, vcpu, vmexit);
3190                 } else {
3191                         enable_intr();
3192                         vmx_exit_inst_error(vmxctx, rc, vmexit);
3193                 }
3194                 launched = 1;
3195                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
3196                 rip = vmexit->rip;
3197         } while (handled);
3198
3199         /*
3200          * If a VM exit has been handled then the exitcode must be BOGUS;
3201          * if a VM exit is not handled then the exitcode must not be BOGUS.
3202          */
3203         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
3204             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
3205                 panic("Mismatch between handled (%d) and exitcode (%d)",
3206                       handled, vmexit->exitcode);
3207         }
3208
3209         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
3210             vmexit->exitcode);
3211
3212         VMCLEAR(vmcs);
3213         vmx_msr_guest_exit(vmx, vcpu);
3214
3215         return (0);
3216 }
3217
3218 static void
3219 vmx_cleanup(void *arg)
3220 {
3221         int i;
3222         struct vmx_vcpu *vcpu;
3223         struct vmx *vmx = arg;
3224         uint16_t maxcpus;
3225
3226         if (apic_access_virtualization(vmx, 0))
3227                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
3228
3229         maxcpus = vm_get_maxcpus(vmx->vm);
3230         for (i = 0; i < maxcpus; i++) {
3231                 vcpu = &vmx->vcpus[i];
3232                 vpid_free(vcpu->state.vpid);
3233                 free(vcpu->pir_desc, M_VMX);
3234                 free(vcpu->apic_page, M_VMX);
3235                 free(vcpu->vmcs, M_VMX);
3236         }
3237
3238         free(vmx->msr_bitmap, M_VMX);
3239         free(vmx, M_VMX);
3240
3241         return;
3242 }
3243
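/*
 * Return a pointer to the software-maintained copy of 'reg' in the guest
 * context, or NULL if the register is not one of the general purpose or
 * debug registers kept in 'struct vmxctx' (in which case it must be
 * accessed through the VMCS instead).
 */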
3244 static register_t *
3245 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
3246 {
3247
3248         switch (reg) {
3249         case VM_REG_GUEST_RAX:
3250                 return (&vmxctx->guest_rax);
3251         case VM_REG_GUEST_RBX:
3252                 return (&vmxctx->guest_rbx);
3253         case VM_REG_GUEST_RCX:
3254                 return (&vmxctx->guest_rcx);
3255         case VM_REG_GUEST_RDX:
3256                 return (&vmxctx->guest_rdx);
3257         case VM_REG_GUEST_RSI:
3258                 return (&vmxctx->guest_rsi);
3259         case VM_REG_GUEST_RDI:
3260                 return (&vmxctx->guest_rdi);
3261         case VM_REG_GUEST_RBP:
3262                 return (&vmxctx->guest_rbp);
3263         case VM_REG_GUEST_R8:
3264                 return (&vmxctx->guest_r8);
3265         case VM_REG_GUEST_R9:
3266                 return (&vmxctx->guest_r9);
3267         case VM_REG_GUEST_R10:
3268                 return (&vmxctx->guest_r10);
3269         case VM_REG_GUEST_R11:
3270                 return (&vmxctx->guest_r11);
3271         case VM_REG_GUEST_R12:
3272                 return (&vmxctx->guest_r12);
3273         case VM_REG_GUEST_R13:
3274                 return (&vmxctx->guest_r13);
3275         case VM_REG_GUEST_R14:
3276                 return (&vmxctx->guest_r14);
3277         case VM_REG_GUEST_R15:
3278                 return (&vmxctx->guest_r15);
3279         case VM_REG_GUEST_CR2:
3280                 return (&vmxctx->guest_cr2);
3281         case VM_REG_GUEST_DR0:
3282                 return (&vmxctx->guest_dr0);
3283         case VM_REG_GUEST_DR1:
3284                 return (&vmxctx->guest_dr1);
3285         case VM_REG_GUEST_DR2:
3286                 return (&vmxctx->guest_dr2);
3287         case VM_REG_GUEST_DR3:
3288                 return (&vmxctx->guest_dr3);
3289         case VM_REG_GUEST_DR6:
3290                 return (&vmxctx->guest_dr6);
3291         default:
3292                 break;
3293         }
3294         return (NULL);
3295 }
3296
3297 static int
3298 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
3299 {
3300         register_t *regp;
3301
3302         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3303                 *retval = *regp;
3304                 return (0);
3305         } else
3306                 return (EINVAL);
3307 }
3308
3309 static int
3310 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
3311 {
3312         register_t *regp;
3313
3314         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
3315                 *regp = val;
3316                 return (0);
3317         } else
3318                 return (EINVAL);
3319 }
3320
3321 static int
3322 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
3323 {
3324         uint64_t gi;
3325         int error;
3326
3327         error = vmcs_getreg(vmx->vcpus[vcpu].vmcs, running,
3328             VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
3329         *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
3330         return (error);
3331 }
3332
3333 static int
3334 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
3335 {
3336         struct vmcs *vmcs;
3337         uint64_t gi;
3338         int error, ident;
3339
3340         /*
3341          * Forcing the vcpu into an interrupt shadow is not supported.
3342          */
3343         if (val) {
3344                 error = EINVAL;
3345                 goto done;
3346         }
3347
3348         vmcs = vmx->vcpus[vcpu].vmcs;
3349         ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
3350         error = vmcs_getreg(vmcs, running, ident, &gi);
3351         if (error == 0) {
3352                 gi &= ~HWINTR_BLOCKING;
3353                 error = vmcs_setreg(vmcs, running, ident, gi);
3354         }
3355 done:
3356         VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
3357             error ? "failed" : "succeeded");
3358         return (error);
3359 }
3360
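/*
 * Map a guest control register to its read shadow in the VMCS, or -1 if
 * the register does not have a shadow.
 */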
3361 static int
3362 vmx_shadow_reg(int reg)
3363 {
3364         int shreg;
3365
3366         shreg = -1;
3367
3368         switch (reg) {
3369         case VM_REG_GUEST_CR0:
3370                 shreg = VMCS_CR0_SHADOW;
3371                 break;
3372         case VM_REG_GUEST_CR4:
3373                 shreg = VMCS_CR4_SHADOW;
3374                 break;
3375         default:
3376                 break;
3377         }
3378
3379         return (shreg);
3380 }
3381
3382 static int
3383 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
3384 {
3385         int running, hostcpu;
3386         struct vmx *vmx = arg;
3387
3388         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3389         if (running && hostcpu != curcpu)
3390                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
3391
3392         if (reg == VM_REG_GUEST_INTR_SHADOW)
3393                 return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
3394
3395         if (vmxctx_getreg(&vmx->vcpus[vcpu].ctx, reg, retval) == 0)
3396                 return (0);
3397
3398         return (vmcs_getreg(vmx->vcpus[vcpu].vmcs, running, reg, retval));
3399 }
3400
3401 static int
3402 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
3403 {
3404         int error, hostcpu, running, shadow;
3405         uint64_t ctls;
3406         pmap_t pmap;
3407         struct vmx *vmx = arg;
3408         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
3409
3410         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3411         if (running && hostcpu != curcpu)
3412                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
3413
3414         if (reg == VM_REG_GUEST_INTR_SHADOW)
3415                 return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
3416
3417         if (vmxctx_setreg(&vmx_vcpu->ctx, reg, val) == 0)
3418                 return (0);
3419
3420         /* Do not permit user write access to VMCS fields by offset. */
3421         if (reg < 0)
3422                 return (EINVAL);
3423
3424         error = vmcs_setreg(vmx_vcpu->vmcs, running, reg, val);
3425
3426         if (error == 0) {
3427                 /*
3428                  * If the "load EFER" VM-entry control is 1 then the
3429                  * value of EFER.LMA must be identical to "IA-32e mode guest"
3430                  * bit in the VM-entry control.
3431                  */
3432                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
3433                     (reg == VM_REG_GUEST_EFER)) {
3434                         vmcs_getreg(vmx_vcpu->vmcs, running,
3435                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
3436                         if (val & EFER_LMA)
3437                                 ctls |= VM_ENTRY_GUEST_LMA;
3438                         else
3439                                 ctls &= ~VM_ENTRY_GUEST_LMA;
3440                         vmcs_setreg(vmx_vcpu->vmcs, running,
3441                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
3442                 }
3443
3444                 shadow = vmx_shadow_reg(reg);
3445                 if (shadow > 0) {
3446                         /*
3447                          * Store the unmodified value in the shadow
3448                          */
3449                         error = vmcs_setreg(vmx_vcpu->vmcs, running,
3450                                     VMCS_IDENT(shadow), val);
3451                 }
3452
3453                 if (reg == VM_REG_GUEST_CR3) {
3454                         /*
3455                          * Invalidate the guest vcpu's TLB mappings to emulate
3456                          * the behavior of updating %cr3.
3457                          *
3458                          * XXX the processor retains global mappings when %cr3
3459                          * is updated but vmx_invvpid() does not.
3460                          */
3461                         pmap = vmx_vcpu->ctx.pmap;
3462                         vmx_invvpid(vmx, vcpu, pmap, running);
3463                 }
3464         }
3465
3466         return (error);
3467 }
3468
3469 static int
3470 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
3471 {
3472         int hostcpu, running;
3473         struct vmx *vmx = arg;
3474
3475         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3476         if (running && hostcpu != curcpu)
3477                 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
3478
3479         return (vmcs_getdesc(vmx->vcpus[vcpu].vmcs, running, reg, desc));
3480 }
3481
3482 static int
3483 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
3484 {
3485         int hostcpu, running;
3486         struct vmx *vmx = arg;
3487
3488         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
3489         if (running && hostcpu != curcpu)
3490                 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
3491
3492         return (vmcs_setdesc(vmx->vcpus[vcpu].vmcs, running, reg, desc));
3493 }
3494
3495 static int
3496 vmx_getcap(void *arg, int vcpu, int type, int *retval)
3497 {
3498         struct vmx *vmx = arg;
3499         int vcap;
3500         int ret;
3501
3502         ret = ENOENT;
3503
3504         vcap = vmx->vcpus[vcpu].cap.set;
3505
3506         switch (type) {
3507         case VM_CAP_HALT_EXIT:
3508                 if (cap_halt_exit)
3509                         ret = 0;
3510                 break;
3511         case VM_CAP_PAUSE_EXIT:
3512                 if (cap_pause_exit)
3513                         ret = 0;
3514                 break;
3515         case VM_CAP_MTRAP_EXIT:
3516                 if (cap_monitor_trap)
3517                         ret = 0;
3518                 break;
3519         case VM_CAP_RDPID:
3520                 if (cap_rdpid)
3521                         ret = 0;
3522                 break;
3523         case VM_CAP_RDTSCP:
3524                 if (cap_rdtscp)
3525                         ret = 0;
3526                 break;
3527         case VM_CAP_UNRESTRICTED_GUEST:
3528                 if (cap_unrestricted_guest)
3529                         ret = 0;
3530                 break;
3531         case VM_CAP_ENABLE_INVPCID:
3532                 if (cap_invpcid)
3533                         ret = 0;
3534                 break;
3535         case VM_CAP_BPT_EXIT:
3536         case VM_CAP_IPI_EXIT:
3537                 ret = 0;
3538                 break;
3539         default:
3540                 break;
3541         }
3542
3543         if (ret == 0)
3544                 *retval = (vcap & (1 << type)) ? 1 : 0;
3545
3546         return (ret);
3547 }
3548
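/*
 * Enable or disable an optional vcpu capability.  Most capabilities are
 * backed by a single bit in one of the VMCS execution control fields:
 * 'pptr' points at the cached copy of that field, 'flag' is the bit to
 * toggle and 'reg' is the VMCS field to rewrite.  The final state is
 * also recorded in 'cap.set' so that vmx_getcap() can report it.
 */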
3549 static int
3550 vmx_setcap(void *arg, int vcpu, int type, int val)
3551 {
3552         struct vmx *vmx = arg;
3553         struct vmx_vcpu *vmx_vcpu = &vmx->vcpus[vcpu];
3554         struct vmcs *vmcs = vmx_vcpu->vmcs;
3555         struct vlapic *vlapic;
3556         uint32_t baseval;
3557         uint32_t *pptr;
3558         int error;
3559         int flag;
3560         int reg;
3561         int retval;
3562
3563         retval = ENOENT;
3564         pptr = NULL;
3565
3566         switch (type) {
3567         case VM_CAP_HALT_EXIT:
3568                 if (cap_halt_exit) {
3569                         retval = 0;
3570                         pptr = &vmx_vcpu->cap.proc_ctls;
3571                         baseval = *pptr;
3572                         flag = PROCBASED_HLT_EXITING;
3573                         reg = VMCS_PRI_PROC_BASED_CTLS;
3574                 }
3575                 break;
3576         case VM_CAP_MTRAP_EXIT:
3577                 if (cap_monitor_trap) {
3578                         retval = 0;
3579                         pptr = &vmx_vcpu->cap.proc_ctls;
3580                         baseval = *pptr;
3581                         flag = PROCBASED_MTF;
3582                         reg = VMCS_PRI_PROC_BASED_CTLS;
3583                 }
3584                 break;
3585         case VM_CAP_PAUSE_EXIT:
3586                 if (cap_pause_exit) {
3587                         retval = 0;
3588                         pptr = &vmx_vcpu->cap.proc_ctls;
3589                         baseval = *pptr;
3590                         flag = PROCBASED_PAUSE_EXITING;
3591                         reg = VMCS_PRI_PROC_BASED_CTLS;
3592                 }
3593                 break;
3594         case VM_CAP_RDPID:
3595         case VM_CAP_RDTSCP:
3596                 if (cap_rdpid || cap_rdtscp)
3597                         /*
3598                          * Choose not to support enabling/disabling
3599                          * RDPID/RDTSCP via libvmmapi since, as per the
3600                          * discussion in vmx_modinit(), RDPID/RDTSCP are
3601                          * either always enabled or always disabled.
3602                          */
3603                          retval = EOPNOTSUPP;
3604                 break;
3605         case VM_CAP_UNRESTRICTED_GUEST:
3606                 if (cap_unrestricted_guest) {
3607                         retval = 0;
3608                         pptr = &vmx_vcpu->cap.proc_ctls2;
3609                         baseval = *pptr;
3610                         flag = PROCBASED2_UNRESTRICTED_GUEST;
3611                         reg = VMCS_SEC_PROC_BASED_CTLS;
3612                 }
3613                 break;
3614         case VM_CAP_ENABLE_INVPCID:
3615                 if (cap_invpcid) {
3616                         retval = 0;
3617                         pptr = &vmx_vcpu->cap.proc_ctls2;
3618                         baseval = *pptr;
3619                         flag = PROCBASED2_ENABLE_INVPCID;
3620                         reg = VMCS_SEC_PROC_BASED_CTLS;
3621                 }
3622                 break;
3623         case VM_CAP_BPT_EXIT:
3624                 retval = 0;
3625
3626                 /* Don't change the bitmap if we are tracing all exceptions. */
3627                 if (vmx_vcpu->cap.exc_bitmap != 0xffffffff) {
3628                         pptr = &vmx_vcpu->cap.exc_bitmap;
3629                         baseval = *pptr;
3630                         flag = (1 << IDT_BP);
3631                         reg = VMCS_EXCEPTION_BITMAP;
3632                 }
3633                 break;
3634         case VM_CAP_IPI_EXIT:
3635                 retval = 0;
3636
3637                 vlapic = vm_lapic(vmx->vm, vcpu);
3638                 vlapic->ipi_exit = val;
3639                 break;
3640         default:
3641                 break;
3642         }
3643
3644         if (retval)
3645                 return (retval);
3646
3647         if (pptr != NULL) {
3648                 if (val) {
3649                         baseval |= flag;
3650                 } else {
3651                         baseval &= ~flag;
3652                 }
3653                 VMPTRLD(vmcs);
3654                 error = vmwrite(reg, baseval);
3655                 VMCLEAR(vmcs);
3656
3657                 if (error)
3658                         return (error);
3659
3660                 /*
3661                  * Update the optional stored flags, and record
3662                  * the new setting.
3663                  */
3664                 *pptr = baseval;
3665         }
3666
3667         if (val) {
3668                 vmx_vcpu->cap.set |= (1 << type);
3669         } else {
3670                 vmx_vcpu->cap.set &= ~(1 << type);
3671         }
3672
3673         return (0);
3674 }
3675
3676 static struct vmspace *
3677 vmx_vmspace_alloc(vm_offset_t min, vm_offset_t max)
3678 {
3679         return (ept_vmspace_alloc(min, max));
3680 }
3681
3682 static void
3683 vmx_vmspace_free(struct vmspace *vmspace)
3684 {
3685         ept_vmspace_free(vmspace);
3686 }
3687
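/*
 * vlapic state private to the VT-x backend: the posted interrupt
 * descriptor shared with the processor and a mask of the interrupt
 * priority classes that have been posted while 'pending' was already
 * set (see vmx_set_intr_ready()).
 */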
3688 struct vlapic_vtx {
3689         struct vlapic   vlapic;
3690         struct pir_desc *pir_desc;
3691         struct vmx      *vmx;
3692         u_int   pending_prio;
3693 };
3694
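/*
 * Convert a vector or PPR value into a single bit representing its
 * interrupt priority class (the upper nibble).  For example, vector 0x35
 * belongs to priority class 3 and maps to the bit (1 << 3).
 */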
3695 #define VPR_PRIO_BIT(vpr)       (1 << ((vpr) >> 4))
3696
3697 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
3698 do {                                                                    \
3699         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
3700             level ? "level" : "edge", vector);                          \
3701         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
3702         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
3703         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
3704         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
3705         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
3706 } while (0)
3707
3708 /*
3709  * vlapic->ops handlers that utilize the APICv hardware assist described in
3710  * Chapter 29 of the Intel SDM.
3711  */
3712 static int
3713 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
3714 {
3715         struct vlapic_vtx *vlapic_vtx;
3716         struct pir_desc *pir_desc;
3717         uint64_t mask;
3718         int idx, notify = 0;
3719
3720         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3721         pir_desc = vlapic_vtx->pir_desc;
3722
3723         /*
3724          * Keep track of interrupt requests in the PIR descriptor. This is
3725          * because the virtual APIC page pointed to by the VMCS cannot be
3726          * modified if the vcpu is running.
3727          */
3728         idx = vector / 64;
3729         mask = 1UL << (vector % 64);
3730         atomic_set_long(&pir_desc->pir[idx], mask);
3731
3732         /*
3733          * A notification is required whenever the 'pending' bit makes a
3734          * transition from 0->1.
3735          *
3736          * Even if the 'pending' bit is already asserted, notification about
3737          * the incoming interrupt may still be necessary.  For example, if a
3738          * vCPU is HLTed with a high PPR, a low priority interrupt would cause
3739          * the 0->1 'pending' transition with a notification, but the vCPU
3740          * would ignore the interrupt for the time being.  The same vCPU would
3741          * need to then be notified if a high-priority interrupt arrived which
3742          * satisfied the PPR.
3743          *
3744          * The priorities of interrupts injected while 'pending' is asserted
3745          * are tracked in a custom bitfield 'pending_prio'.  Should the
3746          * to-be-injected interrupt exceed the priorities already present, the
3747          * notification is sent.  The priorities recorded in 'pending_prio' are
3748          * cleared whenever the 'pending' bit makes another 0->1 transition.
3749          */
3750         if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
3751                 notify = 1;
3752                 vlapic_vtx->pending_prio = 0;
3753         } else {
3754                 const u_int old_prio = vlapic_vtx->pending_prio;
3755                 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);
3756
3757                 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
3758                         atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
3759                         notify = 1;
3760                 }
3761         }
3762
3763         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
3764             level, "vmx_set_intr_ready");
3765         return (notify);
3766 }
3767
3768 static int
3769 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
3770 {
3771         struct vlapic_vtx *vlapic_vtx;
3772         struct pir_desc *pir_desc;
3773         struct LAPIC *lapic;
3774         uint64_t pending, pirval;
3775         uint32_t ppr, vpr;
3776         int i;
3777
3778         /*
3779          * This function is only expected to be called from the 'HLT' exit
3780          * handler which does not care about the vector that is pending.
3781          */
3782         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
3783
3784         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3785         pir_desc = vlapic_vtx->pir_desc;
3786
3787         pending = atomic_load_acq_long(&pir_desc->pending);
3788         if (!pending) {
3789                 /*
3790                  * While a virtual interrupt may have already been
3791                  * processed, the actual delivery may be pending on the
3792                  * interruptibility of the guest.  Recognize a pending
3793                  * interrupt by reevaluating virtual interrupts
3794                  * following Section 29.2.1 in the Intel SDM Volume 3.
3795                  */
3796                 struct vm_exit *vmexit;
3797                 uint8_t rvi, ppr;
3798
3799                 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
3800                 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
3801                     ("vmx_pending_intr: exitcode not 'HLT'"));
3802                 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
3803                 lapic = vlapic->apic_page;
3804                 ppr = lapic->ppr & APIC_TPR_INT;
3805                 if (rvi > ppr) {
3806                         return (1);
3807                 }
3808
3809                 return (0);
3810         }
3811
3812         /*
3813          * If there is an interrupt pending then it will be recognized only
3814          * if its priority is greater than the processor priority.
3815          *
3816          * Special case: if the processor priority is zero then any pending
3817          * interrupt will be recognized.
3818          */
3819         lapic = vlapic->apic_page;
3820         ppr = lapic->ppr & APIC_TPR_INT;
3821         if (ppr == 0)
3822                 return (1);
3823
3824         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
3825             lapic->ppr);
3826
3827         vpr = 0;
3828         for (i = 3; i >= 0; i--) {
3829                 pirval = pir_desc->pir[i];
3830                 if (pirval != 0) {
3831                         vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
3832                         break;
3833                 }
3834         }
3835
3836         /*
3837          * If the highest-priority pending interrupt falls short of the
3838          * processor priority of this vCPU, ensure that 'pending_prio' does not
3839          * have any stale bits which would preclude a higher-priority interrupt
3840          * from incurring a notification later.
3841          */
3842         if (vpr <= ppr) {
3843                 const u_int prio_bit = VPR_PRIO_BIT(vpr);
3844                 const u_int old = vlapic_vtx->pending_prio;
3845
3846                 if (old > prio_bit && (old & prio_bit) == 0) {
3847                         vlapic_vtx->pending_prio = prio_bit;
3848                 }
3849                 return (0);
3850         }
3851         return (1);
3852 }
3853
3854 static void
3855 vmx_intr_accepted(struct vlapic *vlapic, int vector)
3856 {
3857
3858         panic("vmx_intr_accepted: not expected to be called");
3859 }
3860
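/*
 * Program the EOI-exit bitmap bit for 'vector'.  Level-triggered vectors
 * must force a VM exit on EOI (EXIT_REASON_VIRTUALIZED_EOI above) so that
 * the EOI can be propagated to the I/O APIC emulation; edge-triggered
 * vectors can have their EOI completed entirely in hardware.
 */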
3861 static void
3862 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
3863 {
3864         struct vlapic_vtx *vlapic_vtx;
3865         struct vmx *vmx;
3866         struct vmcs *vmcs;
3867         uint64_t mask, val;
3868
3869         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
3870         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
3871             ("vmx_set_tmr: vcpu cannot be running"));
3872
3873         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3874         vmx = vlapic_vtx->vmx;
3875         vmcs = vmx->vcpus[vlapic->vcpuid].vmcs;
3876         mask = 1UL << (vector % 64);
3877
3878         VMPTRLD(vmcs);
3879         val = vmcs_read(VMCS_EOI_EXIT(vector));
3880         if (level)
3881                 val |= mask;
3882         else
3883                 val &= ~mask;
3884         vmcs_write(VMCS_EOI_EXIT(vector), val);
3885         VMCLEAR(vmcs);
3886 }
3887
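/*
 * With TPR shadowing but no APIC-register virtualization, switching the
 * guest LAPIC into x2APIC mode is handled by disabling the TPR shadow
 * and trapping CR8 loads and stores instead.
 */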
3888 static void
3889 vmx_enable_x2apic_mode_ts(struct vlapic *vlapic)
3890 {
3891         struct vmx *vmx;
3892         struct vmx_vcpu *vcpu;
3893         struct vmcs *vmcs;
3894         uint32_t proc_ctls;
3895         int vcpuid;
3896
3897         vcpuid = vlapic->vcpuid;
3898         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
3899         vcpu = &vmx->vcpus[vcpuid];
3900         vmcs = vcpu->vmcs;
3901
3902         proc_ctls = vcpu->cap.proc_ctls;
3903         proc_ctls &= ~PROCBASED_USE_TPR_SHADOW;
3904         proc_ctls |= PROCBASED_CR8_LOAD_EXITING;
3905         proc_ctls |= PROCBASED_CR8_STORE_EXITING;
3906         vcpu->cap.proc_ctls = proc_ctls;
3907
3908         VMPTRLD(vmcs);
3909         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
3910         VMCLEAR(vmcs);
3911 }
3912
3913 static void
3914 vmx_enable_x2apic_mode_vid(struct vlapic *vlapic)
3915 {
3916         struct vmx *vmx;
3917         struct vmx_vcpu *vcpu;
3918         struct vmcs *vmcs;
3919         uint32_t proc_ctls2;
3920         int vcpuid, error __diagused;
3921
3922         vcpuid = vlapic->vcpuid;
3923         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
3924         vcpu = &vmx->vcpus[vcpuid];
3925         vmcs = vcpu->vmcs;
3926
3927         proc_ctls2 = vcpu->cap.proc_ctls2;
3928         KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
3929             ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
3930
3931         proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
3932         proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
3933         vcpu->cap.proc_ctls2 = proc_ctls2;
3934
3935         VMPTRLD(vmcs);
3936         vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
3937         VMCLEAR(vmcs);
3938
3939         if (vlapic->vcpuid == 0) {
3940                 /*
3941                  * The nested page table mappings are shared by all vcpus
3942                  * so unmap the APIC access page just once.
3943                  */
3944                 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
3945                 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
3946                     __func__, error));
3947
3948                 /*
3949                  * The MSR bitmap is shared by all vcpus so modify it only
3950                  * once in the context of vcpu 0.
3951                  */
3952                 error = vmx_allow_x2apic_msrs(vmx);
3953                 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
3954                     __func__, error));
3955         }
3956 }
3957
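/*
 * Notify a vcpu running on 'hostcpu' that a posted interrupt is pending
 * by sending the posted-interrupt notification vector.  If the vcpu is
 * still in non-root operation when the IPI arrives, the processor
 * delivers the interrupt from the PIR without causing a VM exit.
 */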
3958 static void
3959 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
3960 {
3961
3962         ipi_cpu(hostcpu, pirvec);
3963 }
3964
3965 /*
3966  * Transfer the pending interrupts in the PIR descriptor to the IRR
3967  * in the virtual APIC page.
3968  */
3969 static void
3970 vmx_inject_pir(struct vlapic *vlapic)
3971 {
3972         struct vlapic_vtx *vlapic_vtx;
3973         struct pir_desc *pir_desc;
3974         struct LAPIC *lapic;
3975         uint64_t val, pirval;
3976         int rvi, pirbase = -1;
3977         uint16_t intr_status_old, intr_status_new;
3978
3979         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3980         pir_desc = vlapic_vtx->pir_desc;
3981         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
3982                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
3983                     "no posted interrupt pending");
3984                 return;
3985         }
3986
3987         pirval = 0;
3988         pirbase = -1;
3989         lapic = vlapic->apic_page;
3990
3991         val = atomic_readandclear_long(&pir_desc->pir[0]);
3992         if (val != 0) {
3993                 lapic->irr0 |= val;
3994                 lapic->irr1 |= val >> 32;
3995                 pirbase = 0;
3996                 pirval = val;
3997         }
3998
3999         val = atomic_readandclear_long(&pir_desc->pir[1]);
4000         if (val != 0) {
4001                 lapic->irr2 |= val;
4002                 lapic->irr3 |= val >> 32;
4003                 pirbase = 64;
4004                 pirval = val;
4005         }
4006
4007         val = atomic_readandclear_long(&pir_desc->pir[2]);
4008         if (val != 0) {
4009                 lapic->irr4 |= val;
4010                 lapic->irr5 |= val >> 32;
4011                 pirbase = 128;
4012                 pirval = val;
4013         }
4014
4015         val = atomic_readandclear_long(&pir_desc->pir[3]);
4016         if (val != 0) {
4017                 lapic->irr6 |= val;
4018                 lapic->irr7 |= val >> 32;
4019                 pirbase = 192;
4020                 pirval = val;
4021         }
4022
4023         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
4024
4025         /*
4026          * Update RVI so the processor can evaluate pending virtual
4027          * interrupts on VM-entry.
4028          *
4029          * It is possible for pirval to be 0 here, even though the
4030          * pending bit has been set. The scenario is:
4031          * CPU-Y is sending a posted interrupt to CPU-X, which
4032          * is running a guest and processing posted interrupts in h/w.
4033          * CPU-X will eventually exit and the state seen in s/w is
4034          * the pending bit set, but no PIR bits set.
4035          *
4036          *      CPU-X                      CPU-Y
4037          *   (vm running)                (host running)
4038          *   rx posted interrupt
4039          *   CLEAR pending bit
4040          *                               SET PIR bit
4041          *   READ/CLEAR PIR bits
4042          *                               SET pending bit
4043          *   (vm exit)
4044          *   pending bit set, PIR 0
4045          */
4046         if (pirval != 0) {
4047                 rvi = pirbase + flsl(pirval) - 1;
4048                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
4049                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
4050                 if (intr_status_new > intr_status_old) {
4051                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
4052                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
4053                             "guest_intr_status changed from 0x%04x to 0x%04x",
4054                             intr_status_old, intr_status_new);
4055                 }
4056         }
4057 }
4058
4059 static struct vlapic *
4060 vmx_vlapic_init(void *arg, int vcpuid)
4061 {
4062         struct vmx *vmx;
4063         struct vlapic *vlapic;
4064         struct vlapic_vtx *vlapic_vtx;
4065
4066         vmx = arg;
4067
4068         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
4069         vlapic->vm = vmx->vm;
4070         vlapic->vcpuid = vcpuid;
4071         vlapic->apic_page = (struct LAPIC *)vmx->vcpus[vcpuid].apic_page;
4072
4073         vlapic_vtx = (struct vlapic_vtx *)vlapic;
4074         vlapic_vtx->pir_desc = vmx->vcpus[vcpuid].pir_desc;
4075         vlapic_vtx->vmx = vmx;
4076
4077         if (tpr_shadowing) {
4078                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
4079         }
4080
4081         if (virtual_interrupt_delivery) {
4082                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
4083                 vlapic->ops.pending_intr = vmx_pending_intr;
4084                 vlapic->ops.intr_accepted = vmx_intr_accepted;
4085                 vlapic->ops.set_tmr = vmx_set_tmr;
4086                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;
4087         }
4088
4089         if (posted_interrupts)
4090                 vlapic->ops.post_intr = vmx_post_intr;
4091
4092         vlapic_init(vlapic);
4093
4094         return (vlapic);
4095 }
4096
4097 static void
4098 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
4099 {
4100
4101         vlapic_cleanup(vlapic);
4102         free(vlapic, M_VLAPIC);
4103 }
4104
4105 #ifdef BHYVE_SNAPSHOT
4106 static int
4107 vmx_snapshot(void *arg, struct vm_snapshot_meta *meta)
4108 {
4109         return (0);
4110 }
4111
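/*
 * Save or restore the per-vcpu state kept in the VMCS (control, segment
 * and descriptor-table registers, page-table pointers and other guest
 * state), the guest MSR area and the software-maintained register
 * context in 'struct vmxctx'.
 */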
4112 static int
4113 vmx_vcpu_snapshot(void *arg, struct vm_snapshot_meta *meta, int vcpuid)
4114 {
4115         struct vmcs *vmcs;
4116         struct vmx *vmx;
4117         struct vmx_vcpu *vcpu;
4118         struct vmxctx *vmxctx;
4119         int err, run, hostcpu;
4120
4121         vmx = (struct vmx *)arg;
4122         err = 0;
4123
4124         KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
4125         vcpu = &vmx->vcpus[vcpuid];
4126         vmcs = vcpu->vmcs;
4127
4128         run = vcpu_is_running(vmx->vm, vcpuid, &hostcpu);
4129         if (run && hostcpu != curcpu) {
4130                 printf("%s: %s%d is running", __func__, vm_name(vmx->vm),
4131                     vcpuid);
4132                 return (EINVAL);
4133         }
4134
4135         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta);
4136         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta);
4137         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta);
4138         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta);
4139         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta);
4140         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta);
4141         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta);
4142
4143         /* Guest segments */
4144         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta);
4145         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta);
4146
4147         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta);
4148         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta);
4149
4150         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_SS, meta);
4151         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta);
4152
4153         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta);
4154         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta);
4155
4156         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta);
4157         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta);
4158
4159         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta);
4160         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta);
4161
4162         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta);
4163         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta);
4164
4165         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta);
4166         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta);
4167
4168         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta);
4169
4170         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta);
4171         err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta);
4172
4173         /* Guest page tables */
4174         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta);
4175         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta);
4176         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta);
4177         err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta);
4178
4179         /* Other guest state */
4180         err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta);
4181         err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta);
4182         err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta);
4183         err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta);
4184         err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta);
4185         err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta);
4186         err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta);
4187         if (err != 0)
4188                 goto done;
4189
4190         SNAPSHOT_BUF_OR_LEAVE(vcpu->guest_msrs,
4191             sizeof(vcpu->guest_msrs), meta, err, done);
4192
4193         vmxctx = &vcpu->ctx;
4194         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, err, done);
4195         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, err, done);
4196         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, err, done);
4197         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, err, done);
4198         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, err, done);
4199         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, err, done);
4200         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, err, done);
4201         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, err, done);
4202         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, err, done);
4203         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, err, done);
4204         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, err, done);
4205         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, err, done);
4206         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, err, done);
4207         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, err, done);
4208         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, err, done);
4209         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, err, done);
4210         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, err, done);
4211         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, err, done);
4212         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, err, done);
4213         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, err, done);
4214         SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, err, done);
4215
4216 done:
4217         return (err);
4218 }
4219
4220 static int
4221 vmx_restore_tsc(void *arg, int vcpu, uint64_t offset)
4222 {
4223         struct vmcs *vmcs;
4224         struct vmx *vmx = (struct vmx *)arg;
4225         int error, running, hostcpu;
4226
4227         KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
4228         vmcs = vmx->vcpus[vcpu].vmcs;
4229
4230         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
4231         if (running && hostcpu != curcpu) {
4232                 printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu);
4233                 return (EINVAL);
4234         }
4235
4236         if (!running)
4237                 VMPTRLD(vmcs);
4238
4239         error = vmx_set_tsc_offset(vmx, vcpu, offset);
4240
4241         if (!running)
4242                 VMCLEAR(vmcs);
4243         return (error);
4244 }
4245 #endif
4246
4247 const struct vmm_ops vmm_ops_intel = {
4248         .modinit        = vmx_modinit,
4249         .modcleanup     = vmx_modcleanup,
4250         .modresume      = vmx_modresume,
4251         .init           = vmx_init,
4252         .run            = vmx_run,
4253         .cleanup        = vmx_cleanup,
4254         .getreg         = vmx_getreg,
4255         .setreg         = vmx_setreg,
4256         .getdesc        = vmx_getdesc,
4257         .setdesc        = vmx_setdesc,
4258         .getcap         = vmx_getcap,
4259         .setcap         = vmx_setcap,
4260         .vmspace_alloc  = vmx_vmspace_alloc,
4261         .vmspace_free   = vmx_vmspace_free,
4262         .vlapic_init    = vmx_vlapic_init,
4263         .vlapic_cleanup = vmx_vlapic_cleanup,
4264 #ifdef BHYVE_SNAPSHOT
4265         .snapshot       = vmx_snapshot,
4266         .vcpu_snapshot  = vmx_vcpu_snapshot,
4267         .restore_tsc    = vmx_restore_tsc,
4268 #endif
4269 };