1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include <machine/vmm_dev.h>
54 #include "vmm_host.h"
55 #include "vmm_ioport.h"
56 #include "vmm_ipi.h"
57 #include "vmm_msr.h"
58 #include "vmm_ktr.h"
59 #include "vmm_stat.h"
60 #include "vatpic.h"
61 #include "vlapic.h"
62 #include "vlapic_priv.h"
63
64 #include "vmx_msr.h"
65 #include "ept.h"
66 #include "vmx_cpufunc.h"
67 #include "vmx.h"
68 #include "x86.h"
69 #include "vmx_controls.h"
70
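/*
 * Fixed settings for the VMX controls: the ONE_SETTING macros name bits
 * that must be set to 1 and the ZERO_SETTING macros name bits that must
 * be 0. vmx_init() validates both against the allowed-settings reported
 * by the VMX capability MSRs via vmx_set_ctlreg().
 */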
71 #define PINBASED_CTLS_ONE_SETTING                                       \
72         (PINBASED_EXTINT_EXITING        |                               \
73          PINBASED_NMI_EXITING           |                               \
74          PINBASED_VIRTUAL_NMI)
75 #define PINBASED_CTLS_ZERO_SETTING      0
76
77 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
78         (PROCBASED_INT_WINDOW_EXITING   |                               \
79          PROCBASED_NMI_WINDOW_EXITING)
80
81 #define PROCBASED_CTLS_ONE_SETTING                                      \
82         (PROCBASED_SECONDARY_CONTROLS   |                               \
83          PROCBASED_IO_EXITING           |                               \
84          PROCBASED_MSR_BITMAPS          |                               \
85          PROCBASED_CTLS_WINDOW_SETTING)
86 #define PROCBASED_CTLS_ZERO_SETTING     \
87         (PROCBASED_CR3_LOAD_EXITING |   \
88         PROCBASED_CR3_STORE_EXITING |   \
89         PROCBASED_IO_BITMAPS)
90
91 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
92 #define PROCBASED_CTLS2_ZERO_SETTING    0
93
94 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
95         (VM_EXIT_HOST_LMA                       |                       \
96         VM_EXIT_SAVE_EFER                       |                       \
97         VM_EXIT_LOAD_EFER)
98
99 #define VM_EXIT_CTLS_ONE_SETTING                                        \
100         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
101         VM_EXIT_ACKNOWLEDGE_INTERRUPT           |                       \
102         VM_EXIT_SAVE_PAT                        |                       \
103         VM_EXIT_LOAD_PAT)
104 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
105
106 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
107
108 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
109         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
110         VM_ENTRY_LOAD_PAT)
111 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
112         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
113         VM_ENTRY_INTO_SMM                       |                       \
114         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
115
116 #define guest_msr_rw(vmx, msr) \
117         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
118
119 #define guest_msr_ro(vmx, msr) \
120     msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
121
122 #define HANDLED         1
123 #define UNHANDLED       0
124
125 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
126 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
127
128 SYSCTL_DECL(_hw_vmm);
129 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
130
131 int vmxon_enabled[MAXCPU];
132 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
133
134 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
135 static uint32_t exit_ctls, entry_ctls;
136
137 static uint64_t cr0_ones_mask, cr0_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
139              &cr0_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
141              &cr0_zeros_mask, 0, NULL);
142
143 static uint64_t cr4_ones_mask, cr4_zeros_mask;
144 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
145              &cr4_ones_mask, 0, NULL);
146 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
147              &cr4_zeros_mask, 0, NULL);
148
149 static int vmx_no_patmsr;
150
151 static int vmx_initialized;
152 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
153            &vmx_initialized, 0, "Intel VMX initialized");
154
155 /*
156  * Optional capabilities
157  */
158 static int cap_halt_exit;
159 static int cap_pause_exit;
160 static int cap_unrestricted_guest;
161 static int cap_monitor_trap;
162 static int cap_invpcid;
163
164 static int virtual_interrupt_delivery;
165 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
166     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
167
168 static int posted_interrupts;
169 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
170     &posted_interrupts, 0, "APICv posted interrupt support");
171
172 static int pirvec;
173 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
174     &pirvec, 0, "APICv posted interrupt vector");
175
176 static struct unrhdr *vpid_unr;
177 static u_int vpid_alloc_failed;
178 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
179             &vpid_alloc_failed, 0, NULL);
180
181 /*
182  * Use the last page below 4GB as the APIC access address. This address is
183  * occupied by the boot firmware so it is guaranteed that it will not conflict
184  * with a page in system memory.
185  */
186 #define APIC_ACCESS_ADDRESS     0xFFFFF000
187
188 static void vmx_inject_pir(struct vlapic *vlapic);
189
190 #ifdef KTR
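/*
 * Map a VM-exit reason to a short string for the KTR traces emitted by
 * vmx_exit_trace(). Unknown reasons are formatted numerically into a
 * static buffer.
 */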
191 static const char *
192 exit_reason_to_str(int reason)
193 {
194         static char reasonbuf[32];
195
196         switch (reason) {
197         case EXIT_REASON_EXCEPTION:
198                 return "exception";
199         case EXIT_REASON_EXT_INTR:
200                 return "extint";
201         case EXIT_REASON_TRIPLE_FAULT:
202                 return "triplefault";
203         case EXIT_REASON_INIT:
204                 return "init";
205         case EXIT_REASON_SIPI:
206                 return "sipi";
207         case EXIT_REASON_IO_SMI:
208                 return "iosmi";
209         case EXIT_REASON_SMI:
210                 return "smi";
211         case EXIT_REASON_INTR_WINDOW:
212                 return "intrwindow";
213         case EXIT_REASON_NMI_WINDOW:
214                 return "nmiwindow";
215         case EXIT_REASON_TASK_SWITCH:
216                 return "taskswitch";
217         case EXIT_REASON_CPUID:
218                 return "cpuid";
219         case EXIT_REASON_GETSEC:
220                 return "getsec";
221         case EXIT_REASON_HLT:
222                 return "hlt";
223         case EXIT_REASON_INVD:
224                 return "invd";
225         case EXIT_REASON_INVLPG:
226                 return "invlpg";
227         case EXIT_REASON_RDPMC:
228                 return "rdpmc";
229         case EXIT_REASON_RDTSC:
230                 return "rdtsc";
231         case EXIT_REASON_RSM:
232                 return "rsm";
233         case EXIT_REASON_VMCALL:
234                 return "vmcall";
235         case EXIT_REASON_VMCLEAR:
236                 return "vmclear";
237         case EXIT_REASON_VMLAUNCH:
238                 return "vmlaunch";
239         case EXIT_REASON_VMPTRLD:
240                 return "vmptrld";
241         case EXIT_REASON_VMPTRST:
242                 return "vmptrst";
243         case EXIT_REASON_VMREAD:
244                 return "vmread";
245         case EXIT_REASON_VMRESUME:
246                 return "vmresume";
247         case EXIT_REASON_VMWRITE:
248                 return "vmwrite";
249         case EXIT_REASON_VMXOFF:
250                 return "vmxoff";
251         case EXIT_REASON_VMXON:
252                 return "vmxon";
253         case EXIT_REASON_CR_ACCESS:
254                 return "craccess";
255         case EXIT_REASON_DR_ACCESS:
256                 return "draccess";
257         case EXIT_REASON_INOUT:
258                 return "inout";
259         case EXIT_REASON_RDMSR:
260                 return "rdmsr";
261         case EXIT_REASON_WRMSR:
262                 return "wrmsr";
263         case EXIT_REASON_INVAL_VMCS:
264                 return "invalvmcs";
265         case EXIT_REASON_INVAL_MSR:
266                 return "invalmsr";
267         case EXIT_REASON_MWAIT:
268                 return "mwait";
269         case EXIT_REASON_MTF:
270                 return "mtf";
271         case EXIT_REASON_MONITOR:
272                 return "monitor";
273         case EXIT_REASON_PAUSE:
274                 return "pause";
275         case EXIT_REASON_MCE:
276                 return "mce";
277         case EXIT_REASON_TPR:
278                 return "tpr";
279         case EXIT_REASON_APIC_ACCESS:
280                 return "apic-access";
281         case EXIT_REASON_GDTR_IDTR:
282                 return "gdtridtr";
283         case EXIT_REASON_LDTR_TR:
284                 return "ldtrtr";
285         case EXIT_REASON_EPT_FAULT:
286                 return "eptfault";
287         case EXIT_REASON_EPT_MISCONFIG:
288                 return "eptmisconfig";
289         case EXIT_REASON_INVEPT:
290                 return "invept";
291         case EXIT_REASON_RDTSCP:
292                 return "rdtscp";
293         case EXIT_REASON_VMX_PREEMPT:
294                 return "vmxpreempt";
295         case EXIT_REASON_INVVPID:
296                 return "invvpid";
297         case EXIT_REASON_WBINVD:
298                 return "wbinvd";
299         case EXIT_REASON_XSETBV:
300                 return "xsetbv";
301         case EXIT_REASON_APIC_WRITE:
302                 return "apic-write";
303         default:
304                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
305                 return (reasonbuf);
306         }
307 }
308 #endif  /* KTR */
309
310 static int
311 vmx_allow_x2apic_msrs(struct vmx *vmx)
312 {
313         int i, error;
314
315         error = 0;
316
317         /*
318          * Allow readonly access to the following x2APIC MSRs from the guest.
319          */
320         error += guest_msr_ro(vmx, MSR_APIC_ID);
321         error += guest_msr_ro(vmx, MSR_APIC_VERSION);
322         error += guest_msr_ro(vmx, MSR_APIC_LDR);
323         error += guest_msr_ro(vmx, MSR_APIC_SVR);
324
325         for (i = 0; i < 8; i++)
326                 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
327
328         for (i = 0; i < 8; i++)
329                 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
330         
331         for (i = 0; i < 8; i++)
332                 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
333
334         error += guest_msr_ro(vmx, MSR_APIC_ESR);
335         error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
336         error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
337         error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
338         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
339         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
340         error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
341         error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
342         error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
343         error += guest_msr_ro(vmx, MSR_APIC_ICR);
344
345         /*
346          * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
347          *
348          * These registers get special treatment described in the section
349          * "Virtualizing MSR-Based APIC Accesses".
350          */
351         error += guest_msr_rw(vmx, MSR_APIC_TPR);
352         error += guest_msr_rw(vmx, MSR_APIC_EOI);
353         error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
354
355         return (error);
356 }
357
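/*
 * Force the CR0/CR4 bits that VMX operation requires to be 1 (the "ones"
 * masks) and clear those that must be 0 (the "zeros" masks). The masks
 * are derived from the MSR_VMX_CR{0,4}_FIXED{0,1} MSRs in vmx_init().
 */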
358 u_long
359 vmx_fix_cr0(u_long cr0)
360 {
361
362         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
363 }
364
365 u_long
366 vmx_fix_cr4(u_long cr4)
367 {
368
369         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
370 }
371
372 static void
373 vpid_free(int vpid)
374 {
375         if (vpid < 0 || vpid > 0xffff)
376                 panic("vpid_free: invalid vpid %d", vpid);
377
378         /*
379          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
380          * the unit number allocator.
381          */
382
383         if (vpid > VM_MAXCPU)
384                 free_unr(vpid_unr, vpid);
385 }
386
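/*
 * Assign a VPID to each of 'num' vcpus, falling back to the shared
 * [1,VM_MAXCPU] range when the unit number allocator cannot provide
 * enough unique VPIDs.
 */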
387 static void
388 vpid_alloc(uint16_t *vpid, int num)
389 {
390         int i, x;
391
392         if (num <= 0 || num > VM_MAXCPU)
393                 panic("invalid number of vpids requested: %d", num);
394
395         /*
396          * If the "enable vpid" execution control is not enabled then the
397          * VPID is required to be 0 for all vcpus.
398          */
399         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
400                 for (i = 0; i < num; i++)
401                         vpid[i] = 0;
402                 return;
403         }
404
405         /*
406          * Allocate a unique VPID for each vcpu from the unit number allocator.
407          */
408         for (i = 0; i < num; i++) {
409                 x = alloc_unr(vpid_unr);
410                 if (x == -1)
411                         break;
412                 else
413                         vpid[i] = x;
414         }
415
416         if (i < num) {
417                 atomic_add_int(&vpid_alloc_failed, 1);
418
419                 /*
420                  * If the unit number allocator does not have enough unique
421                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
422                  *
423                  * These VPIDs need not be unique across VMs but this does not
424                  * affect correctness because the combined mappings are also
425                  * tagged with the EP4TA which is unique for each VM.
426                  *
427                  * It is still sub-optimal because the invvpid will invalidate
428                  * combined mappings for a particular VPID across all EP4TAs.
429                  */
430                 while (i-- > 0)
431                         vpid_free(vpid[i]);
432
433                 for (i = 0; i < num; i++)
434                         vpid[i] = i + 1;
435         }
436 }
437
438 static void
439 vpid_init(void)
440 {
441         /*
442          * VPID 0 is required when the "enable VPID" execution control is
443          * disabled.
444          *
445          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
446          * unit number allocator does not have sufficient unique VPIDs to
447          * satisfy the allocation.
448          *
449          * The remaining VPIDs are managed by the unit number allocator.
450          */
451         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
452 }
453
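/*
 * Build the per-vcpu guest MSR save area that is wired into the VMCS by
 * vmcs_set_msr_save(); currently it holds only MSR_KGSBASE.
 */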
454 static void
455 msr_save_area_init(struct msr_entry *g_area, int *g_count)
456 {
457         int cnt;
458
459         static struct msr_entry guest_msrs[] = {
460                 { MSR_KGSBASE, 0, 0 },
461         };
462
463         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
464         if (cnt > GUEST_MSR_MAX_ENTRIES)
465                 panic("guest msr save area overrun");
466         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
467         *g_count = cnt;
468 }
469
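/*
 * Per-cpu teardown, run via smp_rendezvous() from vmx_cleanup(): flush
 * VPID- and EPT-tagged TLB entries, execute VMXOFF and clear CR4.VMXE.
 */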
470 static void
471 vmx_disable(void *arg __unused)
472 {
473         struct invvpid_desc invvpid_desc = { 0 };
474         struct invept_desc invept_desc = { 0 };
475
476         if (vmxon_enabled[curcpu]) {
477                 /*
478                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
479                  *
480                  * VMXON or VMXOFF are not required to invalidate any TLB
481                  * caching structures, so flush the EPT and VPID mappings
482                  * here to avoid keeping stale entries between VMX episodes.
483                  */
484                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
485                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
486                 vmxoff();
487         }
488         load_cr4(rcr4() & ~CR4_VMXE);
489 }
490
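/*
 * Module teardown: release the posted interrupt vector and the VPID unit
 * number allocator, then take every host cpu out of VMX operation.
 */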
491 static int
492 vmx_cleanup(void)
493 {
494         
495         if (pirvec != 0)
496                 vmm_ipi_free(pirvec);
497
498         if (vpid_unr != NULL) {
499                 delete_unrhdr(vpid_unr);
500                 vpid_unr = NULL;
501         }
502
503         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
504
505         return (0);
506 }
507
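/*
 * Per-cpu setup, run via smp_rendezvous() from vmx_init(): enable VMX in
 * MSR_IA32_FEATURE_CONTROL if the BIOS left it unlocked, set CR4.VMXE and
 * enter VMX root operation with VMXON.
 */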
508 static void
509 vmx_enable(void *arg __unused)
510 {
511         int error;
512         uint64_t feature_control;
513
514         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
515         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
516             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
517                 wrmsr(MSR_IA32_FEATURE_CONTROL,
518                     feature_control | IA32_FEATURE_CONTROL_VMX_EN |
519                     IA32_FEATURE_CONTROL_LOCK);
520         }
521
522         load_cr4(rcr4() | CR4_VMXE);
523
524         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
525         error = vmxon(vmxon_region[curcpu]);
526         if (error == 0)
527                 vmxon_enabled[curcpu] = 1;
528 }
529
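/*
 * Re-execute VMXON on a cpu where it had previously been enabled, e.g.
 * when VMX state is re-established after a suspend/resume cycle.
 */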
530 static void
531 vmx_restore(void)
532 {
533
534         if (vmxon_enabled[curcpu])
535                 vmxon(vmxon_region[curcpu]);
536 }
537
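/*
 * One-time module initialization: verify VMX support and the required
 * control-field settings, probe optional capabilities, initialize EPT and
 * the VPID allocator, and enable VMX operation on all host cpus.
 */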
538 static int
539 vmx_init(int ipinum)
540 {
541         int error, use_tpr_shadow;
542         uint64_t fixed0, fixed1, feature_control;
543         uint32_t tmp, procbased2_vid_bits;
544
545         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
546         if (!(cpu_feature2 & CPUID2_VMX)) {
547                 printf("vmx_init: processor does not support VMX operation\n");
548                 return (ENXIO);
549         }
550
551         /*
552          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
553          * are set (bits 0 and 2 respectively).
554          */
555         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
556         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
557             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
558                 printf("vmx_init: VMX operation disabled by BIOS\n");
559                 return (ENXIO);
560         }
561
562         /* Check support for primary processor-based VM-execution controls */
563         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
564                                MSR_VMX_TRUE_PROCBASED_CTLS,
565                                PROCBASED_CTLS_ONE_SETTING,
566                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
567         if (error) {
568                 printf("vmx_init: processor does not support desired primary "
569                        "processor-based controls\n");
570                 return (error);
571         }
572
573         /* Clear the processor-based ctl bits that are set on demand */
574         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
575
576         /* Check support for secondary processor-based VM-execution controls */
577         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
578                                MSR_VMX_PROCBASED_CTLS2,
579                                PROCBASED_CTLS2_ONE_SETTING,
580                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
581         if (error) {
582                 printf("vmx_init: processor does not support desired secondary "
583                        "processor-based controls\n");
584                 return (error);
585         }
586
587         /* Check support for VPID */
588         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
589                                PROCBASED2_ENABLE_VPID, 0, &tmp);
590         if (error == 0)
591                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
592
593         /* Check support for pin-based VM-execution controls */
594         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
595                                MSR_VMX_TRUE_PINBASED_CTLS,
596                                PINBASED_CTLS_ONE_SETTING,
597                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
598         if (error) {
599                 printf("vmx_init: processor does not support desired "
600                        "pin-based controls\n");
601                 return (error);
602         }
603
604         /* Check support for VM-exit controls */
605         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
606                                VM_EXIT_CTLS_ONE_SETTING,
607                                VM_EXIT_CTLS_ZERO_SETTING,
608                                &exit_ctls);
609         if (error) {
610                 /* Try again without the PAT MSR bits */
611                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
612                                        MSR_VMX_TRUE_EXIT_CTLS,
613                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
614                                        VM_EXIT_CTLS_ZERO_SETTING,
615                                        &exit_ctls);
616                 if (error) {
617                         printf("vmx_init: processor does not support desired "
618                                "exit controls\n");
619                         return (error);
620                 } else {
621                         if (bootverbose)
622                                 printf("vmm: PAT MSR access not supported\n");
623                         guest_msr_valid(MSR_PAT);
624                         vmx_no_patmsr = 1;
625                 }
626         }
627
628         /* Check support for VM-entry controls */
629         if (!vmx_no_patmsr) {
630                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
631                                        MSR_VMX_TRUE_ENTRY_CTLS,
632                                        VM_ENTRY_CTLS_ONE_SETTING,
633                                        VM_ENTRY_CTLS_ZERO_SETTING,
634                                        &entry_ctls);
635         } else {
636                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
637                                        MSR_VMX_TRUE_ENTRY_CTLS,
638                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
639                                        VM_ENTRY_CTLS_ZERO_SETTING,
640                                        &entry_ctls);
641         }
642
643         if (error) {
644                 printf("vmx_init: processor does not support desired "
645                        "entry controls\n");
646                 return (error);
647         }
648
649         /*
650          * Check support for optional features by testing them
651          * as individual bits
652          */
653         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
654                                         MSR_VMX_TRUE_PROCBASED_CTLS,
655                                         PROCBASED_HLT_EXITING, 0,
656                                         &tmp) == 0);
657
658         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
659                                         MSR_VMX_PROCBASED_CTLS,
660                                         PROCBASED_MTF, 0,
661                                         &tmp) == 0);
662
663         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
664                                          MSR_VMX_TRUE_PROCBASED_CTLS,
665                                          PROCBASED_PAUSE_EXITING, 0,
666                                          &tmp) == 0);
667
668         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
669                                         MSR_VMX_PROCBASED_CTLS2,
670                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
671                                         &tmp) == 0);
672
673         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
674             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
675             &tmp) == 0);
676
677         /*
678          * Check support for virtual interrupt delivery.
679          */
680         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
681             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
682             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
683             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
684
685         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
686             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
687             &tmp) == 0);
688
689         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
690             procbased2_vid_bits, 0, &tmp);
691         if (error == 0 && use_tpr_shadow) {
692                 virtual_interrupt_delivery = 1;
693                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
694                     &virtual_interrupt_delivery);
695         }
696
697         if (virtual_interrupt_delivery) {
698                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
699                 procbased_ctls2 |= procbased2_vid_bits;
700                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
701
702                 /*
703                  * Check for Posted Interrupts only if Virtual Interrupt
704                  * Delivery is enabled.
705                  */
706                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
707                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
708                     &tmp);
709                 if (error == 0) {
710                         pirvec = vmm_ipi_alloc();
711                         if (pirvec == 0) {
712                                 if (bootverbose) {
713                                         printf("vmx_init: unable to allocate "
714                                             "posted interrupt vector\n");
715                                 }
716                         } else {
717                                 posted_interrupts = 1;
718                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
719                                     &posted_interrupts);
720                         }
721                 }
722         }
723
724         if (posted_interrupts)
725                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
726
727         /* Initialize EPT */
728         error = ept_init(ipinum);
729         if (error) {
730                 printf("vmx_init: ept initialization failed (%d)\n", error);
731                 return (error);
732         }
733
734         /*
735          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
736          */
737         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
738         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
739         cr0_ones_mask = fixed0 & fixed1;
740         cr0_zeros_mask = ~fixed0 & ~fixed1;
741
742         /*
743          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
744          * if unrestricted guest execution is allowed.
745          */
746         if (cap_unrestricted_guest)
747                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
748
749         /*
750          * Do not allow the guest to set CR0_NW or CR0_CD.
751          */
752         cr0_zeros_mask |= (CR0_NW | CR0_CD);
753
754         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
755         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
756         cr4_ones_mask = fixed0 & fixed1;
757         cr4_zeros_mask = ~fixed0 & ~fixed1;
758
759         vpid_init();
760
761         /* enable VMX operation */
762         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
763
764         vmx_initialized = 1;
765
766         return (0);
767 }
768
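/*
 * Manually dispatch a host interrupt through its IDT gate. With
 * VM_EXIT_ACKNOWLEDGE_INTERRUPT in use, an external interrupt that forces
 * a VM exit is acknowledged by the processor but not delivered through
 * the IDT, so the handler must be invoked here via vmx_call_isr().
 */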
769 static void
770 vmx_trigger_hostintr(int vector)
771 {
772         uintptr_t func;
773         struct gate_descriptor *gd;
774
775         gd = &idt[vector];
776
777         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
778             "invalid vector %d", vector));
779         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
780             vector));
781         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
782             "has invalid type %d", vector, gd->gd_type));
783         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
784             "has invalid dpl %d", vector, gd->gd_dpl));
785         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
786             "for vector %d has invalid selector %d", vector, gd->gd_selector));
787         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
788             "IST %d", vector, gd->gd_ist));
789
790         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
791         vmx_call_isr(func);
792 }
793
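/*
 * Program the CR0/CR4 guest/host mask and read shadow in the VMCS. Bits
 * set in the mask are owned by the hypervisor: guest reads of those bits
 * return the 'initial' shadow value and guest attempts to modify them
 * cause a VM exit.
 */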
794 static int
795 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
796 {
797         int error, mask_ident, shadow_ident;
798         uint64_t mask_value;
799
800         if (which != 0 && which != 4)
801                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
802
803         if (which == 0) {
804                 mask_ident = VMCS_CR0_MASK;
805                 mask_value = cr0_ones_mask | cr0_zeros_mask;
806                 shadow_ident = VMCS_CR0_SHADOW;
807         } else {
808                 mask_ident = VMCS_CR4_MASK;
809                 mask_value = cr4_ones_mask | cr4_zeros_mask;
810                 shadow_ident = VMCS_CR4_SHADOW;
811         }
812
813         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
814         if (error)
815                 return (error);
816
817         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
818         if (error)
819                 return (error);
820
821         return (0);
822 }
823 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
824 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
825
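/*
 * Per-VM initialization: allocate the 'struct vmx', derive the EPTP from
 * the nested pmap, set up the MSR bitmap and VPIDs, and program a VMCS
 * for each vcpu.
 */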
826 static void *
827 vmx_vminit(struct vm *vm, pmap_t pmap)
828 {
829         uint16_t vpid[VM_MAXCPU];
830         int i, error, guest_msr_count;
831         struct vmx *vmx;
832         struct vmcs *vmcs;
833
834         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
835         if ((uintptr_t)vmx & PAGE_MASK) {
836                 panic("malloc of struct vmx not aligned on %d byte boundary",
837                       PAGE_SIZE);
838         }
839         vmx->vm = vm;
840
841         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
842
843         /*
844          * Clean up EPTP-tagged guest physical and combined mappings
845          *
846          * VMX transitions are not required to invalidate any guest physical
847          * mappings. So, it may be possible for stale guest physical mappings
848          * to be present in the processor TLBs.
849          *
850          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
851          */
852         ept_invalidate_mappings(vmx->eptp);
853
854         msr_bitmap_initialize(vmx->msr_bitmap);
855
856         /*
857          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
858          * The guest FSBASE and GSBASE are saved and restored during
859          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
860          * always restored from the vmcs host state area on vm-exit.
861          *
862          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
863          * how they are saved/restored so can be directly accessed by the
864          * guest.
865          *
866          * Guest KGSBASE is saved and restored in the guest MSR save area.
867          * Host KGSBASE is restored before returning to userland from the pcb.
868          * There will be a window of time when we are executing in the host
869          * kernel context with a value of KGSBASE from the guest. This is ok
870          * because the value of KGSBASE is inconsequential in kernel context.
871          *
872          * MSR_EFER is saved and restored in the guest VMCS area on a
873          * VM exit and entry respectively. It is also restored from the
874          * host VMCS area on a VM exit.
875          *
876          * The TSC MSR is exposed read-only. Writes are disallowed as that
877          * will impact the host TSC.
878          * XXX Writes would be implemented with a wrmsr trap, and
879          * then modifying the TSC offset in the VMCS.
880          */
881         if (guest_msr_rw(vmx, MSR_GSBASE) ||
882             guest_msr_rw(vmx, MSR_FSBASE) ||
883             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
884             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
885             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
886             guest_msr_rw(vmx, MSR_KGSBASE) ||
887             guest_msr_rw(vmx, MSR_EFER) ||
888             guest_msr_ro(vmx, MSR_TSC))
889                 panic("vmx_vminit: error setting guest msr access");
890
891         /*
892          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
893          * and entry respectively. It is also restored from the host VMCS
894          * area on a VM exit. However, if running on a system with no
895          * MSR_PAT save/restore support, leave access disabled so accesses
896          * will be trapped.
897          */
898         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
899                 panic("vmx_vminit: error setting guest pat msr access");
900
901         vpid_alloc(vpid, VM_MAXCPU);
902
903         if (virtual_interrupt_delivery) {
904                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
905                     APIC_ACCESS_ADDRESS);
906                 /* XXX this should really return an error to the caller */
907                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
908         }
909
910         for (i = 0; i < VM_MAXCPU; i++) {
911                 vmcs = &vmx->vmcs[i];
912                 vmcs->identifier = vmx_revision();
913                 error = vmclear(vmcs);
914                 if (error != 0) {
915                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
916                               error, i);
917                 }
918
919                 error = vmcs_init(vmcs);
920                 KASSERT(error == 0, ("vmcs_init error %d", error));
921
922                 VMPTRLD(vmcs);
923                 error = 0;
924                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
925                 error += vmwrite(VMCS_EPTP, vmx->eptp);
926                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
927                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
928                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
929                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
930                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
931                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
932                 error += vmwrite(VMCS_VPID, vpid[i]);
933                 if (virtual_interrupt_delivery) {
934                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
935                         error += vmwrite(VMCS_VIRTUAL_APIC,
936                             vtophys(&vmx->apic_page[i]));
937                         error += vmwrite(VMCS_EOI_EXIT0, 0);
938                         error += vmwrite(VMCS_EOI_EXIT1, 0);
939                         error += vmwrite(VMCS_EOI_EXIT2, 0);
940                         error += vmwrite(VMCS_EOI_EXIT3, 0);
941                 }
942                 if (posted_interrupts) {
943                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
944                         error += vmwrite(VMCS_PIR_DESC,
945                             vtophys(&vmx->pir_desc[i]));
946                 }
947                 VMCLEAR(vmcs);
948                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
949
950                 vmx->cap[i].set = 0;
951                 vmx->cap[i].proc_ctls = procbased_ctls;
952                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
953
954                 vmx->state[i].lastcpu = -1;
955                 vmx->state[i].vpid = vpid[i];
956
957                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
958
959                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
960                     guest_msr_count);
961                 if (error != 0)
962                         panic("vmcs_set_msr_save error %d", error);
963
964                 /*
965                  * Set up the CR0/4 shadows, and init the read shadow
966                  * to the power-on register value from the Intel Sys Arch.
967                  *  CR0 - 0x60000010
968                  *  CR4 - 0
969                  */
970                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
971                 if (error != 0)
972                         panic("vmx_setup_cr0_shadow %d", error);
973
974                 error = vmx_setup_cr4_shadow(vmcs, 0);
975                 if (error != 0)
976                         panic("vmx_setup_cr4_shadow %d", error);
977
978                 vmx->ctx[i].pmap = pmap;
979         }
980
981         return (vmx);
982 }
983
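/*
 * Emulate CPUID using the guest's %rax/%rbx/%rcx/%rdx values; the results
 * are written back into the same vmxctx registers.
 */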
984 static int
985 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
986 {
987         int handled, func;
988         
989         func = vmxctx->guest_rax;
990
991         handled = x86_emulate_cpuid(vm, vcpu,
992                                     (uint32_t*)(&vmxctx->guest_rax),
993                                     (uint32_t*)(&vmxctx->guest_rbx),
994                                     (uint32_t*)(&vmxctx->guest_rcx),
995                                     (uint32_t*)(&vmxctx->guest_rdx));
996         return (handled);
997 }
998
999 static __inline void
1000 vmx_run_trace(struct vmx *vmx, int vcpu)
1001 {
1002 #ifdef KTR
1003         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
1004 #endif
1005 }
1006
1007 static __inline void
1008 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
1009                int handled)
1010 {
1011 #ifdef KTR
1012         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
1013                  handled ? "handled" : "unhandled",
1014                  exit_reason_to_str(exit_reason), rip);
1015 #endif
1016 }
1017
1018 static __inline void
1019 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
1020 {
1021 #ifdef KTR
1022         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1023 #endif
1024 }
1025
1026 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1027
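/*
 * Refresh the per-cpu VMCS host state (TR, GDTR, GS bases) and invalidate
 * stale VPID-tagged TLB entries when a vcpu is rescheduled onto a
 * different host cpu.
 */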
1028 static void
1029 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1030 {
1031         struct vmxstate *vmxstate;
1032         struct invvpid_desc invvpid_desc;
1033
1034         vmxstate = &vmx->state[vcpu];
1035         if (vmxstate->lastcpu == curcpu)
1036                 return;
1037
1038         vmxstate->lastcpu = curcpu;
1039
1040         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1041
1042         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1043         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1044         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1045
1046         /*
1047          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
1048          *
1049          * We do this because this vcpu was executing on a different host
1050          * cpu when it last ran. We do not track whether it invalidated
1051          * mappings associated with its 'vpid' during that run. So we must
1052          * assume that the mappings associated with 'vpid' on 'curcpu' are
1053          * stale and invalidate them.
1054          *
1055          * Note that we incur this penalty only when the scheduler chooses to
1056          * move the thread associated with this vcpu between host cpus.
1057          *
1058          * Note also that this will invalidate mappings tagged with 'vpid'
1059          * for "all" EP4TAs.
1060          */
1061         if (vmxstate->vpid != 0) {
1062                 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
1063                         invvpid_desc._res1 = 0;
1064                         invvpid_desc._res2 = 0;
1065                         invvpid_desc.vpid = vmxstate->vpid;
1066                         invvpid_desc.linear_addr = 0;
1067                         invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1068                 } else {
1069                         /*
1070                          * The invvpid can be skipped if an invept is going to
1071                          * be performed before entering the guest. The invept
1072                          * will invalidate combined mappings tagged with
1073                          * 'vmx->eptp' for all vpids.
1074                          */
1075                         vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1076                 }
1077         }
1078 }
1079
1080 /*
1081  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1082  */
1083 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1084
1085 static void __inline
1086 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1087 {
1088
1089         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1090                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1091                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1092                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1093         }
1094 }
1095
1096 static void __inline
1097 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1098 {
1099
1100         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1101             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1102         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1103         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1104         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1105 }
1106
1107 static void __inline
1108 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1109 {
1110
1111         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1112                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1113                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1114                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1115         }
1116 }
1117
1118 static void __inline
1119 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1120 {
1121
1122         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1123             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1124         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1125         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1126         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1127 }
1128
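/*
 * Guest interruptibility-state bits that block injection of an NMI or of
 * a maskable hardware interrupt, respectively.
 */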
1129 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1130                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1131 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1132                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1133
1134 static void
1135 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1136 {
1137         uint32_t gi, info;
1138
1139         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1140         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1141             "interruptibility-state %#x", gi));
1142
1143         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1144         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1145             "VM-entry interruption information %#x", info));
1146
1147         /*
1148          * Inject the virtual NMI. The vector must be the NMI IDT entry
1149          * or the VMCS entry check will fail.
1150          */
1151         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1152         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1153
1154         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1155
1156         /* Clear the request */
1157         vm_nmi_clear(vmx->vm, vcpu);
1158 }
1159
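/*
 * Inject a pending exception, NMI or interrupt into the guest before VM
 * entry, falling back to "NMI window exiting" or "interrupt window
 * exiting" when injection is currently blocked.
 */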
1160 static void
1161 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1162 {
1163         struct vm_exception exc;
1164         int vector, need_nmi_exiting, extint_pending;
1165         uint64_t rflags;
1166         uint32_t gi, info;
1167
1168         if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
1169                 KASSERT(exc.vector >= 0 && exc.vector < 32,
1170                     ("%s: invalid exception vector %d", __func__, exc.vector));
1171
1172                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1173                 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1174                      "pending exception %d: %#x", __func__, exc.vector, info));
1175
1176                 info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
1177                 if (exc.error_code_valid) {
1178                         info |= VMCS_INTR_DEL_ERRCODE;
1179                         vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
1180                 }
1181                 vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1182         }
1183
1184         if (vm_nmi_pending(vmx->vm, vcpu)) {
1185                 /*
1186                  * If there are no conditions blocking NMI injection then
1187                  * inject it directly here; otherwise enable "NMI window
1188                  * exiting" to inject it as soon as we can.
1189                  *
1190                  * We also check for STI_BLOCKING because some implementations
1191                  * don't allow NMI injection in this case. If we are running
1192                  * on a processor that doesn't have this restriction it will
1193                  * immediately exit and the NMI will be injected in the
1194                  * "NMI window exiting" handler.
1195                  */
1196                 need_nmi_exiting = 1;
1197                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1198                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1199                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1200                         if ((info & VMCS_INTR_VALID) == 0) {
1201                                 vmx_inject_nmi(vmx, vcpu);
1202                                 need_nmi_exiting = 0;
1203                         } else {
1204                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1205                                     "due to VM-entry intr info %#x", info);
1206                         }
1207                 } else {
1208                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1209                             "Guest Interruptibility-state %#x", gi);
1210                 }
1211
1212                 if (need_nmi_exiting)
1213                         vmx_set_nmi_window_exiting(vmx, vcpu);
1214         }
1215
1216         extint_pending = vm_extint_pending(vmx->vm, vcpu);
1217
1218         if (!extint_pending && virtual_interrupt_delivery) {
1219                 vmx_inject_pir(vlapic);
1220                 return;
1221         }
1222
1223         /*
1224          * If interrupt-window exiting is already in effect then don't bother
1225          * checking for pending interrupts. This is just an optimization and
1226          * not needed for correctness.
1227          */
1228         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1229                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1230                     "pending int_window_exiting");
1231                 return;
1232         }
1233
1234         if (!extint_pending) {
1235                 /* Ask the local apic for a vector to inject */
1236                 if (!vlapic_pending_intr(vlapic, &vector))
1237                         return;
1238
1239                 /*
1240                  * From the Intel SDM, Volume 3, Section "Maskable
1241                  * Hardware Interrupts":
1242                  * - maskable interrupt vectors [16,255] can be delivered
1243                  *   through the local APIC.
1244                  */
1245                 KASSERT(vector >= 16 && vector <= 255,
1246                     ("invalid vector %d from local APIC", vector));
1247         } else {
1248                 /* Ask the legacy pic for a vector to inject */
1249                 vatpic_pending_intr(vmx->vm, &vector);
1250
1251                 /*
1252                  * From the Intel SDM, Volume 3, Section "Maskable
1253                  * Hardware Interrupts":
1254                  * - maskable interrupt vectors [0,255] can be delivered
1255                  *   through the INTR pin.
1256                  */
1257                 KASSERT(vector >= 0 && vector <= 255,
1258                     ("invalid vector %d from INTR", vector));
1259         }
1260
1261         /* Check RFLAGS.IF and the interruptibility state of the guest */
1262         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1263         if ((rflags & PSL_I) == 0) {
1264                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1265                     "rflags %#lx", vector, rflags);
1266                 goto cantinject;
1267         }
1268
1269         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1270         if (gi & HWINTR_BLOCKING) {
1271                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1272                     "Guest Interruptibility-state %#x", vector, gi);
1273                 goto cantinject;
1274         }
1275
1276         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1277         if (info & VMCS_INTR_VALID) {
1278                 /*
1279                  * This is expected and could happen for multiple reasons:
1280                  * - A vectoring VM-entry was aborted due to astpending
1281                  * - A VM-exit happened during event injection.
1282                  * - An exception was injected above.
1283                  * - An NMI was injected above or after "NMI window exiting"
1284                  */
1285                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1286                     "VM-entry intr info %#x", vector, info);
1287                 goto cantinject;
1288         }
1289
1290         /* Inject the interrupt */
1291         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1292         info |= vector;
1293         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1294
1295         if (!extint_pending) {
1296                 /* Update the Local APIC ISR */
1297                 vlapic_intr_accepted(vlapic, vector);
1298         } else {
1299                 vm_extint_clear(vmx->vm, vcpu);
1300                 vatpic_intr_accepted(vmx->vm, vector);
1301
1302                 /*
1303                  * After we accepted the current ExtINT the PIC may
1304                  * have posted another one.  If that is the case, set
1305                  * the Interrupt Window Exiting execution control so
1306                  * we can inject that one too.
1307                  */
1308                 if (vm_extint_pending(vmx->vm, vcpu))
1309                         vmx_set_int_window_exiting(vmx, vcpu);
1310         }
1311
1312         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1313
1314         return;
1315
1316 cantinject:
1317         /*
1318          * Set the Interrupt Window Exiting execution control so we can inject
1319          * the interrupt as soon as blocking condition goes away.
1320          */
1321         vmx_set_int_window_exiting(vmx, vcpu);
1322 }
1323
1324 /*
1325  * If the Virtual NMIs execution control is '1' then the logical processor
1326  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1327  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1328  * virtual-NMI blocking.
1329  *
1330  * This unblocking occurs even if the IRET causes a fault. In this case the
1331  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1332  */
1333 static void
1334 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1335 {
1336         uint32_t gi;
1337
1338         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1339         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1340         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1341         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1342 }
1343
1344 static void
1345 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1346 {
1347         uint32_t gi;
1348
1349         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1350         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1351         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1352         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1353 }
1354
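/*
 * Emulate XSETBV: validate the requested %xcr0 value against the host
 * XSAVE limits and the architectural feature dependencies, injecting #GP
 * or #UD on violations, and finally load it as the guest's %xcr0.
 */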
1355 static int
1356 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1357 {
1358         struct vmxctx *vmxctx;
1359         uint64_t xcrval;
1360         const struct xsave_limits *limits;
1361
1362         vmxctx = &vmx->ctx[vcpu];
1363         limits = vmm_get_xsave_limits();
1364
1365         /*
1366          * Note that the processor raises a GP# fault on its own if
1367          * xsetbv is executed for CPL != 0, so we do not have to
1368          * emulate that fault here.
1369          */
1370
1371         /* Only xcr0 is supported. */
1372         if (vmxctx->guest_rcx != 0) {
1373                 vm_inject_gp(vmx->vm, vcpu);
1374                 return (HANDLED);
1375         }
1376
1377         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1378         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1379                 vm_inject_ud(vmx->vm, vcpu);
1380                 return (HANDLED);
1381         }
1382
1383         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1384         if ((xcrval & ~limits->xcr0_allowed) != 0) {
1385                 vm_inject_gp(vmx->vm, vcpu);
1386                 return (HANDLED);
1387         }
1388
1389         if (!(xcrval & XFEATURE_ENABLED_X87)) {
1390                 vm_inject_gp(vmx->vm, vcpu);
1391                 return (HANDLED);
1392         }
1393
1394         /* AVX (YMM_Hi128) requires SSE. */
1395         if (xcrval & XFEATURE_ENABLED_AVX &&
1396             (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1397                 vm_inject_gp(vmx->vm, vcpu);
1398                 return (HANDLED);
1399         }
1400
1401         /*
1402          * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1403          * ZMM_Hi256, and Hi16_ZMM.
1404          */
1405         if (xcrval & XFEATURE_AVX512 &&
1406             (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1407             (XFEATURE_AVX512 | XFEATURE_AVX)) {
1408                 vm_inject_gp(vmx->vm, vcpu);
1409                 return (HANDLED);
1410         }
1411
1412         /*
1413          * Intel MPX requires both bound register state flags to be
1414          * set.
1415          */
1416         if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1417             ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1418                 vm_inject_gp(vmx->vm, vcpu);
1419                 return (HANDLED);
1420         }
1421
1422         /*
1423          * This runs "inside" vmrun() with the guest's FPU state, so
1424          * modifying xcr0 directly modifies the guest's xcr0, not the
1425          * host's.
1426          */
1427         load_xcr(0, xcrval);
1428         return (HANDLED);
1429 }
1430
1431 static int
1432 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1433 {
1434         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1435         uint64_t crval, regval, ones_mask, zeros_mask;
1436         const struct vmxctx *vmxctx;
1437
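             /*
              * See "Exit Qualification for Control-Register Accesses" in the
              * Intel SDM: bits 3:0 identify the control register, bits 5:4
              * the access type (0 = MOV to CR) and bits 11:8 the source
              * general-purpose register.  The check below therefore accepts
              * only MOV-to-CR accesses.
              */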
1438         /* We only handle mov to %cr0 or %cr4 at this time */
1439         if ((exitqual & 0xf0) != 0x00)
1440                 return (UNHANDLED);
1441
1442         cr = exitqual & 0xf;
1443         if (cr != 0 && cr != 4)
1444                 return (UNHANDLED);
1445
1446         regval = 0; /* silence gcc */
1447         vmxctx = &vmx->ctx[vcpu];
1448
1449         /*
1450          * We must use vmcs_write() directly here because vmcs_setreg() will
1451          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1452          */
1453         switch ((exitqual >> 8) & 0xf) {
1454         case 0:
1455                 regval = vmxctx->guest_rax;
1456                 break;
1457         case 1:
1458                 regval = vmxctx->guest_rcx;
1459                 break;
1460         case 2:
1461                 regval = vmxctx->guest_rdx;
1462                 break;
1463         case 3:
1464                 regval = vmxctx->guest_rbx;
1465                 break;
1466         case 4:
1467                 regval = vmcs_read(VMCS_GUEST_RSP);
1468                 break;
1469         case 5:
1470                 regval = vmxctx->guest_rbp;
1471                 break;
1472         case 6:
1473                 regval = vmxctx->guest_rsi;
1474                 break;
1475         case 7:
1476                 regval = vmxctx->guest_rdi;
1477                 break;
1478         case 8:
1479                 regval = vmxctx->guest_r8;
1480                 break;
1481         case 9:
1482                 regval = vmxctx->guest_r9;
1483                 break;
1484         case 10:
1485                 regval = vmxctx->guest_r10;
1486                 break;
1487         case 11:
1488                 regval = vmxctx->guest_r11;
1489                 break;
1490         case 12:
1491                 regval = vmxctx->guest_r12;
1492                 break;
1493         case 13:
1494                 regval = vmxctx->guest_r13;
1495                 break;
1496         case 14:
1497                 regval = vmxctx->guest_r14;
1498                 break;
1499         case 15:
1500                 regval = vmxctx->guest_r15;
1501                 break;
1502         }
1503
1504         if (cr == 0) {
1505                 ones_mask = cr0_ones_mask;
1506                 zeros_mask = cr0_zeros_mask;
1507                 vmcs_guest_cr = VMCS_GUEST_CR0;
1508                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1509         } else {
1510                 ones_mask = cr4_ones_mask;
1511                 zeros_mask = cr4_zeros_mask;
1512                 vmcs_guest_cr = VMCS_GUEST_CR4;
1513                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1514         }
1515         vmcs_write(vmcs_shadow_cr, regval);
1516
1517         crval = regval | ones_mask;
1518         crval &= ~zeros_mask;
1519         vmcs_write(vmcs_guest_cr, crval);
1520
1521         if (cr == 0 && regval & CR0_PG) {
1522                 uint64_t efer, entry_ctls;
1523
1524                 /*
1525                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1526                  * the "IA-32e mode guest" bit in VM-entry control must be
1527                  * equal.
1528                  */
1529                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1530                 if (efer & EFER_LME) {
1531                         efer |= EFER_LMA;
1532                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1533                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1534                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1535                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1536                 }
1537         }
1538
1539         return (HANDLED);
1540 }
1541
1542 static enum vie_cpu_mode
1543 vmx_cpu_mode(void)
1544 {
1545
1546         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1547                 return (CPU_MODE_64BIT);
1548         else
1549                 return (CPU_MODE_COMPATIBILITY);
1550 }
1551
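     /*
      * Derive the guest paging mode from its control registers: CR0.PG = 0
      * means no paging (flat), CR0.PG = 1 with CR4.PAE = 0 is 32-bit paging,
      * CR4.PAE = 1 with EFER.LME = 0 is PAE paging, and EFER.LME = 1 is
      * 4-level (long mode) paging.
      */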
1552 static enum vie_paging_mode
1553 vmx_paging_mode(void)
1554 {
1555
1556         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1557                 return (PAGING_MODE_FLAT);
1558         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1559                 return (PAGING_MODE_32);
1560         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1561                 return (PAGING_MODE_64);
1562         else
1563                 return (PAGING_MODE_PAE);
1564 }
1565
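     /*
      * Relevant bits of the EPT-violation exit qualification (see "Exit
      * Qualification for EPT Violations" in the Intel SDM): bit 0 is set for
      * a data read, bit 1 for a data write, bit 2 for an instruction fetch,
      * bit 7 if a guest-linear address was valid and bit 8 if the access was
      * to the translation of that linear address.
      */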
1566 static int
1567 ept_fault_type(uint64_t ept_qual)
1568 {
1569         int fault_type;
1570
1571         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1572                 fault_type = VM_PROT_WRITE;
1573         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1574                 fault_type = VM_PROT_EXECUTE;
1575         else
1576                 fault_type = VM_PROT_READ;
1577
1578         return (fault_type);
1579 }
1580
1581 static boolean_t
1582 ept_emulation_fault(uint64_t ept_qual)
1583 {
1584         int read, write;
1585
1586         /* EPT fault on an instruction fetch doesn't make sense here */
1587         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1588                 return (FALSE);
1589
1590         /* EPT fault must be a read fault or a write fault */
1591         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1592         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1593         if ((read | write) == 0)
1594                 return (FALSE);
1595
1596         /*
1597          * The EPT violation must have been caused by accessing a
1598          * guest-physical address that is a translation of a guest-linear
1599          * address.
1600          */
1601         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1602             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1603                 return (FALSE);
1604         }
1605
1606         return (TRUE);
1607 }
1608
1609 static __inline int
1610 apic_access_virtualization(struct vmx *vmx, int vcpuid)
1611 {
1612         uint32_t proc_ctls2;
1613
1614         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1615         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
1616 }
1617
1618 static __inline int
1619 x2apic_virtualization(struct vmx *vmx, int vcpuid)
1620 {
1621         uint32_t proc_ctls2;
1622
1623         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1624         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
1625 }
1626
1627 static int
1628 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
1629     uint64_t qual)
1630 {
1631         int error, handled, offset;
1632         uint32_t *apic_regs, vector;
1633         bool retu;
1634
1635         handled = HANDLED;
1636         offset = APIC_WRITE_OFFSET(qual);
1637
1638         if (!apic_access_virtualization(vmx, vcpuid)) {
1639                 /*
1640                  * In general there should not be any APIC write VM-exits
1641                  * unless APIC-access virtualization is enabled.
1642                  *
1643                  * However, self-IPI virtualization can legitimately trigger
1644                  * an APIC-write VM-exit, so treat it specially.
1645                  */
1646                 if (x2apic_virtualization(vmx, vcpuid) &&
1647                     offset == APIC_OFFSET_SELF_IPI) {
1648                         apic_regs = (uint32_t *)(vlapic->apic_page);
1649                         vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
1650                         vlapic_self_ipi_handler(vlapic, vector);
1651                         return (HANDLED);
1652                 } else
1653                         return (UNHANDLED);
1654         }
1655
1656         switch (offset) {
1657         case APIC_OFFSET_ID:
1658                 vlapic_id_write_handler(vlapic);
1659                 break;
1660         case APIC_OFFSET_LDR:
1661                 vlapic_ldr_write_handler(vlapic);
1662                 break;
1663         case APIC_OFFSET_DFR:
1664                 vlapic_dfr_write_handler(vlapic);
1665                 break;
1666         case APIC_OFFSET_SVR:
1667                 vlapic_svr_write_handler(vlapic);
1668                 break;
1669         case APIC_OFFSET_ESR:
1670                 vlapic_esr_write_handler(vlapic);
1671                 break;
1672         case APIC_OFFSET_ICR_LOW:
1673                 retu = false;
1674                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1675                 if (error != 0 || retu)
1676                         handled = UNHANDLED;
1677                 break;
1678         case APIC_OFFSET_CMCI_LVT:
1679         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1680                 vlapic_lvt_write_handler(vlapic, offset);
1681                 break;
1682         case APIC_OFFSET_TIMER_ICR:
1683                 vlapic_icrtmr_write_handler(vlapic);
1684                 break;
1685         case APIC_OFFSET_TIMER_DCR:
1686                 vlapic_dcr_write_handler(vlapic);
1687                 break;
1688         default:
1689                 handled = UNHANDLED;
1690                 break;
1691         }
1692         return (handled);
1693 }
1694
1695 static bool
1696 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
1697 {
1698
1699         if (apic_access_virtualization(vmx, vcpuid) &&
1700             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1701                 return (true);
1702         else
1703                 return (false);
1704 }
1705
1706 static int
1707 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1708 {
1709         uint64_t qual;
1710         int access_type, offset, allowed;
1711
1712         if (!apic_access_virtualization(vmx, vcpuid))
1713                 return (UNHANDLED);
1714
1715         qual = vmexit->u.vmx.exit_qualification;
1716         access_type = APIC_ACCESS_TYPE(qual);
1717         offset = APIC_ACCESS_OFFSET(qual);
1718
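             /*
              * In the exit qualification for APIC-access VM-exits, bits 11:0
              * hold the page offset and bits 15:12 the access type; type 0 is
              * a linear data read and type 1 a linear data write (see "Exit
              * Qualification for APIC-Access VM Exits" in the Intel SDM).
              */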
1719         allowed = 0;
1720         if (access_type == 0) {
1721                 /*
1722                  * Read data access to the following registers is expected.
1723                  */
1724                 switch (offset) {
1725                 case APIC_OFFSET_APR:
1726                 case APIC_OFFSET_PPR:
1727                 case APIC_OFFSET_RRR:
1728                 case APIC_OFFSET_CMCI_LVT:
1729                 case APIC_OFFSET_TIMER_CCR:
1730                         allowed = 1;
1731                         break;
1732                 default:
1733                         break;
1734                 }
1735         } else if (access_type == 1) {
1736                 /*
1737                  * Write data access to the following registers is expected.
1738                  */
1739                 switch (offset) {
1740                 case APIC_OFFSET_VER:
1741                 case APIC_OFFSET_APR:
1742                 case APIC_OFFSET_PPR:
1743                 case APIC_OFFSET_RRR:
1744                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1745                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1746                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1747                 case APIC_OFFSET_CMCI_LVT:
1748                 case APIC_OFFSET_TIMER_CCR:
1749                         allowed = 1;
1750                         break;
1751                 default:
1752                         break;
1753                 }
1754         }
1755
1756         if (allowed) {
1757                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1758                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1759                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1760                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1761                 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1762                 vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1763         }
1764
1765         /*
1766          * Regardless of whether the APIC access is allowed, this handler
1767          * always returns UNHANDLED:
1768          * - if the access is allowed then it is handled by emulating the
1769          *   instruction that caused the VM-exit (outside the critical section)
1770          * - if the access is not allowed then it will be converted to an
1771          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1772          */
1773         return (UNHANDLED);
1774 }
1775
1776 static int
1777 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1778 {
1779         int error, handled;
1780         struct vmxctx *vmxctx;
1781         struct vlapic *vlapic;
1782         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1783         uint64_t qual, gpa;
1784         bool retu;
1785
1786         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1787         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1788
1789         handled = UNHANDLED;
1790         vmxctx = &vmx->ctx[vcpu];
1791
1792         qual = vmexit->u.vmx.exit_qualification;
1793         reason = vmexit->u.vmx.exit_reason;
1794         vmexit->exitcode = VM_EXITCODE_BOGUS;
1795
1796         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1797
1798         /*
1799          * VM exits that could be triggered during event injection on the
1800          * previous VM entry need to be handled specially by re-injecting
1801          * the event.
1802          *
1803          * See "Information for VM Exits During Event Delivery" in Intel SDM
1804          * for details.
1805          */
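             /*
              * The IDT-vectoring information field has the same layout as the
              * VM-entry interruption-information field: bits 7:0 vector,
              * bits 10:8 interruption type, bit 11 error-code valid, bit 31
              * valid.  Bit 12 is undefined on these exits but reserved in the
              * VM-entry field, so it is cleared before re-injection.
              */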
1806         switch (reason) {
1807         case EXIT_REASON_EPT_FAULT:
1808         case EXIT_REASON_EPT_MISCONFIG:
1809         case EXIT_REASON_APIC_ACCESS:
1810         case EXIT_REASON_TASK_SWITCH:
1811         case EXIT_REASON_EXCEPTION:
1812                 idtvec_info = vmcs_idt_vectoring_info();
1813                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1814                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1815                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1816                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1817                                 idtvec_err = vmcs_idt_vectoring_err();
1818                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1819                                     idtvec_err);
1820                         }
1821                         /*
1822                          * If 'virtual NMIs' are being used and the VM-exit
1823                          * happened while injecting an NMI during the previous
1824                          * VM-entry, then clear "blocking by NMI" in the Guest
1825                          * Interruptibility-state.
1826                          */
1827                         if ((idtvec_info & VMCS_INTR_T_MASK) ==
1828                             VMCS_INTR_T_NMI) {
1829                                  vmx_clear_nmi_blocking(vmx, vcpu);
1830                         }
1831                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1832                 }
                     break;
1833         default:
1834                 idtvec_info = 0;
1835                 break;
1836         }
1837
1838         switch (reason) {
1839         case EXIT_REASON_CR_ACCESS:
1840                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1841                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1842                 break;
1843         case EXIT_REASON_RDMSR:
1844                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1845                 retu = false;
1846                 ecx = vmxctx->guest_rcx;
1847                 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
1848                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1849                 if (error) {
1850                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1851                         vmexit->u.msr.code = ecx;
1852                 } else if (!retu) {
1853                         handled = HANDLED;
1854                 } else {
1855                         /* Return to userspace with a valid exitcode */
1856                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1857                             ("emulate_rdmsr retu with bogus exitcode"));
1858                 }
1859                 break;
1860         case EXIT_REASON_WRMSR:
1861                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1862                 retu = false;
1863                 eax = vmxctx->guest_rax;
1864                 ecx = vmxctx->guest_rcx;
1865                 edx = vmxctx->guest_rdx;
1866                 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
1867                     ecx, (uint64_t)edx << 32 | eax);
1868                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1869                     (uint64_t)edx << 32 | eax, &retu);
1870                 if (error) {
1871                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1872                         vmexit->u.msr.code = ecx;
1873                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1874                 } else if (!retu) {
1875                         handled = HANDLED;
1876                 } else {
1877                         /* Return to userspace with a valid exitcode */
1878                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1879                             ("emulate_wrmsr retu with bogus exitcode"));
1880                 }
1881                 break;
1882         case EXIT_REASON_HLT:
1883                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1884                 vmexit->exitcode = VM_EXITCODE_HLT;
1885                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1886                 break;
1887         case EXIT_REASON_MTF:
1888                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1889                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1890                 break;
1891         case EXIT_REASON_PAUSE:
1892                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1893                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1894                 break;
1895         case EXIT_REASON_INTR_WINDOW:
1896                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1897                 vmx_clear_int_window_exiting(vmx, vcpu);
1898                 return (1);
1899         case EXIT_REASON_EXT_INTR:
1900                 /*
1901                  * External interrupts serve only to cause VM exits and allow
1902                  * the host interrupt handler to run.
1903                  *
1904                  * If this external interrupt triggers a virtual interrupt
1905                  * to a VM, then that state will be recorded by the
1906                  * host interrupt handler in the VM's softc. We will inject
1907                  * this virtual interrupt during the subsequent VM enter.
1908                  */
1909                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1910
1911                 /*
1912                  * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
1913                  * This appears to be a bug in VMware Fusion?
1914                  */
1915                 if (!(intr_info & VMCS_INTR_VALID))
1916                         return (1);
1917                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1918                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1919                     ("VM exit interruption info invalid: %#x", intr_info));
1920                 vmx_trigger_hostintr(intr_info & 0xff);
1921
1922                 /*
1923                  * This is special. We want to treat this as a 'handled'
1924                  * VM-exit but not increment the instruction pointer.
1925                  */
1926                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1927                 return (1);
1928         case EXIT_REASON_NMI_WINDOW:
1929                 /* Exit to allow the pending virtual NMI to be injected */
1930                 if (vm_nmi_pending(vmx->vm, vcpu))
1931                         vmx_inject_nmi(vmx, vcpu);
1932                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1933                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1934                 return (1);
1935         case EXIT_REASON_INOUT:
1936                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1937                 vmexit->exitcode = VM_EXITCODE_INOUT;
1938                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1939                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1940                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1941                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1942                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1943                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1944                 error = emulate_ioport(vmx->vm, vcpu, vmexit);
1945                 if (error == 0)  {
1946                         handled = 1;
1947                         vmxctx->guest_rax = vmexit->u.inout.eax;
1948                 }
1949                 break;
1950         case EXIT_REASON_CPUID:
1951                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1952                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1953                 break;
1954         case EXIT_REASON_EXCEPTION:
1955                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1956                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1957                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1958                     ("VM exit interruption info invalid: %#x", intr_info));
1959
1960                 /*
1961                  * If Virtual NMIs control is 1 and the VM-exit is due to a
1962                  * fault encountered during the execution of IRET then we must
1963                  * restore the state of "virtual-NMI blocking" before resuming
1964                  * the guest.
1965                  *
1966                  * See "Resuming Guest Software after Handling an Exception".
1967                  */
1968                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1969                     (intr_info & 0xff) != IDT_DF &&
1970                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1971                         vmx_restore_nmi_blocking(vmx, vcpu);
1972
1973                 /*
1974                  * The NMI has already been handled in vmx_exit_handle_nmi().
1975                  */
1976                 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1977                         return (1);
1978                 break;
1979         case EXIT_REASON_EPT_FAULT:
1980                 /*
1981                  * If 'gpa' lies within the address space allocated to
1982                  * memory then this must be a nested page fault; otherwise
1983                  * it must be an instruction that accesses MMIO space.
1984                  */
1985                 gpa = vmcs_gpa();
1986                 if (vm_mem_allocated(vmx->vm, gpa) ||
1987                     apic_access_fault(vmx, vcpu, gpa)) {
1988                         vmexit->exitcode = VM_EXITCODE_PAGING;
1989                         vmexit->u.paging.gpa = gpa;
1990                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1991                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1992                 } else if (ept_emulation_fault(qual)) {
1993                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1994                         vmexit->u.inst_emul.gpa = gpa;
1995                         vmexit->u.inst_emul.gla = vmcs_gla();
1996                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1997                         vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1998                         vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1999                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
2000                 }
2001                 /*
2002                  * If Virtual NMIs control is 1 and the VM-exit is due to an
2003                  * EPT fault during the execution of IRET then we must restore
2004                  * the state of "virtual-NMI blocking" before resuming.
2005                  *
2006                  * See description of "NMI unblocking due to IRET" in
2007                  * "Exit Qualification for EPT Violations".
2008                  */
2009                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2010                     (qual & EXIT_QUAL_NMIUDTI) != 0)
2011                         vmx_restore_nmi_blocking(vmx, vcpu);
2012                 break;
2013         case EXIT_REASON_VIRTUALIZED_EOI:
2014                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2015                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2016                 vmexit->inst_length = 0;        /* trap-like */
2017                 break;
2018         case EXIT_REASON_APIC_ACCESS:
2019                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
2020                 break;
2021         case EXIT_REASON_APIC_WRITE:
2022                 /*
2023                  * APIC-write VM exit is trap-like so the %rip is already
2024                  * pointing to the next instruction.
2025                  */
2026                 vmexit->inst_length = 0;
2027                 vlapic = vm_lapic(vmx->vm, vcpu);
2028                 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
2029                 break;
2030         case EXIT_REASON_XSETBV:
2031                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2032                 break;
2033         default:
2034                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2035                 break;
2036         }
2037
2038         if (handled) {
2039                 /*
2040                  * It is possible that control is returned to userland
2041                  * even though we were able to handle the VM exit in the
2042                  * kernel.
2043                  *
2044                  * In such a case we want to make sure that the userland
2045                  * restarts guest execution at the instruction *after*
2046                  * the one we just processed. Therefore we update the
2047                  * guest rip in the VMCS and in 'vmexit'.
2048                  */
2049                 vmexit->rip += vmexit->inst_length;
2050                 vmexit->inst_length = 0;
2051                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2052         } else {
2053                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2054                         /*
2055                          * If this VM exit was not claimed by anybody then
2056                          * treat it as a generic VMX exit.
2057                          */
2058                         vmexit->exitcode = VM_EXITCODE_VMX;
2059                         vmexit->u.vmx.status = VM_SUCCESS;
2060                         vmexit->u.vmx.inst_type = 0;
2061                         vmexit->u.vmx.inst_error = 0;
2062                 } else {
2063                         /*
2064                          * The exitcode and collateral have been populated.
2065                          * The VM exit will be processed further in userland.
2066                          */
2067                 }
2068         }
2069         return (handled);
2070 }
2071
2072 static __inline int
2073 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2074 {
2075
2076         vmexit->rip = vmcs_guest_rip();
2077         vmexit->inst_length = 0;
2078         vmexit->exitcode = VM_EXITCODE_BOGUS;
2079         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
2080         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
2081
2082         return (HANDLED);
2083 }
2084
2085 static __inline int
2086 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2087 {
2088
2089         vmexit->rip = vmcs_guest_rip();
2090         vmexit->inst_length = 0;
2091         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
2092         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
2093
2094         return (UNHANDLED);
2095 }
2096
2097 static __inline int
2098 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2099 {
2100
2101         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2102             ("vmx_exit_inst_error: invalid inst_fail_status %d",
2103             vmxctx->inst_fail_status));
2104
2105         vmexit->inst_length = 0;
2106         vmexit->exitcode = VM_EXITCODE_VMX;
2107         vmexit->u.vmx.status = vmxctx->inst_fail_status;
2108         vmexit->u.vmx.inst_error = vmcs_instruction_error();
2109         vmexit->u.vmx.exit_reason = ~0;
2110         vmexit->u.vmx.exit_qualification = ~0;
2111
2112         switch (rc) {
2113         case VMX_VMRESUME_ERROR:
2114         case VMX_VMLAUNCH_ERROR:
2115         case VMX_INVEPT_ERROR:
2116                 vmexit->u.vmx.inst_type = rc;
2117                 break;
2118         default:
2119                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
2120         }
2121
2122         return (UNHANDLED);
2123 }
2124
2125 /*
2126  * If the NMI-exiting VM execution control is set to '1' then an NMI in
2127  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2128  * sufficient to simply vector to the NMI handler via a software interrupt.
2129  * However, this must be done before maskable interrupts are enabled;
2130  * otherwise the "iret" issued by an interrupt handler will incorrectly
2131  * clear NMI blocking.
2132  */
2133 static __inline void
2134 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2135 {
2136         uint32_t intr_info;
2137
2138         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2139
2140         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2141                 return;
2142
2143         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2144         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2145             ("VM exit interruption info invalid: %#x", intr_info));
2146
2147         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2148                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2149                     "to NMI has invalid vector: %#x", intr_info));
2150                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
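                     /*
                      * IDT_NMI is vector 2; a software "int $2" dispatches to
                      * the host's NMI handler through the IDT.
                      */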
2151                 __asm __volatile("int $2");
2152         }
2153 }
2154
2155 static int
2156 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
2157     void *rendezvous_cookie, void *suspend_cookie)
2158 {
2159         int rc, handled, launched;
2160         struct vmx *vmx;
2161         struct vm *vm;
2162         struct vmxctx *vmxctx;
2163         struct vmcs *vmcs;
2164         struct vm_exit *vmexit;
2165         struct vlapic *vlapic;
2166         uint64_t rip;
2167         uint32_t exit_reason;
2168
2169         vmx = arg;
2170         vm = vmx->vm;
2171         vmcs = &vmx->vmcs[vcpu];
2172         vmxctx = &vmx->ctx[vcpu];
2173         vlapic = vm_lapic(vm, vcpu);
2174         vmexit = vm_exitinfo(vm, vcpu);
2175         launched = 0;
2176
2177         KASSERT(vmxctx->pmap == pmap,
2178             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2179
2180         VMPTRLD(vmcs);
2181
2182         /*
2183          * XXX
2184          * We do this every time because we may setup the virtual machine
2185          * We do this every time because we may set up the virtual machine
2186          *
2187          * If the life of a virtual machine was spent entirely in the context
2188          * of a single process we could do this once in vmx_vminit().
2189          */
2190         vmcs_write(VMCS_HOST_CR3, rcr3());
2191
2192         vmcs_write(VMCS_GUEST_RIP, startrip);
2193         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2194         do {
2195                 /*
2196                  * Interrupts are disabled from this point on until the
2197                  * guest starts executing. This is done for the following
2198                  * reasons:
2199                  *
2200                  * If an AST is asserted on this thread after the check below,
2201                  * then the IPI_AST notification will not be lost, because it
2202                  * will cause a VM exit due to external interrupt as soon as
2203                  * the guest state is loaded.
2204                  *
2205                  * A posted interrupt after 'vmx_inject_interrupts()' will
2206                  * not be "lost" because it will be held pending in the host
2207                  * APIC because interrupts are disabled. The pending interrupt
2208                  * will be recognized as soon as the guest state is loaded.
2209                  *
2210                  * The same reasoning applies to the IPI generated by
2211                  * pmap_invalidate_ept().
2212                  */
2213                 disable_intr();
2214                 if (vcpu_suspended(suspend_cookie)) {
2215                         enable_intr();
2216                         vm_exit_suspended(vmx->vm, vcpu, vmcs_guest_rip());
2217                         handled = UNHANDLED;
2218                         break;
2219                 }
2220
2221                 if (vcpu_rendezvous_pending(rendezvous_cookie)) {
2222                         enable_intr();
2223                         handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
2224                         break;
2225                 }
2226
2227                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
2228                         enable_intr();
2229                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
2230                         break;
2231                 }
2232
2233                 vmx_inject_interrupts(vmx, vcpu, vlapic);
2234                 vmx_run_trace(vmx, vcpu);
2235                 rc = vmx_enter_guest(vmxctx, vmx, launched);
2236
2237                 /* Collect some information for VM exit processing */
2238                 vmexit->rip = rip = vmcs_guest_rip();
2239                 vmexit->inst_length = vmexit_instruction_length();
2240                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2241                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2242
2243                 if (rc == VMX_GUEST_VMEXIT) {
2244                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2245                         enable_intr();
2246                         handled = vmx_exit_process(vmx, vcpu, vmexit);
2247                 } else {
2248                         enable_intr();
2249                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
2250                 }
2251                 launched = 1;
2252                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2253         } while (handled);
2254
2255         /*
2256          * If a VM exit has been handled then the exitcode must be BOGUS;
2257          * if a VM exit is not handled then the exitcode must not be BOGUS.
2258          */
2259         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2260             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2261                 panic("Mismatch between handled (%d) and exitcode (%d)",
2262                       handled, vmexit->exitcode);
2263         }
2264
2265         if (!handled)
2266                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2267
2268         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2269             vmexit->exitcode);
2270
2271         VMCLEAR(vmcs);
2272         return (0);
2273 }
2274
2275 static void
2276 vmx_vmcleanup(void *arg)
2277 {
2278         int i;
2279         struct vmx *vmx = arg;
2280
2281         if (apic_access_virtualization(vmx, 0))
2282                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2283
2284         for (i = 0; i < VM_MAXCPU; i++)
2285                 vpid_free(vmx->state[i].vpid);
2286
2287         free(vmx, M_VMX);
2288
2289         return;
2290 }
2291
2292 static register_t *
2293 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2294 {
2295
2296         switch (reg) {
2297         case VM_REG_GUEST_RAX:
2298                 return (&vmxctx->guest_rax);
2299         case VM_REG_GUEST_RBX:
2300                 return (&vmxctx->guest_rbx);
2301         case VM_REG_GUEST_RCX:
2302                 return (&vmxctx->guest_rcx);
2303         case VM_REG_GUEST_RDX:
2304                 return (&vmxctx->guest_rdx);
2305         case VM_REG_GUEST_RSI:
2306                 return (&vmxctx->guest_rsi);
2307         case VM_REG_GUEST_RDI:
2308                 return (&vmxctx->guest_rdi);
2309         case VM_REG_GUEST_RBP:
2310                 return (&vmxctx->guest_rbp);
2311         case VM_REG_GUEST_R8:
2312                 return (&vmxctx->guest_r8);
2313         case VM_REG_GUEST_R9:
2314                 return (&vmxctx->guest_r9);
2315         case VM_REG_GUEST_R10:
2316                 return (&vmxctx->guest_r10);
2317         case VM_REG_GUEST_R11:
2318                 return (&vmxctx->guest_r11);
2319         case VM_REG_GUEST_R12:
2320                 return (&vmxctx->guest_r12);
2321         case VM_REG_GUEST_R13:
2322                 return (&vmxctx->guest_r13);
2323         case VM_REG_GUEST_R14:
2324                 return (&vmxctx->guest_r14);
2325         case VM_REG_GUEST_R15:
2326                 return (&vmxctx->guest_r15);
2327         default:
2328                 break;
2329         }
2330         return (NULL);
2331 }
2332
2333 static int
2334 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2335 {
2336         register_t *regp;
2337
2338         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2339                 *retval = *regp;
2340                 return (0);
2341         } else
2342                 return (EINVAL);
2343 }
2344
2345 static int
2346 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2347 {
2348         register_t *regp;
2349
2350         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2351                 *regp = val;
2352                 return (0);
2353         } else
2354                 return (EINVAL);
2355 }
2356
2357 static int
2358 vmx_shadow_reg(int reg)
2359 {
2360         int shreg;
2361
2362         shreg = -1;
2363
2364         switch (reg) {
2365         case VM_REG_GUEST_CR0:
2366                 shreg = VMCS_CR0_SHADOW;
2367                 break;
2368         case VM_REG_GUEST_CR4:
2369                 shreg = VMCS_CR4_SHADOW;
2370                 break;
2371         default:
2372                 break;
2373         }
2374
2375         return (shreg);
2376 }
2377
2378 static int
2379 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2380 {
2381         int running, hostcpu;
2382         struct vmx *vmx = arg;
2383
2384         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2385         if (running && hostcpu != curcpu)
2386                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2387
2388         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2389                 return (0);
2390
2391         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2392 }
2393
2394 static int
2395 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2396 {
2397         int error, hostcpu, running, shadow;
2398         uint64_t ctls;
2399         struct vmx *vmx = arg;
2400
2401         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2402         if (running && hostcpu != curcpu)
2403                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2404
2405         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2406                 return (0);
2407
2408         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2409
2410         if (error == 0) {
2411                 /*
2412                  * If the "load EFER" VM-entry control is 1 then the
2413                  * value of EFER.LMA must be identical to "IA-32e mode guest"
2414                  * bit in the VM-entry control.
2415                  */
2416                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2417                     (reg == VM_REG_GUEST_EFER)) {
2418                         vmcs_getreg(&vmx->vmcs[vcpu], running,
2419                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2420                         if (val & EFER_LMA)
2421                                 ctls |= VM_ENTRY_GUEST_LMA;
2422                         else
2423                                 ctls &= ~VM_ENTRY_GUEST_LMA;
2424                         vmcs_setreg(&vmx->vmcs[vcpu], running,
2425                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2426                 }
2427
2428                 shadow = vmx_shadow_reg(reg);
2429                 if (shadow > 0) {
2430                         /*
2431                          * Store the unmodified value in the CR shadow.
2432                          */
2433                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2434                                     VMCS_IDENT(shadow), val);
2435                 }
2436         }
2437
2438         return (error);
2439 }
2440
2441 static int
2442 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2443 {
2444         int hostcpu, running;
2445         struct vmx *vmx = arg;
2446
2447         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2448         if (running && hostcpu != curcpu)
2449                 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2450
2451         return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
2452 }
2453
2454 static int
2455 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2456 {
2457         int hostcpu, running;
2458         struct vmx *vmx = arg;
2459
2460         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2461         if (running && hostcpu != curcpu)
2462                 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2463
2464         return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
2465 }
2466
2467 static int
2468 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2469 {
2470         struct vmx *vmx = arg;
2471         int vcap;
2472         int ret;
2473
2474         ret = ENOENT;
2475
2476         vcap = vmx->cap[vcpu].set;
2477
2478         switch (type) {
2479         case VM_CAP_HALT_EXIT:
2480                 if (cap_halt_exit)
2481                         ret = 0;
2482                 break;
2483         case VM_CAP_PAUSE_EXIT:
2484                 if (cap_pause_exit)
2485                         ret = 0;
2486                 break;
2487         case VM_CAP_MTRAP_EXIT:
2488                 if (cap_monitor_trap)
2489                         ret = 0;
2490                 break;
2491         case VM_CAP_UNRESTRICTED_GUEST:
2492                 if (cap_unrestricted_guest)
2493                         ret = 0;
2494                 break;
2495         case VM_CAP_ENABLE_INVPCID:
2496                 if (cap_invpcid)
2497                         ret = 0;
2498                 break;
2499         default:
2500                 break;
2501         }
2502
2503         if (ret == 0)
2504                 *retval = (vcap & (1 << type)) ? 1 : 0;
2505
2506         return (ret);
2507 }
2508
2509 static int
2510 vmx_setcap(void *arg, int vcpu, int type, int val)
2511 {
2512         struct vmx *vmx = arg;
2513         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2514         uint32_t baseval;
2515         uint32_t *pptr;
2516         int error;
2517         int flag;
2518         int reg;
2519         int retval;
2520
2521         retval = ENOENT;
2522         pptr = NULL;
2523
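             /*
              * Each capability is backed by a single bit in one of the
              * processor-based VM-execution control fields; e.g.
              * VM_CAP_HALT_EXIT toggles PROCBASED_HLT_EXITING in the primary
              * controls and VM_CAP_ENABLE_INVPCID toggles
              * PROCBASED2_ENABLE_INVPCID in the secondary controls.
              */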
2524         switch (type) {
2525         case VM_CAP_HALT_EXIT:
2526                 if (cap_halt_exit) {
2527                         retval = 0;
2528                         pptr = &vmx->cap[vcpu].proc_ctls;
2529                         baseval = *pptr;
2530                         flag = PROCBASED_HLT_EXITING;
2531                         reg = VMCS_PRI_PROC_BASED_CTLS;
2532                 }
2533                 break;
2534         case VM_CAP_MTRAP_EXIT:
2535                 if (cap_monitor_trap) {
2536                         retval = 0;
2537                         pptr = &vmx->cap[vcpu].proc_ctls;
2538                         baseval = *pptr;
2539                         flag = PROCBASED_MTF;
2540                         reg = VMCS_PRI_PROC_BASED_CTLS;
2541                 }
2542                 break;
2543         case VM_CAP_PAUSE_EXIT:
2544                 if (cap_pause_exit) {
2545                         retval = 0;
2546                         pptr = &vmx->cap[vcpu].proc_ctls;
2547                         baseval = *pptr;
2548                         flag = PROCBASED_PAUSE_EXITING;
2549                         reg = VMCS_PRI_PROC_BASED_CTLS;
2550                 }
2551                 break;
2552         case VM_CAP_UNRESTRICTED_GUEST:
2553                 if (cap_unrestricted_guest) {
2554                         retval = 0;
2555                         pptr = &vmx->cap[vcpu].proc_ctls2;
2556                         baseval = *pptr;
2557                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2558                         reg = VMCS_SEC_PROC_BASED_CTLS;
2559                 }
2560                 break;
2561         case VM_CAP_ENABLE_INVPCID:
2562                 if (cap_invpcid) {
2563                         retval = 0;
2564                         pptr = &vmx->cap[vcpu].proc_ctls2;
2565                         baseval = *pptr;
2566                         flag = PROCBASED2_ENABLE_INVPCID;
2567                         reg = VMCS_SEC_PROC_BASED_CTLS;
2568                 }
2569                 break;
2570         default:
2571                 break;
2572         }
2573
2574         if (retval == 0) {
2575                 if (val) {
2576                         baseval |= flag;
2577                 } else {
2578                         baseval &= ~flag;
2579                 }
2580                 VMPTRLD(vmcs);
2581                 error = vmwrite(reg, baseval);
2582                 VMCLEAR(vmcs);
2583
2584                 if (error) {
2585                         retval = error;
2586                 } else {
2587                         /*
2588                          * Update the optional stored flags, and record
2589                          * the new setting.
2590                          */
2591                         if (pptr != NULL) {
2592                                 *pptr = baseval;
2593                         }
2594
2595                         if (val) {
2596                                 vmx->cap[vcpu].set |= (1 << type);
2597                         } else {
2598                                 vmx->cap[vcpu].set &= ~(1 << type);
2599                         }
2600                 }
2601         }
2602
2603         return (retval);
2604 }
2605
2606 struct vlapic_vtx {
2607         struct vlapic   vlapic;
2608         struct pir_desc *pir_desc;
2609         struct vmx      *vmx;
2610 };
2611
2612 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2613 do {                                                                    \
2614         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2615             level ? "level" : "edge", vector);                          \
2616         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2617         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2618         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2619         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2620         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2621 } while (0)
2622
2623 /*
2624  * vlapic->ops handlers that utilize the APICv hardware assist described in
2625  * Chapter 29 of the Intel SDM.
2626  */
2627 static int
2628 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2629 {
2630         struct vlapic_vtx *vlapic_vtx;
2631         struct pir_desc *pir_desc;
2632         uint64_t mask;
2633         int idx, notify;
2634
2635         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2636         pir_desc = vlapic_vtx->pir_desc;
2637
2638         /*
2639          * Keep track of interrupt requests in the PIR descriptor. This is
2640          * because the virtual APIC page pointed to by the VMCS cannot be
2641          * modified if the vcpu is running.
2642          */
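             /*
              * The PIR is a 256-bit bitmap held in four 64-bit words, so
              * e.g. vector 65 sets bit 1 of pir[1].  'notify' is 1 only for
              * the caller that transitions 'pending' from 0 to 1, so at most
              * one notification IPI is sent per batch of posted vectors.
              */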
2643         idx = vector / 64;
2644         mask = 1UL << (vector % 64);
2645         atomic_set_long(&pir_desc->pir[idx], mask);
2646         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2647
2648         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2649             level, "vmx_set_intr_ready");
2650         return (notify);
2651 }
2652
2653 static int
2654 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2655 {
2656         struct vlapic_vtx *vlapic_vtx;
2657         struct pir_desc *pir_desc;
2658         struct LAPIC *lapic;
2659         uint64_t pending, pirval;
2660         uint32_t ppr, vpr;
2661         int i;
2662
2663         /*
2664          * This function is only expected to be called from the 'HLT' exit
2665          * handler which does not care about the vector that is pending.
2666          */
2667         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2668
2669         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2670         pir_desc = vlapic_vtx->pir_desc;
2671
2672         pending = atomic_load_acq_long(&pir_desc->pending);
2673         if (!pending)
2674                 return (0);     /* common case */
2675
2676         /*
2677          * If there is an interrupt pending then it will be recognized only
2678          * if its priority is greater than the processor priority.
2679          *
2680          * Special case: if the processor priority is zero then any pending
2681          * interrupt will be recognized.
2682          */
2683         lapic = vlapic->apic_page;
2684         ppr = lapic->ppr & 0xf0;
2685         if (ppr == 0)
2686                 return (1);
2687
2688         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2689             lapic->ppr);
2690
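             /*
              * Scan the PIR from the highest word down.  flsl() returns the
              * 1-based index of the most significant set bit, so 'vpr' is the
              * priority class (bits 7:4) of the highest pending vector; e.g.
              * a highest pending vector of 0x51 yields vpr 0x50, which is
              * recognized only if it exceeds the PPR class.
              */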
2691         for (i = 3; i >= 0; i--) {
2692                 pirval = pir_desc->pir[i];
2693                 if (pirval != 0) {
2694                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2695                         return (vpr > ppr);
2696                 }
2697         }
2698         return (0);
2699 }
2700
2701 static void
2702 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2703 {
2704
2705         panic("vmx_intr_accepted: not expected to be called");
2706 }
2707
2708 static void
2709 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2710 {
2711         struct vlapic_vtx *vlapic_vtx;
2712         struct vmx *vmx;
2713         struct vmcs *vmcs;
2714         uint64_t mask, val;
2715
2716         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2717         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2718             ("vmx_set_tmr: vcpu cannot be running"));
2719
2720         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2721         vmx = vlapic_vtx->vmx;
2722         vmcs = &vmx->vmcs[vlapic->vcpuid];
2723         mask = 1UL << (vector % 64);
2724
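             /*
              * The EOI-exit bitmap is 256 bits spread across four 64-bit VMCS
              * fields, one bit per vector.  Setting the bit for a
              * level-triggered vector makes the guest's EOI for that vector
              * cause a virtualized-EOI VM-exit so it can be forwarded to the
              * virtual I/O APIC.
              */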
2725         VMPTRLD(vmcs);
2726         val = vmcs_read(VMCS_EOI_EXIT(vector));
2727         if (level)
2728                 val |= mask;
2729         else
2730                 val &= ~mask;
2731         vmcs_write(VMCS_EOI_EXIT(vector), val);
2732         VMCLEAR(vmcs);
2733 }
2734
2735 static void
2736 vmx_enable_x2apic_mode(struct vlapic *vlapic)
2737 {
2738         struct vmx *vmx;
2739         struct vmcs *vmcs;
2740         uint32_t proc_ctls2;
2741         int vcpuid, error;
2742
2743         vcpuid = vlapic->vcpuid;
2744         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
2745         vmcs = &vmx->vmcs[vcpuid];
2746
2747         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
2748         KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
2749             ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
2750
2751         proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
2752         proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
2753         vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
2754
2755         VMPTRLD(vmcs);
2756         vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
2757         VMCLEAR(vmcs);
2758
2759         if (vlapic->vcpuid == 0) {
2760                 /*
2761                  * The nested page table mappings are shared by all vcpus
2762                  * so unmap the APIC access page just once.
2763                  */
2764                 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2765                 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
2766                     __func__, error));
2767
2768                 /*
2769                  * The MSR bitmap is shared by all vcpus so modify it only
2770                  * once in the context of vcpu 0.
2771                  */
2772                 error = vmx_allow_x2apic_msrs(vmx);
2773                 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
2774                     __func__, error));
2775         }
2776 }
2777
2778 static void
2779 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2780 {
2781
2782         ipi_cpu(hostcpu, pirvec);
2783 }
2784
2785 /*
2786  * Transfer the pending interrupts in the PIR descriptor to the IRR
2787  * in the virtual APIC page.
2788  */
2789 static void
2790 vmx_inject_pir(struct vlapic *vlapic)
2791 {
2792         struct vlapic_vtx *vlapic_vtx;
2793         struct pir_desc *pir_desc;
2794         struct LAPIC *lapic;
2795         uint64_t val, pirval;
2796         int rvi, pirbase = -1;
2797         uint16_t intr_status_old, intr_status_new;
2798
2799         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2800         pir_desc = vlapic_vtx->pir_desc;
2801         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2802                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2803                     "no posted interrupt pending");
2804                 return;
2805         }
2806
2807         pirval = 0;
2808         pirbase = -1;
2809         lapic = vlapic->apic_page;
2810
2811         val = atomic_readandclear_long(&pir_desc->pir[0]);
2812         if (val != 0) {
2813                 lapic->irr0 |= val;
2814                 lapic->irr1 |= val >> 32;
2815                 pirbase = 0;
2816                 pirval = val;
2817         }
2818
2819         val = atomic_readandclear_long(&pir_desc->pir[1]);
2820         if (val != 0) {
2821                 lapic->irr2 |= val;
2822                 lapic->irr3 |= val >> 32;
2823                 pirbase = 64;
2824                 pirval = val;
2825         }
2826
2827         val = atomic_readandclear_long(&pir_desc->pir[2]);
2828         if (val != 0) {
2829                 lapic->irr4 |= val;
2830                 lapic->irr5 |= val >> 32;
2831                 pirbase = 128;
2832                 pirval = val;
2833         }
2834
2835         val = atomic_readandclear_long(&pir_desc->pir[3]);
2836         if (val != 0) {
2837                 lapic->irr6 |= val;
2838                 lapic->irr7 |= val >> 32;
2839                 pirbase = 192;
2840                 pirval = val;
2841         }
2842
2843         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2844
2845         /*
2846          * Update RVI so the processor can evaluate pending virtual
2847          * interrupts on VM-entry.
2848          *
2849          * It is possible for pirval to be 0 here, even though the
2850          * pending bit has been set. The scenario is:
2851          * CPU-Y is sending a posted interrupt to CPU-X, which
2852          * is running a guest and processing posted interrupts in h/w.
2853          * CPU-X will eventually exit and the state seen in s/w is
2854          * the pending bit set, but no PIR bits set.
2855          *
2856          *      CPU-X                      CPU-Y
2857          *   (vm running)                (host running)
2858          *   rx posted interrupt
2859          *   CLEAR pending bit
2860          *                               SET PIR bit
2861          *   READ/CLEAR PIR bits
2862          *                               SET pending bit
2863          *   (vm exit)
2864          *   pending bit set, PIR 0
2865          */
2866         if (pirval != 0) {
2867                 rvi = pirbase + flsl(pirval) - 1;
2868                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2869                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2870                 if (intr_status_new > intr_status_old) {
2871                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2872                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2873                             "guest_intr_status changed from 0x%04x to 0x%04x",
2874                             intr_status_old, intr_status_new);
2875                 }
2876         }
2877 }
2878
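/*
 * Allocate and initialize the vlapic for a vcpu and wire up its VT-x
 * specific backing state (the virtual APIC page and the posted-interrupt
 * descriptor).  The accelerated callbacks are installed only when the
 * corresponding capabilities are enabled (virtual_interrupt_delivery,
 * posted_interrupts).
 */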
2879 static struct vlapic *
2880 vmx_vlapic_init(void *arg, int vcpuid)
2881 {
2882         struct vmx *vmx;
2883         struct vlapic *vlapic;
2884         struct vlapic_vtx *vlapic_vtx;
2885         
2886         vmx = arg;
2887
2888         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2889         vlapic->vm = vmx->vm;
2890         vlapic->vcpuid = vcpuid;
2891         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2892
2893         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2894         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2895         vlapic_vtx->vmx = vmx;
2896
2897         if (virtual_interrupt_delivery) {
2898                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2899                 vlapic->ops.pending_intr = vmx_pending_intr;
2900                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2901                 vlapic->ops.set_tmr = vmx_set_tmr;
2902                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
2903         }
2904
2905         if (posted_interrupts)
2906                 vlapic->ops.post_intr = vmx_post_intr;
2907
2908         vlapic_init(vlapic);
2909
2910         return (vlapic);
2911 }
2912
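/*
 * Undo vmx_vlapic_init(): tear down the common vlapic state and release
 * the containing vlapic_vtx allocation.
 */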
2913 static void
2914 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2915 {
2916
2917         vlapic_cleanup(vlapic);
2918         free(vlapic, M_VLAPIC);
2919 }
2920
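/*
 * The initializers below are positional and must remain in the same order
 * as the function pointer members of 'struct vmm_ops' in <machine/vmm.h>.
 */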
2921 struct vmm_ops vmm_ops_intel = {
2922         vmx_init,
2923         vmx_cleanup,
2924         vmx_restore,
2925         vmx_vminit,
2926         vmx_run,
2927         vmx_vmcleanup,
2928         vmx_getreg,
2929         vmx_setreg,
2930         vmx_getdesc,
2931         vmx_setdesc,
2932         vmx_getcap,
2933         vmx_setcap,
2934         ept_vmspace_alloc,
2935         ept_vmspace_free,
2936         vmx_vlapic_init,
2937         vmx_vlapic_cleanup,
2938 };