1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include "vmm_host.h"
54 #include "vmm_ipi.h"
55 #include "vmm_msr.h"
56 #include "vmm_ktr.h"
57 #include "vmm_stat.h"
58 #include "vlapic.h"
59 #include "vlapic_priv.h"
60
61 #include "vmx_msr.h"
62 #include "ept.h"
63 #include "vmx_cpufunc.h"
64 #include "vmx.h"
65 #include "x86.h"
66 #include "vmx_controls.h"
67
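   /*
    * Desired settings for the VMX controls defined below: bits named in a
    * ONE_SETTING macro must be set and bits named in a ZERO_SETTING macro
    * must be clear in the corresponding VMCS control field.
    */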
68 #define PINBASED_CTLS_ONE_SETTING                                       \
69         (PINBASED_EXTINT_EXITING        |                               \
70          PINBASED_NMI_EXITING           |                               \
71          PINBASED_VIRTUAL_NMI)
72 #define PINBASED_CTLS_ZERO_SETTING      0
73
74 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
75         (PROCBASED_INT_WINDOW_EXITING   |                               \
76          PROCBASED_NMI_WINDOW_EXITING)
77
78 #define PROCBASED_CTLS_ONE_SETTING                                      \
79         (PROCBASED_SECONDARY_CONTROLS   |                               \
80          PROCBASED_IO_EXITING           |                               \
81          PROCBASED_MSR_BITMAPS          |                               \
82          PROCBASED_CTLS_WINDOW_SETTING)
83 #define PROCBASED_CTLS_ZERO_SETTING     \
84         (PROCBASED_CR3_LOAD_EXITING |   \
85         PROCBASED_CR3_STORE_EXITING |   \
86         PROCBASED_IO_BITMAPS)
87
88 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
89 #define PROCBASED_CTLS2_ZERO_SETTING    0
90
91 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
92         (VM_EXIT_HOST_LMA                       |                       \
93         VM_EXIT_SAVE_EFER                       |                       \
94         VM_EXIT_LOAD_EFER)
95
96 #define VM_EXIT_CTLS_ONE_SETTING                                        \
97         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
98         VM_EXIT_ACKNOWLEDGE_INTERRUPT           |                       \
99         VM_EXIT_SAVE_PAT                        |                       \
100         VM_EXIT_LOAD_PAT)
101 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
102
103 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
104
105 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
106         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
107         VM_ENTRY_LOAD_PAT)
108 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
109         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
110         VM_ENTRY_INTO_SMM                       |                       \
111         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
112
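    /*
     * Allow the guest to read and write the MSR directly, i.e. without
     * causing VM-exits, by granting both read and write access in the
     * MSR bitmap.
     */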
113 #define guest_msr_rw(vmx, msr) \
114         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
115
116 #define HANDLED         1
117 #define UNHANDLED       0
118
119 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
120 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
121
122 SYSCTL_DECL(_hw_vmm);
123 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
124
125 int vmxon_enabled[MAXCPU];
126 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
127
128 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
129 static uint32_t exit_ctls, entry_ctls;
130
131 static uint64_t cr0_ones_mask, cr0_zeros_mask;
132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
133              &cr0_ones_mask, 0, NULL);
134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
135              &cr0_zeros_mask, 0, NULL);
136
137 static uint64_t cr4_ones_mask, cr4_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
139              &cr4_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
141              &cr4_zeros_mask, 0, NULL);
142
143 static int vmx_no_patmsr;
144
145 static int vmx_initialized;
146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
147            &vmx_initialized, 0, "Intel VMX initialized");
148
149 /*
150  * Virtual NMI blocking conditions.
151  *
152  * Some processor implementations also require NMI to be blocked if
153  * the STI_BLOCKING bit is set. It is possible to detect this at runtime
154  * based on the (exit_reason,exit_qual) tuple being set to 
155  * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
156  *
157  * We take the easy way out and also include STI_BLOCKING as one of the
158  * gating items for vNMI injection.
159  */
160 static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
161                                     VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
162                                     VMCS_INTERRUPTIBILITY_STI_BLOCKING;
163
164 /*
165  * Optional capabilities
166  */
167 static int cap_halt_exit;
168 static int cap_pause_exit;
169 static int cap_unrestricted_guest;
170 static int cap_monitor_trap;
171 static int cap_invpcid;
172
173 static int virtual_interrupt_delivery;
174 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
175     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
176
177 static int posted_interrupts;
178 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
179     &posted_interrupts, 0, "APICv posted interrupt support");
180
181 static int pirvec;
182 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
183     &pirvec, 0, "APICv posted interrupt vector");
184
185 static struct unrhdr *vpid_unr;
186 static u_int vpid_alloc_failed;
187 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
188             &vpid_alloc_failed, 0, NULL);
189
190 /*
191  * Use the last page below 4GB as the APIC access address. This address is
192  * occupied by the boot firmware so it is guaranteed that it will not conflict
193  * with a page in system memory.
194  */
195 #define APIC_ACCESS_ADDRESS     0xFFFFF000
196
197 static void vmx_inject_pir(struct vlapic *vlapic);
198
199 #ifdef KTR
200 static const char *
201 exit_reason_to_str(int reason)
202 {
203         static char reasonbuf[32];
204
205         switch (reason) {
206         case EXIT_REASON_EXCEPTION:
207                 return "exception";
208         case EXIT_REASON_EXT_INTR:
209                 return "extint";
210         case EXIT_REASON_TRIPLE_FAULT:
211                 return "triplefault";
212         case EXIT_REASON_INIT:
213                 return "init";
214         case EXIT_REASON_SIPI:
215                 return "sipi";
216         case EXIT_REASON_IO_SMI:
217                 return "iosmi";
218         case EXIT_REASON_SMI:
219                 return "smi";
220         case EXIT_REASON_INTR_WINDOW:
221                 return "intrwindow";
222         case EXIT_REASON_NMI_WINDOW:
223                 return "nmiwindow";
224         case EXIT_REASON_TASK_SWITCH:
225                 return "taskswitch";
226         case EXIT_REASON_CPUID:
227                 return "cpuid";
228         case EXIT_REASON_GETSEC:
229                 return "getsec";
230         case EXIT_REASON_HLT:
231                 return "hlt";
232         case EXIT_REASON_INVD:
233                 return "invd";
234         case EXIT_REASON_INVLPG:
235                 return "invlpg";
236         case EXIT_REASON_RDPMC:
237                 return "rdpmc";
238         case EXIT_REASON_RDTSC:
239                 return "rdtsc";
240         case EXIT_REASON_RSM:
241                 return "rsm";
242         case EXIT_REASON_VMCALL:
243                 return "vmcall";
244         case EXIT_REASON_VMCLEAR:
245                 return "vmclear";
246         case EXIT_REASON_VMLAUNCH:
247                 return "vmlaunch";
248         case EXIT_REASON_VMPTRLD:
249                 return "vmptrld";
250         case EXIT_REASON_VMPTRST:
251                 return "vmptrst";
252         case EXIT_REASON_VMREAD:
253                 return "vmread";
254         case EXIT_REASON_VMRESUME:
255                 return "vmresume";
256         case EXIT_REASON_VMWRITE:
257                 return "vmwrite";
258         case EXIT_REASON_VMXOFF:
259                 return "vmxoff";
260         case EXIT_REASON_VMXON:
261                 return "vmxon";
262         case EXIT_REASON_CR_ACCESS:
263                 return "craccess";
264         case EXIT_REASON_DR_ACCESS:
265                 return "draccess";
266         case EXIT_REASON_INOUT:
267                 return "inout";
268         case EXIT_REASON_RDMSR:
269                 return "rdmsr";
270         case EXIT_REASON_WRMSR:
271                 return "wrmsr";
272         case EXIT_REASON_INVAL_VMCS:
273                 return "invalvmcs";
274         case EXIT_REASON_INVAL_MSR:
275                 return "invalmsr";
276         case EXIT_REASON_MWAIT:
277                 return "mwait";
278         case EXIT_REASON_MTF:
279                 return "mtf";
280         case EXIT_REASON_MONITOR:
281                 return "monitor";
282         case EXIT_REASON_PAUSE:
283                 return "pause";
284         case EXIT_REASON_MCE:
285                 return "mce";
286         case EXIT_REASON_TPR:
287                 return "tpr";
288         case EXIT_REASON_APIC_ACCESS:
289                 return "apic-access";
290         case EXIT_REASON_GDTR_IDTR:
291                 return "gdtridtr";
292         case EXIT_REASON_LDTR_TR:
293                 return "ldtrtr";
294         case EXIT_REASON_EPT_FAULT:
295                 return "eptfault";
296         case EXIT_REASON_EPT_MISCONFIG:
297                 return "eptmisconfig";
298         case EXIT_REASON_INVEPT:
299                 return "invept";
300         case EXIT_REASON_RDTSCP:
301                 return "rdtscp";
302         case EXIT_REASON_VMX_PREEMPT:
303                 return "vmxpreempt";
304         case EXIT_REASON_INVVPID:
305                 return "invvpid";
306         case EXIT_REASON_WBINVD:
307                 return "wbinvd";
308         case EXIT_REASON_XSETBV:
309                 return "xsetbv";
310         case EXIT_REASON_APIC_WRITE:
311                 return "apic-write";
312         default:
313                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
314                 return (reasonbuf);
315         }
316 }
317 #endif  /* KTR */
318
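    /*
     * Force the CR0/CR4 bits that VMX operation requires to be fixed to 1
     * or 0, using the masks derived from MSR_VMX_CR{0,4}_FIXED{0,1} in
     * vmx_init().
     */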
319 u_long
320 vmx_fix_cr0(u_long cr0)
321 {
322
323         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
324 }
325
326 u_long
327 vmx_fix_cr4(u_long cr4)
328 {
329
330         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
331 }
332
333 static void
334 vpid_free(int vpid)
335 {
336         if (vpid < 0 || vpid > 0xffff)
337                 panic("vpid_free: invalid vpid %d", vpid);
338
339         /*
340          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
341          * the unit number allocator.
342          */
343
344         if (vpid > VM_MAXCPU)
345                 free_unr(vpid_unr, vpid);
346 }
347
348 static void
349 vpid_alloc(uint16_t *vpid, int num)
350 {
351         int i, x;
352
353         if (num <= 0 || num > VM_MAXCPU)
354                 panic("invalid number of vpids requested: %d", num);
355
356         /*
357          * If the "enable vpid" execution control is not enabled then the
358          * VPID is required to be 0 for all vcpus.
359          */
360         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
361                 for (i = 0; i < num; i++)
362                         vpid[i] = 0;
363                 return;
364         }
365
366         /*
367          * Allocate a unique VPID for each vcpu from the unit number allocator.
368          */
369         for (i = 0; i < num; i++) {
370                 x = alloc_unr(vpid_unr);
371                 if (x == -1)
372                         break;
373                 else
374                         vpid[i] = x;
375         }
376
377         if (i < num) {
378                 atomic_add_int(&vpid_alloc_failed, 1);
379
380                 /*
381                  * If the unit number allocator does not have enough unique
382                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
383                  *
384                  * These VPIDs are not unique across VMs but this does not
385                  * affect correctness because the combined mappings are also
386                  * tagged with the EP4TA which is unique for each VM.
387                  *
388                  * It is still sub-optimal because the invvpid will invalidate
389                  * combined mappings for a particular VPID across all EP4TAs.
390                  */
391                 while (i-- > 0)
392                         vpid_free(vpid[i]);
393
394                 for (i = 0; i < num; i++)
395                         vpid[i] = i + 1;
396         }
397 }
398
399 static void
400 vpid_init(void)
401 {
402         /*
403          * VPID 0 is required when the "enable VPID" execution control is
404          * disabled.
405          *
406          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
407          * unit number allocator does not have sufficient unique VPIDs to
408          * satisfy the allocation.
409          *
410          * The remaining VPIDs are managed by the unit number allocator.
411          */
412         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
413 }
414
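    /*
     * Build the list of guest MSRs (currently just MSR_KGSBASE) that are
     * saved on VM-exit and restored on VM-entry via the guest MSR save area.
     */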
415 static void
416 msr_save_area_init(struct msr_entry *g_area, int *g_count)
417 {
418         int cnt;
419
420         static struct msr_entry guest_msrs[] = {
421                 { MSR_KGSBASE, 0, 0 },
422         };
423
424         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
425         if (cnt > GUEST_MSR_MAX_ENTRIES)
426                 panic("guest msr save area overrun");
427         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
428         *g_count = cnt;
429 }
430
431 static void
432 vmx_disable(void *arg __unused)
433 {
434         struct invvpid_desc invvpid_desc = { 0 };
435         struct invept_desc invept_desc = { 0 };
436
437         if (vmxon_enabled[curcpu]) {
438                 /*
439                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
440                  *
441                  * VMXON or VMXOFF are not required to invalidate any TLB
442                  * caching structures, so invalidate them explicitly here to
443                  * avoid carrying stale entries between distinct VMX episodes.
444                  */
445                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
446                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
447                 vmxoff();
448         }
449         load_cr4(rcr4() & ~CR4_VMXE);
450 }
451
452 static int
453 vmx_cleanup(void)
454 {
455         
456         if (pirvec != 0)
457                 vmm_ipi_free(pirvec);
458
459         if (vpid_unr != NULL) {
460                 delete_unrhdr(vpid_unr);
461                 vpid_unr = NULL;
462         }
463
464         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
465
466         return (0);
467 }
468
469 static void
470 vmx_enable(void *arg __unused)
471 {
472         int error;
473
474         load_cr4(rcr4() | CR4_VMXE);
475
476         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
477         error = vmxon(vmxon_region[curcpu]);
478         if (error == 0)
479                 vmxon_enabled[curcpu] = 1;
480 }
481
482 static void
483 vmx_restore(void)
484 {
485
486         if (vmxon_enabled[curcpu])
487                 vmxon(vmxon_region[curcpu]);
488 }
489
490 static int
491 vmx_init(int ipinum)
492 {
493         int error, use_tpr_shadow;
494         uint64_t fixed0, fixed1, feature_control;
495         uint32_t tmp, procbased2_vid_bits;
496
497         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
498         if (!(cpu_feature2 & CPUID2_VMX)) {
499                 printf("vmx_init: processor does not support VMX operation\n");
500                 return (ENXIO);
501         }
502
503         /*
504          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
505          * are set (bits 0 and 2 respectively).
506          */
507         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
508         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
509             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
510                 printf("vmx_init: VMX operation disabled by BIOS\n");
511                 return (ENXIO);
512         }
513
514         /* Check support for primary processor-based VM-execution controls */
515         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
516                                MSR_VMX_TRUE_PROCBASED_CTLS,
517                                PROCBASED_CTLS_ONE_SETTING,
518                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
519         if (error) {
520                 printf("vmx_init: processor does not support desired primary "
521                        "processor-based controls\n");
522                 return (error);
523         }
524
525         /* Clear the processor-based ctl bits that are set on demand */
526         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
527
528         /* Check support for secondary processor-based VM-execution controls */
529         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
530                                MSR_VMX_PROCBASED_CTLS2,
531                                PROCBASED_CTLS2_ONE_SETTING,
532                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
533         if (error) {
534                 printf("vmx_init: processor does not support desired secondary "
535                        "processor-based controls\n");
536                 return (error);
537         }
538
539         /* Check support for VPID */
540         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
541                                PROCBASED2_ENABLE_VPID, 0, &tmp);
542         if (error == 0)
543                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
544
545         /* Check support for pin-based VM-execution controls */
546         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
547                                MSR_VMX_TRUE_PINBASED_CTLS,
548                                PINBASED_CTLS_ONE_SETTING,
549                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
550         if (error) {
551                 printf("vmx_init: processor does not support desired "
552                        "pin-based controls\n");
553                 return (error);
554         }
555
556         /* Check support for VM-exit controls */
557         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
558                                VM_EXIT_CTLS_ONE_SETTING,
559                                VM_EXIT_CTLS_ZERO_SETTING,
560                                &exit_ctls);
561         if (error) {
562                 /* Try again without the PAT MSR bits */
563                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
564                                        MSR_VMX_TRUE_EXIT_CTLS,
565                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
566                                        VM_EXIT_CTLS_ZERO_SETTING,
567                                        &exit_ctls);
568                 if (error) {
569                         printf("vmx_init: processor does not support desired "
570                                "exit controls\n");
571                         return (error);
572                 } else {
573                         if (bootverbose)
574                                 printf("vmm: PAT MSR access not supported\n");
575                         guest_msr_valid(MSR_PAT);
576                         vmx_no_patmsr = 1;
577                 }
578         }
579
580         /* Check support for VM-entry controls */
581         if (!vmx_no_patmsr) {
582                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
583                                        MSR_VMX_TRUE_ENTRY_CTLS,
584                                        VM_ENTRY_CTLS_ONE_SETTING,
585                                        VM_ENTRY_CTLS_ZERO_SETTING,
586                                        &entry_ctls);
587         } else {
588                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
589                                        MSR_VMX_TRUE_ENTRY_CTLS,
590                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
591                                        VM_ENTRY_CTLS_ZERO_SETTING,
592                                        &entry_ctls);
593         }
594
595         if (error) {
596                 printf("vmx_init: processor does not support desired "
597                        "entry controls\n");
598                 return (error);
599         }
600
601         /*
602          * Check support for optional features by testing them
603          * as individual bits
604          */
605         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
606                                         MSR_VMX_TRUE_PROCBASED_CTLS,
607                                         PROCBASED_HLT_EXITING, 0,
608                                         &tmp) == 0);
609
610         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
611                                         MSR_VMX_PROCBASED_CTLS,
612                                         PROCBASED_MTF, 0,
613                                         &tmp) == 0);
614
615         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
616                                          MSR_VMX_TRUE_PROCBASED_CTLS,
617                                          PROCBASED_PAUSE_EXITING, 0,
618                                          &tmp) == 0);
619
620         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
621                                         MSR_VMX_PROCBASED_CTLS2,
622                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
623                                         &tmp) == 0);
624
625         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
626             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
627             &tmp) == 0);
628
629         /*
630          * Check support for virtual interrupt delivery.
631          */
632         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
633             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
634             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
635             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
636
637         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
638             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
639             &tmp) == 0);
640
641         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
642             procbased2_vid_bits, 0, &tmp);
643         if (error == 0 && use_tpr_shadow) {
644                 virtual_interrupt_delivery = 1;
645                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
646                     &virtual_interrupt_delivery);
647         }
648
649         if (virtual_interrupt_delivery) {
650                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
651                 procbased_ctls2 |= procbased2_vid_bits;
652                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
653
654                 /*
655                  * Check for Posted Interrupts only if Virtual Interrupt
656                  * Delivery is enabled.
657                  */
658                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
659                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
660                     &tmp);
661                 if (error == 0) {
662                         pirvec = vmm_ipi_alloc();
663                         if (pirvec == 0) {
664                                 if (bootverbose) {
665                                         printf("vmx_init: unable to allocate "
666                                             "posted interrupt vector\n");
667                                 }
668                         } else {
669                                 posted_interrupts = 1;
670                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
671                                     &posted_interrupts);
672                         }
673                 }
674         }
675
676         if (posted_interrupts)
677                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
678
679         /* Initialize EPT */
680         error = ept_init(ipinum);
681         if (error) {
682                 printf("vmx_init: ept initialization failed (%d)\n", error);
683                 return (error);
684         }
685
686         /*
687          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
688          */
689         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
690         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
691         cr0_ones_mask = fixed0 & fixed1;
692         cr0_zeros_mask = ~fixed0 & ~fixed1;
693
694         /*
695          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
696          * if unrestricted guest execution is allowed.
697          */
698         if (cap_unrestricted_guest)
699                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
700
701         /*
702          * Do not allow the guest to set CR0_NW or CR0_CD.
703          */
704         cr0_zeros_mask |= (CR0_NW | CR0_CD);
705
706         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
707         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
708         cr4_ones_mask = fixed0 & fixed1;
709         cr4_zeros_mask = ~fixed0 & ~fixed1;
710
711         vpid_init();
712
713         /* enable VMX operation */
714         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
715
716         vmx_initialized = 1;
717
718         return (0);
719 }
720
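    /*
     * The VM-exit controls include "acknowledge interrupt on exit", so a
     * host interrupt that causes a VM-exit is not delivered through the
     * host IDT automatically; dispatch the handler for 'vector' here by
     * calling through its IDT gate.
     */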
721 static void
722 vmx_trigger_hostintr(int vector)
723 {
724         uintptr_t func;
725         struct gate_descriptor *gd;
726
727         gd = &idt[vector];
728
729         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
730             "invalid vector %d", vector));
731         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
732             vector));
733         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
734             "has invalid type %d", vector, gd->gd_type));
735         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
736             "has invalid dpl %d", vector, gd->gd_dpl));
737         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
738             "for vector %d has invalid selector %d", vector, gd->gd_selector));
739         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
740             "IST %d", vector, gd->gd_ist));
741
742         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
743         vmx_call_isr(func);
744 }
745
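    /*
     * Program the CR0/CR4 guest/host mask and read shadow. Guest reads of
     * bits set in the mask return the shadow value, and guest writes that
     * set a masked bit differently from the shadow cause a VM-exit.
     */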
746 static int
747 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
748 {
749         int error, mask_ident, shadow_ident;
750         uint64_t mask_value;
751
752         if (which != 0 && which != 4)
753                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
754
755         if (which == 0) {
756                 mask_ident = VMCS_CR0_MASK;
757                 mask_value = cr0_ones_mask | cr0_zeros_mask;
758                 shadow_ident = VMCS_CR0_SHADOW;
759         } else {
760                 mask_ident = VMCS_CR4_MASK;
761                 mask_value = cr4_ones_mask | cr4_zeros_mask;
762                 shadow_ident = VMCS_CR4_SHADOW;
763         }
764
765         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
766         if (error)
767                 return (error);
768
769         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
770         if (error)
771                 return (error);
772
773         return (0);
774 }
775 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
776 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
777
778 static void *
779 vmx_vminit(struct vm *vm, pmap_t pmap)
780 {
781         uint16_t vpid[VM_MAXCPU];
782         int i, error, guest_msr_count;
783         struct vmx *vmx;
784         struct vmcs *vmcs;
785
786         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
787         if ((uintptr_t)vmx & PAGE_MASK) {
788                 panic("malloc of struct vmx not aligned on %d byte boundary",
789                       PAGE_SIZE);
790         }
791         vmx->vm = vm;
792
793         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
794
795         /*
796          * Clean up EPTP-tagged guest physical and combined mappings
797          *
798          * VMX transitions are not required to invalidate any guest physical
799          * mappings. So, it may be possible for stale guest physical mappings
800          * to be present in the processor TLBs.
801          *
802          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
803          */
804         ept_invalidate_mappings(vmx->eptp);
805
806         msr_bitmap_initialize(vmx->msr_bitmap);
807
808         /*
809          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
810          * The guest FSBASE and GSBASE are saved and restored during
811          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
812          * always restored from the vmcs host state area on vm-exit.
813          *
814          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
815          * how they are saved/restored, so they can be directly accessed
816          * by the guest.
817          *
818          * Guest KGSBASE is saved and restored in the guest MSR save area.
819          * Host KGSBASE is restored before returning to userland from the pcb.
820          * There will be a window of time when we are executing in the host
821          * kernel context with a value of KGSBASE from the guest. This is ok
822          * because the value of KGSBASE is inconsequential in kernel context.
823          *
824          * MSR_EFER is saved and restored in the guest VMCS area on a
825          * VM exit and entry respectively. It is also restored from the
826          * host VMCS area on a VM exit.
827          */
828         if (guest_msr_rw(vmx, MSR_GSBASE) ||
829             guest_msr_rw(vmx, MSR_FSBASE) ||
830             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
831             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
832             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
833             guest_msr_rw(vmx, MSR_KGSBASE) ||
834             guest_msr_rw(vmx, MSR_EFER))
835                 panic("vmx_vminit: error setting guest msr access");
836
837         /*
838          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
839          * and entry respectively. It is also restored from the host VMCS
840          * area on a VM exit. However, if running on a system with no
841          * MSR_PAT save/restore support, leave access disabled so accesses
842          * will be trapped.
843          */
844         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
845                 panic("vmx_vminit: error setting guest pat msr access");
846
847         vpid_alloc(vpid, VM_MAXCPU);
848
849         if (virtual_interrupt_delivery) {
850                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
851                     APIC_ACCESS_ADDRESS);
852                 /* XXX this should really return an error to the caller */
853                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
854         }
855
856         for (i = 0; i < VM_MAXCPU; i++) {
857                 vmcs = &vmx->vmcs[i];
858                 vmcs->identifier = vmx_revision();
859                 error = vmclear(vmcs);
860                 if (error != 0) {
861                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
862                               error, i);
863                 }
864
865                 error = vmcs_init(vmcs);
866                 KASSERT(error == 0, ("vmcs_init error %d", error));
867
868                 VMPTRLD(vmcs);
869                 error = 0;
870                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
871                 error += vmwrite(VMCS_EPTP, vmx->eptp);
872                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
873                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
874                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
875                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
876                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
877                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
878                 error += vmwrite(VMCS_VPID, vpid[i]);
879                 if (virtual_interrupt_delivery) {
880                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
881                         error += vmwrite(VMCS_VIRTUAL_APIC,
882                             vtophys(&vmx->apic_page[i]));
883                         error += vmwrite(VMCS_EOI_EXIT0, 0);
884                         error += vmwrite(VMCS_EOI_EXIT1, 0);
885                         error += vmwrite(VMCS_EOI_EXIT2, 0);
886                         error += vmwrite(VMCS_EOI_EXIT3, 0);
887                 }
888                 if (posted_interrupts) {
889                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
890                         error += vmwrite(VMCS_PIR_DESC,
891                             vtophys(&vmx->pir_desc[i]));
892                 }
893                 VMCLEAR(vmcs);
894                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
895
896                 vmx->cap[i].set = 0;
897                 vmx->cap[i].proc_ctls = procbased_ctls;
898                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
899
900                 vmx->state[i].lastcpu = -1;
901                 vmx->state[i].vpid = vpid[i];
902
903                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
904
905                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
906                     guest_msr_count);
907                 if (error != 0)
908                         panic("vmcs_set_msr_save error %d", error);
909
910                 /*
911                  * Set up the CR0/4 shadows, and init the read shadow
912                  * to the power-on register value from the Intel Sys Arch.
913                  *  CR0 - 0x60000010
914                  *  CR4 - 0
915                  */
916                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
917                 if (error != 0)
918                         panic("vmx_setup_cr0_shadow %d", error);
919
920                 error = vmx_setup_cr4_shadow(vmcs, 0);
921                 if (error != 0)
922                         panic("vmx_setup_cr4_shadow %d", error);
923
924                 vmx->ctx[i].pmap = pmap;
925                 vmx->ctx[i].eptp = vmx->eptp;
926         }
927
928         return (vmx);
929 }
930
931 static int
932 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
933 {
934         int handled, func;
935         
936         func = vmxctx->guest_rax;
937
938         handled = x86_emulate_cpuid(vm, vcpu,
939                                     (uint32_t*)(&vmxctx->guest_rax),
940                                     (uint32_t*)(&vmxctx->guest_rbx),
941                                     (uint32_t*)(&vmxctx->guest_rcx),
942                                     (uint32_t*)(&vmxctx->guest_rdx));
943         return (handled);
944 }
945
946 static __inline void
947 vmx_run_trace(struct vmx *vmx, int vcpu)
948 {
949 #ifdef KTR
950         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
951 #endif
952 }
953
954 static __inline void
955 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
956                int handled)
957 {
958 #ifdef KTR
959         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
960                  handled ? "handled" : "unhandled",
961                  exit_reason_to_str(exit_reason), rip);
962 #endif
963 }
964
965 static __inline void
966 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
967 {
968 #ifdef KTR
969         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
970 #endif
971 }
972
973 static void
974 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
975 {
976         int lastcpu;
977         struct vmxstate *vmxstate;
978         struct invvpid_desc invvpid_desc = { 0 };
979
980         vmxstate = &vmx->state[vcpu];
981         lastcpu = vmxstate->lastcpu;
982         vmxstate->lastcpu = curcpu;
983
984         if (lastcpu == curcpu)
985                 return;
986
987         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
988
989         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
990         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
991         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
992
993         /*
994          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
995          *
996          * We do this because this vcpu was executing on a different host
997          * cpu when it last ran. We do not track whether it invalidated
998          * mappings associated with its 'vpid' during that run. So we must
999          * assume that the mappings associated with 'vpid' on 'curcpu' are
1000          * stale and invalidate them.
1001          *
1002          * Note that we incur this penalty only when the scheduler chooses to
1003          * move the thread associated with this vcpu between host cpus.
1004          *
1005          * Note also that this will invalidate mappings tagged with 'vpid'
1006          * for "all" EP4TAs.
1007          */
1008         if (vmxstate->vpid != 0) {
1009                 invvpid_desc.vpid = vmxstate->vpid;
1010                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1011         }
1012 }
1013
1014 /*
1015  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1016  */
1017 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1018
1019 static void __inline
1020 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1021 {
1022
1023         vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1024         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1025 }
1026
1027 static void __inline
1028 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1029 {
1030
1031         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1032         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1033 }
1034
1035 static void __inline
1036 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1037 {
1038
1039         vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1040         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1041 }
1042
1043 static void __inline
1044 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1045 {
1046
1047         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1048         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1049 }
1050
1051 static int
1052 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1053 {
1054         uint64_t info, interruptibility;
1055
1056         /* Bail out if no NMI requested */
1057         if (!vm_nmi_pending(vmx->vm, vcpu))
1058                 return (0);
1059
1060         interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1061         if (interruptibility & nmi_blocking_bits)
1062                 goto nmiblocked;
1063
1064         /*
1065          * Inject the virtual NMI. The vector must be the NMI IDT entry
1066          * or the VMCS entry check will fail.
1067          */
1068         info = VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1069         info |= IDT_NMI;
1070         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1071
1072         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1073
1074         /* Clear the request */
1075         vm_nmi_clear(vmx->vm, vcpu);
1076         return (1);
1077
1078 nmiblocked:
1079         /*
1080          * Set the NMI Window Exiting execution control so we can inject
1081          * the virtual NMI as soon as the blocking condition goes away.
1082          */
1083         vmx_set_nmi_window_exiting(vmx, vcpu);
1084
1085         VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1086         return (1);
1087 }
1088
1089 static void
1090 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1091 {
1092         int vector;
1093         uint64_t info, rflags, interruptibility;
1094
1095         const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
1096                                    VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
1097
1098         /*
1099          * If there is already an interrupt pending then just return.
1100          *
1101          * This could happen if an interrupt was injected on a prior
1102          * VM entry but the actual entry into guest mode was aborted
1103          * because of a pending AST.
1104          */
1105         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1106         if (info & VMCS_INTR_VALID)
1107                 return;
1108
1109         /*
1110          * NMI injection has priority so deal with those first
1111          */
1112         if (vmx_inject_nmi(vmx, vcpu))
1113                 return;
1114
1115         if (virtual_interrupt_delivery) {
1116                 vmx_inject_pir(vlapic);
1117                 return;
1118         }
1119
1120         /* Ask the local apic for a vector to inject */
1121         if (!vlapic_pending_intr(vlapic, &vector))
1122                 return;
1123
1124         if (vector < 32 || vector > 255)
1125                 panic("vmx_inject_interrupts: invalid vector %d\n", vector);
1126
1127         /* Check RFLAGS.IF and the interruptibility state of the guest */
1128         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1129         if ((rflags & PSL_I) == 0)
1130                 goto cantinject;
1131
1132         interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1133         if (interruptibility & HWINTR_BLOCKED)
1134                 goto cantinject;
1135
1136         /* Inject the interrupt */
1137         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1138         info |= vector;
1139         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1140
1141         /* Update the Local APIC ISR */
1142         vlapic_intr_accepted(vlapic, vector);
1143
1144         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1145
1146         return;
1147
1148 cantinject:
1149         /*
1150          * Set the Interrupt Window Exiting execution control so we can inject
1151          * the interrupt as soon as the blocking condition goes away.
1152          */
1153         vmx_set_int_window_exiting(vmx, vcpu);
1154
1155         VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1156 }
1157
1158 static int
1159 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1160 {
1161         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1162         uint64_t crval, regval, ones_mask, zeros_mask;
1163         const struct vmxctx *vmxctx;
1164
1165         /* We only handle mov to %cr0 or %cr4 at this time */
1166         if ((exitqual & 0xf0) != 0x00)
1167                 return (UNHANDLED);
1168
1169         cr = exitqual & 0xf;
1170         if (cr != 0 && cr != 4)
1171                 return (UNHANDLED);
1172
1173         regval = 0; /* silence gcc */
1174         vmxctx = &vmx->ctx[vcpu];
1175
1176         /*
1177          * We must use vmcs_write() directly here because vmcs_setreg() will
1178          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1179          */
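             /*
              * Bits 11:8 of the exit qualification encode the general purpose
              * register that is the source operand of the MOV to CR.
              */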
1180         switch ((exitqual >> 8) & 0xf) {
1181         case 0:
1182                 regval = vmxctx->guest_rax;
1183                 break;
1184         case 1:
1185                 regval = vmxctx->guest_rcx;
1186                 break;
1187         case 2:
1188                 regval = vmxctx->guest_rdx;
1189                 break;
1190         case 3:
1191                 regval = vmxctx->guest_rbx;
1192                 break;
1193         case 4:
1194                 regval = vmcs_read(VMCS_GUEST_RSP);
1195                 break;
1196         case 5:
1197                 regval = vmxctx->guest_rbp;
1198                 break;
1199         case 6:
1200                 regval = vmxctx->guest_rsi;
1201                 break;
1202         case 7:
1203                 regval = vmxctx->guest_rdi;
1204                 break;
1205         case 8:
1206                 regval = vmxctx->guest_r8;
1207                 break;
1208         case 9:
1209                 regval = vmxctx->guest_r9;
1210                 break;
1211         case 10:
1212                 regval = vmxctx->guest_r10;
1213                 break;
1214         case 11:
1215                 regval = vmxctx->guest_r11;
1216                 break;
1217         case 12:
1218                 regval = vmxctx->guest_r12;
1219                 break;
1220         case 13:
1221                 regval = vmxctx->guest_r13;
1222                 break;
1223         case 14:
1224                 regval = vmxctx->guest_r14;
1225                 break;
1226         case 15:
1227                 regval = vmxctx->guest_r15;
1228                 break;
1229         }
1230
1231         if (cr == 0) {
1232                 ones_mask = cr0_ones_mask;
1233                 zeros_mask = cr0_zeros_mask;
1234                 vmcs_guest_cr = VMCS_GUEST_CR0;
1235                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1236         } else {
1237                 ones_mask = cr4_ones_mask;
1238                 zeros_mask = cr4_zeros_mask;
1239                 vmcs_guest_cr = VMCS_GUEST_CR4;
1240                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1241         }
1242         vmcs_write(vmcs_shadow_cr, regval);
1243
1244         crval = regval | ones_mask;
1245         crval &= ~zeros_mask;
1246         vmcs_write(vmcs_guest_cr, crval);
1247
1248         if (cr == 0 && regval & CR0_PG) {
1249                 uint64_t efer, entry_ctls;
1250
1251                 /*
1252                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1253                  * the "IA-32e mode guest" bit in VM-entry control must be
1254                  * equal.
1255                  */
1256                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1257                 if (efer & EFER_LME) {
1258                         efer |= EFER_LMA;
1259                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1260                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1261                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1262                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1263                 }
1264         }
1265
1266         return (HANDLED);
1267 }
1268
1269 static int
1270 ept_fault_type(uint64_t ept_qual)
1271 {
1272         int fault_type;
1273
1274         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1275                 fault_type = VM_PROT_WRITE;
1276         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1277                 fault_type = VM_PROT_EXECUTE;
1278         else
1279                 fault_type = VM_PROT_READ;
1280
1281         return (fault_type);
1282 }
1283
1284 static boolean_t
1285 ept_emulation_fault(uint64_t ept_qual)
1286 {
1287         int read, write;
1288
1289         /* EPT fault on an instruction fetch doesn't make sense here */
1290         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1291                 return (FALSE);
1292
1293         /* EPT fault must be a read fault or a write fault */
1294         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1295         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1296         if ((read | write) == 0)
1297                 return (FALSE);
1298
1299         /*
1300          * The EPT violation must have been caused by accessing a
1301          * guest-physical address that is a translation of a guest-linear
1302          * address.
1303          */
1304         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1305             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1306                 return (FALSE);
1307         }
1308
1309         return (TRUE);
1310 }
1311
1312 static int
1313 vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1314 {
1315         int error, handled, offset;
1316         bool retu;
1317
1318         if (!virtual_interrupt_delivery)
1319                 return (UNHANDLED);
1320
1321         handled = 1;
1322         offset = APIC_WRITE_OFFSET(qual);
1323         switch (offset) {
1324         case APIC_OFFSET_ID:
1325                 vlapic_id_write_handler(vlapic);
1326                 break;
1327         case APIC_OFFSET_LDR:
1328                 vlapic_ldr_write_handler(vlapic);
1329                 break;
1330         case APIC_OFFSET_DFR:
1331                 vlapic_dfr_write_handler(vlapic);
1332                 break;
1333         case APIC_OFFSET_SVR:
1334                 vlapic_svr_write_handler(vlapic);
1335                 break;
1336         case APIC_OFFSET_ESR:
1337                 vlapic_esr_write_handler(vlapic);
1338                 break;
1339         case APIC_OFFSET_ICR_LOW:
1340                 retu = false;
1341                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1342                 if (error != 0 || retu)
1343                         handled = 0;
1344                 break;
1345         case APIC_OFFSET_CMCI_LVT:
1346         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1347                 vlapic_lvt_write_handler(vlapic, offset);
1348                 break;
1349         case APIC_OFFSET_TIMER_ICR:
1350                 vlapic_icrtmr_write_handler(vlapic);
1351                 break;
1352         case APIC_OFFSET_TIMER_DCR:
1353                 vlapic_dcr_write_handler(vlapic);
1354                 break;
1355         default:
1356                 handled = 0;
1357                 break;
1358         }
1359         return (handled);
1360 }
1361
1362 static bool
1363 apic_access_fault(uint64_t gpa)
1364 {
1365
1366         if (virtual_interrupt_delivery &&
1367             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1368                 return (true);
1369         else
1370                 return (false);
1371 }
1372
1373 static int
1374 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1375 {
1376         uint64_t qual;
1377         int access_type, offset, allowed;
1378
1379         if (!virtual_interrupt_delivery)
1380                 return (UNHANDLED);
1381
1382         qual = vmexit->u.vmx.exit_qualification;
1383         access_type = APIC_ACCESS_TYPE(qual);
1384         offset = APIC_ACCESS_OFFSET(qual);
1385
1386         allowed = 0;
1387         if (access_type == 0) {
1388                 /*
1389                  * Read data access to the following registers is expected.
1390                  */
1391                 switch (offset) {
1392                 case APIC_OFFSET_APR:
1393                 case APIC_OFFSET_PPR:
1394                 case APIC_OFFSET_RRR:
1395                 case APIC_OFFSET_CMCI_LVT:
1396                 case APIC_OFFSET_TIMER_CCR:
1397                         allowed = 1;
1398                         break;
1399                 default:
1400                         break;
1401                 }
1402         } else if (access_type == 1) {
1403                 /*
1404                  * Write data access to the following registers is expected.
1405                  */
1406                 switch (offset) {
1407                 case APIC_OFFSET_VER:
1408                 case APIC_OFFSET_APR:
1409                 case APIC_OFFSET_PPR:
1410                 case APIC_OFFSET_RRR:
1411                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1412                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1413                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1414                 case APIC_OFFSET_CMCI_LVT:
1415                 case APIC_OFFSET_TIMER_CCR:
1416                         allowed = 1;
1417                         break;
1418                 default:
1419                         break;
1420                 }
1421         }
1422
1423         if (allowed) {
1424                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1425                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1426                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1427                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1428         }
1429
1430         /*
1431          * Regardless of whether the APIC-access is allowed this handler
1432          * always returns UNHANDLED:
1433          * - if the access is allowed then it is handled by emulating the
1434          *   instruction that caused the VM-exit (outside the critical section)
1435          * - if the access is not allowed then it will be converted to an
1436          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1437          */
1438         return (UNHANDLED);
1439 }
1440
1441 static int
1442 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1443 {
1444         int error, handled;
1445         struct vmxctx *vmxctx;
1446         struct vlapic *vlapic;
1447         uint32_t eax, ecx, edx, gi, idtvec_info, idtvec_err, intr_info, reason;
1448         uint64_t qual, gpa;
1449         bool retu;
1450
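             /*
              * The NMI unblocking logic below assumes that "virtual NMIs"
              * are always enabled; the CTASSERT verifies this.
              */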
1451         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1452
1453         handled = 0;
1454         vmxctx = &vmx->ctx[vcpu];
1455
1456         qual = vmexit->u.vmx.exit_qualification;
1457         reason = vmexit->u.vmx.exit_reason;
1458         vmexit->exitcode = VM_EXITCODE_BOGUS;
1459
1460         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1461
1462         /*
1463          * VM exits that could be triggered during event injection on the
1464          * previous VM entry need to be handled specially by re-injecting
1465          * the event.
1466          *
1467          * See "Information for VM Exits During Event Delivery" in Intel SDM
1468          * for details.
1469          */
1470         switch (reason) {
1471         case EXIT_REASON_EPT_FAULT:
1472         case EXIT_REASON_EPT_MISCONFIG:
1473         case EXIT_REASON_APIC_ACCESS:
1474         case EXIT_REASON_TASK_SWITCH:
1475         case EXIT_REASON_EXCEPTION:
1476                 idtvec_info = vmcs_idt_vectoring_info();
1477                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1478                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1479                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1480                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1481                                 idtvec_err = vmcs_idt_vectoring_err();
1482                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1483                                     idtvec_err);
1484                         }
1485                         /*
1486                          * If 'virtual NMIs' are being used and the VM-exit
1487                          * happened while injecting an NMI during the previous
1488                          * VM-entry, then clear "blocking by NMI" in the Guest
1489                          * Interruptibility-state.
1490                          */
1491                         if ((idtvec_info & VMCS_INTR_T_MASK) ==
1492                             VMCS_INTR_T_NMI) {
1493                                  gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1494                                  gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1495                                  vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1496                         }
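                        /*
                         * The VM-entry instruction length must be valid when
                         * re-injecting a software interrupt or a software
                         * exception, so propagate the length of the original
                         * instruction here.
                         */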
1497                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1498                 }
1499         default:
1500                 break;
1501         }
1502
1503         switch (reason) {
1504         case EXIT_REASON_CR_ACCESS:
1505                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1506                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1507                 break;
1508         case EXIT_REASON_RDMSR:
1509                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1510                 retu = false;
1511                 ecx = vmxctx->guest_rcx;
1512                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1513                 if (error) {
1514                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1515                         vmexit->u.msr.code = ecx;
1516                 } else if (!retu) {
1517                         handled = 1;
1518                 } else {
1519                         /* Return to userspace with a valid exitcode */
1520                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1521                             ("emulate_rdmsr retu with bogus exitcode"));
1522                 }
1523                 break;
1524         case EXIT_REASON_WRMSR:
1525                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1526                 retu = false;
1527                 eax = vmxctx->guest_rax;
1528                 ecx = vmxctx->guest_rcx;
1529                 edx = vmxctx->guest_rdx;
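                /* The 64-bit value to be written is supplied in EDX:EAX. */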
1530                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1531                     (uint64_t)edx << 32 | eax, &retu);
1532                 if (error) {
1533                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1534                         vmexit->u.msr.code = ecx;
1535                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1536                 } else if (!retu) {
1537                         handled = 1;
1538                 } else {
1539                         /* Return to userspace with a valid exitcode */
1540                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1541                             ("emulate_wrmsr retu with bogus exitcode"));
1542                 }
1543                 break;
1544         case EXIT_REASON_HLT:
1545                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1546                 vmexit->exitcode = VM_EXITCODE_HLT;
1547                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1548                 break;
1549         case EXIT_REASON_MTF:
1550                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1551                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1552                 break;
1553         case EXIT_REASON_PAUSE:
1554                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1555                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1556                 break;
1557         case EXIT_REASON_INTR_WINDOW:
1558                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1559                 vmx_clear_int_window_exiting(vmx, vcpu);
1560                 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1561                 return (1);
1562         case EXIT_REASON_EXT_INTR:
1563                 /*
1564                  * External interrupts serve only to cause VM exits and allow
1565                  * the host interrupt handler to run.
1566                  *
1567                  * If this external interrupt triggers a virtual interrupt
1568                  * to a VM, then that state will be recorded by the
1569                  * host interrupt handler in the VM's softc. We will inject
1570                  * this virtual interrupt during the subsequent VM enter.
1571                  */
1572                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1573                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1574                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1575                     ("VM exit interruption info invalid: %#x", intr_info));
1576                 vmx_trigger_hostintr(intr_info & 0xff);
1577
1578                 /*
1579                  * This is special. We want to treat this as a 'handled'
1580                  * VM-exit but not increment the instruction pointer.
1581                  */
1582                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1583                 return (1);
1584         case EXIT_REASON_NMI_WINDOW:
1585                 /* Exit to allow the pending virtual NMI to be injected */
1586                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1587                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1588                 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1589                 return (1);
1590         case EXIT_REASON_INOUT:
1591                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1592                 vmexit->exitcode = VM_EXITCODE_INOUT;
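                /*
                 * Decode the exit qualification for the I/O instruction:
                 * bits 2:0 hold the access size minus one, bit 3 the
                 * direction (1 = IN), bit 4 the string-instruction flag,
                 * bit 5 the REP prefix and bits 31:16 the port number.
                 */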
1593                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1594                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1595                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1596                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1597                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1598                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1599                 break;
1600         case EXIT_REASON_CPUID:
1601                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1602                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1603                 break;
1604         case EXIT_REASON_EPT_FAULT:
1605                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
1606                 /*
1607                  * If 'gpa' lies within the address space allocated to
1608                  * memory then this must be a nested page fault; otherwise
1609                  * it must be an instruction that accesses MMIO space.
1610                  */
1611                 gpa = vmcs_gpa();
1612                 if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1613                         vmexit->exitcode = VM_EXITCODE_PAGING;
1614                         vmexit->u.paging.gpa = gpa;
1615                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1616                 } else if (ept_emulation_fault(qual)) {
1617                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1618                         vmexit->u.inst_emul.gpa = gpa;
1619                         vmexit->u.inst_emul.gla = vmcs_gla();
1620                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1621                 }
1622                 break;
1623         case EXIT_REASON_APIC_ACCESS:
1624                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1625                 break;
1626         case EXIT_REASON_APIC_WRITE:
1627                 /*
1628                  * APIC-write VM exit is trap-like so the %rip is already
1629                  * pointing to the next instruction.
1630                  */
1631                 vmexit->inst_length = 0;
1632                 vlapic = vm_lapic(vmx->vm, vcpu);
1633                 handled = vmx_handle_apic_write(vlapic, qual);
1634                 break;
1635         default:
1636                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1637                 break;
1638         }
1639
1640         if (handled) {
1641                 /*
1642                  * It is possible that control is returned to userland
1643                  * even though we were able to handle the VM exit in the
1644                  * kernel.
1645                  *
1646                  * In such a case we want to make sure that the userland
1647                  * restarts guest execution at the instruction *after*
1648                  * the one we just processed. Therefore we update the
1649                  * guest rip in the VMCS and in 'vmexit'.
1650                  */
1651                 vmexit->rip += vmexit->inst_length;
1652                 vmexit->inst_length = 0;
1653                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1654         } else {
1655                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1656                         /*
1657                          * If this VM exit was not claimed by anybody then
1658                          * treat it as a generic VMX exit.
1659                          */
1660                         vmexit->exitcode = VM_EXITCODE_VMX;
1661                         vmexit->u.vmx.status = VM_SUCCESS;
1662                 } else {
1663                         /*
1664                          * The exitcode and collateral have been populated.
1665                          * The VM exit will be processed further in userland.
1666                          */
1667                 }
1668         }
1669         return (handled);
1670 }
1671
1672 static __inline int
1673 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1674 {
1675
1676         vmexit->rip = vmcs_guest_rip();
1677         vmexit->inst_length = 0;
1678         vmexit->exitcode = VM_EXITCODE_BOGUS;
1679         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1680         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1681
1682         return (HANDLED);
1683 }
1684
1685 static __inline int
1686 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1687 {
1688
1689         vmexit->rip = vmcs_guest_rip();
1690         vmexit->inst_length = 0;
1691         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1692         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
1693
1694         return (UNHANDLED);
1695 }
1696
1697 static __inline int
1698 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1699 {
1700
1701         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1702             ("vmx_exit_inst_error: invalid inst_fail_status %d",
1703             vmxctx->inst_fail_status));
1704
1705         vmexit->inst_length = 0;
1706         vmexit->exitcode = VM_EXITCODE_VMX;
1707         vmexit->u.vmx.status = vmxctx->inst_fail_status;
1708         vmexit->u.vmx.inst_error = vmcs_instruction_error();
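        /*
         * No VM exit actually occurred so there is no meaningful exit
         * reason or qualification to report.
         */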
1709         vmexit->u.vmx.exit_reason = ~0;
1710         vmexit->u.vmx.exit_qualification = ~0;
1711
1712         switch (rc) {
1713         case VMX_VMRESUME_ERROR:
1714         case VMX_VMLAUNCH_ERROR:
1715         case VMX_INVEPT_ERROR:
1716                 vmexit->u.vmx.inst_type = rc;
1717                 break;
1718         default:
1719                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
1720         }
1721
1722         return (UNHANDLED);
1723 }
1724
1725 static int
1726 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
1727     void *rendezvous_cookie)
1728 {
1729         int rc, handled, launched;
1730         struct vmx *vmx;
1731         struct vm *vm;
1732         struct vmxctx *vmxctx;
1733         struct vmcs *vmcs;
1734         struct vm_exit *vmexit;
1735         struct vlapic *vlapic;
1736         uint64_t rip;
1737         uint32_t exit_reason;
1738
1739         vmx = arg;
1740         vm = vmx->vm;
1741         vmcs = &vmx->vmcs[vcpu];
1742         vmxctx = &vmx->ctx[vcpu];
1743         vlapic = vm_lapic(vm, vcpu);
1744         vmexit = vm_exitinfo(vm, vcpu);
1745         launched = 0;
1746
1747         KASSERT(vmxctx->pmap == pmap,
1748             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1749         KASSERT(vmxctx->eptp == vmx->eptp,
1750             ("eptp %#lx different than ctx eptp %#lx", vmx->eptp, vmxctx->eptp));
1751
1752         VMPTRLD(vmcs);
1753
1754         /*
1755          * XXX
1756          * We do this every time because we may setup the virtual machine
1757          * from a different process than the one that actually runs it.
1758          *
1759          * If the life of a virtual machine was spent entirely in the context
1760          * of a single process we could do this once in vmx_vminit().
1761          */
1762         vmcs_write(VMCS_HOST_CR3, rcr3());
1763
1764         vmcs_write(VMCS_GUEST_RIP, startrip);
1765         vmx_set_pcpu_defaults(vmx, vcpu);
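        /*
         * The loop below re-enters the guest for as long as each VM exit
         * can be handled entirely in the kernel. An exit that cannot be
         * handled here terminates the loop and is returned to userland.
         */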
1766         do {
1767                 /*
1768                  * Interrupts are disabled from this point on until the
1769                  * guest starts executing. This is done for the following
1770                  * reasons:
1771                  *
1772                  * If an AST is asserted on this thread after the check below,
1773                  * then the IPI_AST notification will not be lost, because it
1774                  * will cause a VM exit due to external interrupt as soon as
1775                  * the guest state is loaded.
1776                  *
1777                  * A posted interrupt after 'vmx_inject_interrupts()' will
1778                  * not be "lost" because it will be held pending in the host
1779                  * APIC because interrupts are disabled. The pending interrupt
1780                  * will be recognized as soon as the guest state is loaded.
1781                  *
1782                  * The same reasoning applies to the IPI generated by
1783                  * pmap_invalidate_ept().
1784                  */
1785                 disable_intr();
1786                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1787                         enable_intr();
1788                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
1789                         break;
1790                 }
1791
1792                 if (vcpu_rendezvous_pending(rendezvous_cookie)) {
1793                         enable_intr();
1794                         handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
1795                         break;
1796                 }
1797
1798                 vmx_inject_interrupts(vmx, vcpu, vlapic);
1799                 vmx_run_trace(vmx, vcpu);
1800                 rc = vmx_enter_guest(vmxctx, launched);
1801
1802                 enable_intr();
1803
1804                 /* Collect some information for VM exit processing */
1805                 vmexit->rip = rip = vmcs_guest_rip();
1806                 vmexit->inst_length = vmexit_instruction_length();
1807                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1808                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1809
1810                 if (rc == VMX_GUEST_VMEXIT) {
1811                         launched = 1;
1812                         handled = vmx_exit_process(vmx, vcpu, vmexit);
1813                 } else {
1814                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
1815                 }
1816
1817                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1818         } while (handled);
1819
1820         /*
1821          * If a VM exit has been handled then the exitcode must be BOGUS;
1822          * if a VM exit has not been handled then the exitcode must not be BOGUS.
1823          */
1824         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1825             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1826                 panic("Mismatch between handled (%d) and exitcode (%d)",
1827                       handled, vmexit->exitcode);
1828         }
1829
1830         if (!handled)
1831                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
1832
1833         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
1834             vmexit->exitcode);
1835
1836         VMCLEAR(vmcs);
1837         return (0);
1838 }
1839
1840 static void
1841 vmx_vmcleanup(void *arg)
1842 {
1843         int i, error;
1844         struct vmx *vmx = arg;
1845
1846         if (virtual_interrupt_delivery)
1847                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
1848
1849         for (i = 0; i < VM_MAXCPU; i++)
1850                 vpid_free(vmx->state[i].vpid);
1851
1852         /*
1853          * XXXSMP we also need to clear the VMCS active on the other vcpus.
1854          */
1855         error = vmclear(&vmx->vmcs[0]);
1856         if (error != 0)
1857                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
1858
1859         free(vmx, M_VMX);
1860
1861         return;
1862 }
1863
1864 static register_t *
1865 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
1866 {
1867
1868         switch (reg) {
1869         case VM_REG_GUEST_RAX:
1870                 return (&vmxctx->guest_rax);
1871         case VM_REG_GUEST_RBX:
1872                 return (&vmxctx->guest_rbx);
1873         case VM_REG_GUEST_RCX:
1874                 return (&vmxctx->guest_rcx);
1875         case VM_REG_GUEST_RDX:
1876                 return (&vmxctx->guest_rdx);
1877         case VM_REG_GUEST_RSI:
1878                 return (&vmxctx->guest_rsi);
1879         case VM_REG_GUEST_RDI:
1880                 return (&vmxctx->guest_rdi);
1881         case VM_REG_GUEST_RBP:
1882                 return (&vmxctx->guest_rbp);
1883         case VM_REG_GUEST_R8:
1884                 return (&vmxctx->guest_r8);
1885         case VM_REG_GUEST_R9:
1886                 return (&vmxctx->guest_r9);
1887         case VM_REG_GUEST_R10:
1888                 return (&vmxctx->guest_r10);
1889         case VM_REG_GUEST_R11:
1890                 return (&vmxctx->guest_r11);
1891         case VM_REG_GUEST_R12:
1892                 return (&vmxctx->guest_r12);
1893         case VM_REG_GUEST_R13:
1894                 return (&vmxctx->guest_r13);
1895         case VM_REG_GUEST_R14:
1896                 return (&vmxctx->guest_r14);
1897         case VM_REG_GUEST_R15:
1898                 return (&vmxctx->guest_r15);
1899         default:
1900                 break;
1901         }
1902         return (NULL);
1903 }
1904
1905 static int
1906 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
1907 {
1908         register_t *regp;
1909
1910         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1911                 *retval = *regp;
1912                 return (0);
1913         } else
1914                 return (EINVAL);
1915 }
1916
1917 static int
1918 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
1919 {
1920         register_t *regp;
1921
1922         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1923                 *regp = val;
1924                 return (0);
1925         } else
1926                 return (EINVAL);
1927 }
1928
1929 static int
1930 vmx_shadow_reg(int reg)
1931 {
1932         int shreg;
1933
1934         shreg = -1;
1935
1936         switch (reg) {
1937         case VM_REG_GUEST_CR0:
1938                 shreg = VMCS_CR0_SHADOW;
1939                 break;
1940         case VM_REG_GUEST_CR4:
1941                 shreg = VMCS_CR4_SHADOW;
1942                 break;
1943         default:
1944                 break;
1945         }
1946
1947         return (shreg);
1948 }
1949
1950 static int
1951 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
1952 {
1953         int running, hostcpu;
1954         struct vmx *vmx = arg;
1955
1956         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1957         if (running && hostcpu != curcpu)
1958                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
1959
1960         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
1961                 return (0);
1962
1963         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
1964 }
1965
1966 static int
1967 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
1968 {
1969         int error, hostcpu, running, shadow;
1970         uint64_t ctls;
1971         struct vmx *vmx = arg;
1972
1973         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1974         if (running && hostcpu != curcpu)
1975                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
1976
1977         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
1978                 return (0);
1979
1980         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
1981
1982         if (error == 0) {
1983                 /*
1984                  * If the "load EFER" VM-entry control is 1 then the
1985                  * value of EFER.LMA must be identical to the "IA-32e mode
1986                  * guest" bit in the VM-entry controls.
1987                  */
1988                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
1989                     (reg == VM_REG_GUEST_EFER)) {
1990                         vmcs_getreg(&vmx->vmcs[vcpu], running,
1991                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
1992                         if (val & EFER_LMA)
1993                                 ctls |= VM_ENTRY_GUEST_LMA;
1994                         else
1995                                 ctls &= ~VM_ENTRY_GUEST_LMA;
1996                         vmcs_setreg(&vmx->vmcs[vcpu], running,
1997                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
1998                 }
1999
2000                 shadow = vmx_shadow_reg(reg);
2001                 if (shadow > 0) {
2002                         /*
2003                          * Store the unmodified value in the shadow register.
2004                          */
2005                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2006                                     VMCS_IDENT(shadow), val);
2007                 }
2008         }
2009
2010         return (error);
2011 }
2012
2013 static int
2014 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2015 {
2016         struct vmx *vmx = arg;
2017
2018         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2019 }
2020
2021 static int
2022 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2023 {
2024         struct vmx *vmx = arg;
2025
2026         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2027 }
2028
2029 static int
2030 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
2031            int code_valid)
2032 {
2033         int error;
2034         uint64_t info;
2035         struct vmx *vmx = arg;
2036         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2037
2038         static uint32_t type_map[VM_EVENT_MAX] = {
2039                 0x1,            /* VM_EVENT_NONE */
2040                 0x0,            /* VM_HW_INTR */
2041                 0x2,            /* VM_NMI */
2042                 0x3,            /* VM_HW_EXCEPTION */
2043                 0x4,            /* VM_SW_INTR */
2044                 0x5,            /* VM_PRIV_SW_EXCEPTION */
2045                 0x6,            /* VM_SW_EXCEPTION */
2046         };
2047
2048         /*
2049          * If there is already an exception pending to be delivered to the
2050          * vcpu then just return.
2051          */
2052         error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
2053         if (error)
2054                 return (error);
2055
2056         if (info & VMCS_INTR_VALID)
2057                 return (EAGAIN);
2058
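        /*
         * Build the VM-entry interruption information: bits 7:0 hold the
         * vector, bits 10:8 the event type, bit 11 the error-code-valid
         * flag and bit 31 marks the field as valid.
         */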
2059         info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
2060         info |= VMCS_INTR_VALID;
2061         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
2062         if (error != 0)
2063                 return (error);
2064
2065         if (code_valid) {
2066                 error = vmcs_setreg(vmcs, 0,
2067                                     VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
2068                                     code);
2069         }
2070         return (error);
2071 }
2072
2073 static int
2074 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2075 {
2076         struct vmx *vmx = arg;
2077         int vcap;
2078         int ret;
2079
2080         ret = ENOENT;
2081
2082         vcap = vmx->cap[vcpu].set;
2083
2084         switch (type) {
2085         case VM_CAP_HALT_EXIT:
2086                 if (cap_halt_exit)
2087                         ret = 0;
2088                 break;
2089         case VM_CAP_PAUSE_EXIT:
2090                 if (cap_pause_exit)
2091                         ret = 0;
2092                 break;
2093         case VM_CAP_MTRAP_EXIT:
2094                 if (cap_monitor_trap)
2095                         ret = 0;
2096                 break;
2097         case VM_CAP_UNRESTRICTED_GUEST:
2098                 if (cap_unrestricted_guest)
2099                         ret = 0;
2100                 break;
2101         case VM_CAP_ENABLE_INVPCID:
2102                 if (cap_invpcid)
2103                         ret = 0;
2104                 break;
2105         default:
2106                 break;
2107         }
2108
2109         if (ret == 0)
2110                 *retval = (vcap & (1 << type)) ? 1 : 0;
2111
2112         return (ret);
2113 }
2114
2115 static int
2116 vmx_setcap(void *arg, int vcpu, int type, int val)
2117 {
2118         struct vmx *vmx = arg;
2119         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2120         uint32_t baseval;
2121         uint32_t *pptr;
2122         int error;
2123         int flag;
2124         int reg;
2125         int retval;
2126
2127         retval = ENOENT;
2128         pptr = NULL;
2129
2130         switch (type) {
2131         case VM_CAP_HALT_EXIT:
2132                 if (cap_halt_exit) {
2133                         retval = 0;
2134                         pptr = &vmx->cap[vcpu].proc_ctls;
2135                         baseval = *pptr;
2136                         flag = PROCBASED_HLT_EXITING;
2137                         reg = VMCS_PRI_PROC_BASED_CTLS;
2138                 }
2139                 break;
2140         case VM_CAP_MTRAP_EXIT:
2141                 if (cap_monitor_trap) {
2142                         retval = 0;
2143                         pptr = &vmx->cap[vcpu].proc_ctls;
2144                         baseval = *pptr;
2145                         flag = PROCBASED_MTF;
2146                         reg = VMCS_PRI_PROC_BASED_CTLS;
2147                 }
2148                 break;
2149         case VM_CAP_PAUSE_EXIT:
2150                 if (cap_pause_exit) {
2151                         retval = 0;
2152                         pptr = &vmx->cap[vcpu].proc_ctls;
2153                         baseval = *pptr;
2154                         flag = PROCBASED_PAUSE_EXITING;
2155                         reg = VMCS_PRI_PROC_BASED_CTLS;
2156                 }
2157                 break;
2158         case VM_CAP_UNRESTRICTED_GUEST:
2159                 if (cap_unrestricted_guest) {
2160                         retval = 0;
2161                         pptr = &vmx->cap[vcpu].proc_ctls2;
2162                         baseval = *pptr;
2163                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2164                         reg = VMCS_SEC_PROC_BASED_CTLS;
2165                 }
2166                 break;
2167         case VM_CAP_ENABLE_INVPCID:
2168                 if (cap_invpcid) {
2169                         retval = 0;
2170                         pptr = &vmx->cap[vcpu].proc_ctls2;
2171                         baseval = *pptr;
2172                         flag = PROCBASED2_ENABLE_INVPCID;
2173                         reg = VMCS_SEC_PROC_BASED_CTLS;
2174                 }
2175                 break;
2176         default:
2177                 break;
2178         }
2179
2180         if (retval == 0) {
2181                 if (val) {
2182                         baseval |= flag;
2183                 } else {
2184                         baseval &= ~flag;
2185                 }
2186                 VMPTRLD(vmcs);
2187                 error = vmwrite(reg, baseval);
2188                 VMCLEAR(vmcs);
2189
2190                 if (error) {
2191                         retval = error;
2192                 } else {
2193                         /*
2194                          * Update the stored copy of the control value, if
2195                          * any, and record the new capability setting.
2196                          */
2197                         if (pptr != NULL) {
2198                                 *pptr = baseval;
2199                         }
2200
2201                         if (val) {
2202                                 vmx->cap[vcpu].set |= (1 << type);
2203                         } else {
2204                                 vmx->cap[vcpu].set &= ~(1 << type);
2205                         }
2206                 }
2207         }
2208
2209         return (retval);
2210 }
2211
2212 struct vlapic_vtx {
2213         struct vlapic   vlapic;
2214         struct pir_desc *pir_desc;
2215 };
2216
2217 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2218 do {                                                                    \
2219         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2220             level ? "level" : "edge", vector);                          \
2221         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2222         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2223         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2224         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2225         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2226 } while (0)
2227
2228 /*
2229  * vlapic->ops handlers that utilize the APICv hardware assist described in
2230  * Chapter 29 of the Intel SDM.
2231  */
2232 static int
2233 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2234 {
2235         struct vlapic_vtx *vlapic_vtx;
2236         struct pir_desc *pir_desc;
2237         uint64_t mask;
2238         int idx, notify;
2239
2240         /*
2241          * XXX need to deal with level triggered interrupts
2242          */
2243         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2244         pir_desc = vlapic_vtx->pir_desc;
2245
2246         /*
2247          * Keep track of interrupt requests in the PIR descriptor. This is
2248          * because the virtual APIC page pointed to by the VMCS cannot be
2249          * modified if the vcpu is running.
2250          */
2251         idx = vector / 64;
2252         mask = 1UL << (vector % 64);
2253         atomic_set_long(&pir_desc->pir[idx], mask);
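        /*
         * Only the 0 -> 1 transition of 'pending' requests a notification
         * so at most one notification is generated for a batch of pending
         * vectors.
         */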
2254         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2255
2256         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2257             level, "vmx_set_intr_ready");
2258         return (notify);
2259 }
2260
2261 static int
2262 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2263 {
2264         struct vlapic_vtx *vlapic_vtx;
2265         struct pir_desc *pir_desc;
2266         struct LAPIC *lapic;
2267         uint64_t pending, pirval;
2268         uint32_t ppr, vpr;
2269         int i;
2270
2271         /*
2272          * This function is only expected to be called from the 'HLT' exit
2273          * handler which does not care about the vector that is pending.
2274          */
2275         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2276
2277         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2278         pir_desc = vlapic_vtx->pir_desc;
2279
2280         pending = atomic_load_acq_long(&pir_desc->pending);
2281         if (!pending)
2282                 return (0);     /* common case */
2283
2284         /*
2285          * If there is an interrupt pending then it will be recognized only
2286          * if its priority is greater than the processor priority.
2287          *
2288          * Special case: if the processor priority is zero then any pending
2289          * interrupt will be recognized.
2290          */
2291         lapic = vlapic->apic_page;
2292         ppr = lapic->ppr & 0xf0;
2293         if (ppr == 0)
2294                 return (1);
2295
2296         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2297             lapic->ppr);
2298
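        /*
         * Scan the PIR from the highest 64-bit word down; the first non-zero
         * word yields the highest pending vector. Its priority class (the
         * upper nibble of the vector) must exceed the processor priority
         * class for the interrupt to be recognized.
         */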
2299         for (i = 3; i >= 0; i--) {
2300                 pirval = pir_desc->pir[i];
2301                 if (pirval != 0) {
2302                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2303                         return (vpr > ppr);
2304                 }
2305         }
2306         return (0);
2307 }
2308
2309 static void
2310 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2311 {
2312
2313         panic("vmx_intr_accepted: not expected to be called");
2314 }
2315
2316 static void
2317 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2318 {
2319
2320         ipi_cpu(hostcpu, pirvec);
2321 }
2322
2323 /*
2324  * Transfer the pending interrupts in the PIR descriptor to the IRR
2325  * in the virtual APIC page.
2326  */
2327 static void
2328 vmx_inject_pir(struct vlapic *vlapic)
2329 {
2330         struct vlapic_vtx *vlapic_vtx;
2331         struct pir_desc *pir_desc;
2332         struct LAPIC *lapic;
2333         uint64_t val, pirval;
2334         int rvi, pirbase;
2335         uint16_t intr_status_old, intr_status_new;
2336
2337         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2338         pir_desc = vlapic_vtx->pir_desc;
2339         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2340                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2341                     "no posted interrupt pending");
2342                 return;
2343         }
2344
2345         pirval = 0;
2346         lapic = vlapic->apic_page;
2347
2348         val = atomic_readandclear_long(&pir_desc->pir[0]);
2349         if (val != 0) {
2350                 lapic->irr0 |= val;
2351                 lapic->irr1 |= val >> 32;
2352                 pirbase = 0;
2353                 pirval = val;
2354         }
2355
2356         val = atomic_readandclear_long(&pir_desc->pir[1]);
2357         if (val != 0) {
2358                 lapic->irr2 |= val;
2359                 lapic->irr3 |= val >> 32;
2360                 pirbase = 64;
2361                 pirval = val;
2362         }
2363
2364         val = atomic_readandclear_long(&pir_desc->pir[2]);
2365         if (val != 0) {
2366                 lapic->irr4 |= val;
2367                 lapic->irr5 |= val >> 32;
2368                 pirbase = 128;
2369                 pirval = val;
2370         }
2371
2372         val = atomic_readandclear_long(&pir_desc->pir[3]);
2373         if (val != 0) {
2374                 lapic->irr6 |= val;
2375                 lapic->irr7 |= val >> 32;
2376                 pirbase = 192;
2377                 pirval = val;
2378         }
2379         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2380
2381         /*
2382          * Update RVI so the processor can evaluate pending virtual
2383          * interrupts on VM-entry.
2384          */
2385         if (pirval != 0) {
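                /*
                 * RVI occupies the low byte of the guest interrupt status
                 * field; only raise it if the newly posted vector has a
                 * higher priority than the current RVI.
                 */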
2386                 rvi = pirbase + flsl(pirval) - 1;
2387                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2388                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2389                 if (intr_status_new > intr_status_old) {
2390                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2391                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2392                             "guest_intr_status changed from 0x%04x to 0x%04x",
2393                             intr_status_old, intr_status_new);
2394                 }
2395         }
2396 }
2397
2398 static struct vlapic *
2399 vmx_vlapic_init(void *arg, int vcpuid)
2400 {
2401         struct vmx *vmx;
2402         struct vlapic *vlapic;
2403         struct vlapic_vtx *vlapic_vtx;
2404         
2405         vmx = arg;
2406
2407         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2408         vlapic->vm = vmx->vm;
2409         vlapic->vcpuid = vcpuid;
2410         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2411
2412         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2413         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2414
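        /*
         * Override the vlapic ops only when the corresponding hardware
         * assists are available; otherwise the generic vlapic code is used.
         */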
2415         if (virtual_interrupt_delivery) {
2416                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2417                 vlapic->ops.pending_intr = vmx_pending_intr;
2418                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2419         }
2420
2421         if (posted_interrupts)
2422                 vlapic->ops.post_intr = vmx_post_intr;
2423
2424         vlapic_init(vlapic);
2425
2426         return (vlapic);
2427 }
2428
2429 static void
2430 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2431 {
2432
2433         vlapic_cleanup(vlapic);
2434         free(vlapic, M_VLAPIC);
2435 }
2436
2437 struct vmm_ops vmm_ops_intel = {
2438         vmx_init,
2439         vmx_cleanup,
2440         vmx_restore,
2441         vmx_vminit,
2442         vmx_run,
2443         vmx_vmcleanup,
2444         vmx_getreg,
2445         vmx_setreg,
2446         vmx_getdesc,
2447         vmx_setdesc,
2448         vmx_inject,
2449         vmx_getcap,
2450         vmx_setcap,
2451         ept_vmspace_alloc,
2452         ept_vmspace_free,
2453         vmx_vlapic_init,
2454         vmx_vlapic_cleanup,
2455 };