1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include "vmm_host.h"
54 #include "vmm_ipi.h"
55 #include "vmm_msr.h"
56 #include "vmm_ktr.h"
57 #include "vmm_stat.h"
58 #include "vlapic.h"
59 #include "vlapic_priv.h"
60
61 #include "vmx_msr.h"
62 #include "ept.h"
63 #include "vmx_cpufunc.h"
64 #include "vmx.h"
65 #include "x86.h"
66 #include "vmx_controls.h"
67
68 #define PINBASED_CTLS_ONE_SETTING                                       \
69         (PINBASED_EXTINT_EXITING        |                               \
70          PINBASED_NMI_EXITING           |                               \
71          PINBASED_VIRTUAL_NMI)
72 #define PINBASED_CTLS_ZERO_SETTING      0
73
74 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
75         (PROCBASED_INT_WINDOW_EXITING   |                               \
76          PROCBASED_NMI_WINDOW_EXITING)
77
78 #define PROCBASED_CTLS_ONE_SETTING                                      \
79         (PROCBASED_SECONDARY_CONTROLS   |                               \
80          PROCBASED_IO_EXITING           |                               \
81          PROCBASED_MSR_BITMAPS          |                               \
82          PROCBASED_CTLS_WINDOW_SETTING)
83 #define PROCBASED_CTLS_ZERO_SETTING     \
84         (PROCBASED_CR3_LOAD_EXITING |   \
85         PROCBASED_CR3_STORE_EXITING |   \
86         PROCBASED_IO_BITMAPS)
87
88 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
89 #define PROCBASED_CTLS2_ZERO_SETTING    0
90
91 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
92         (VM_EXIT_HOST_LMA                       |                       \
93         VM_EXIT_SAVE_EFER                       |                       \
94         VM_EXIT_LOAD_EFER)
95
96 #define VM_EXIT_CTLS_ONE_SETTING                                        \
97         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
98         VM_EXIT_ACKNOWLEDGE_INTERRUPT           |                       \
99         VM_EXIT_SAVE_PAT                        |                       \
100         VM_EXIT_LOAD_PAT)
101 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
102
103 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
104
105 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
106         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
107         VM_ENTRY_LOAD_PAT)
108 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
109         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
110         VM_ENTRY_INTO_SMM                       |                       \
111         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
112
113 #define guest_msr_rw(vmx, msr) \
114         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
115
116 #define HANDLED         1
117 #define UNHANDLED       0
118
119 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
120 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
121
122 SYSCTL_DECL(_hw_vmm);
123 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
124
125 int vmxon_enabled[MAXCPU];
126 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
127
128 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
129 static uint32_t exit_ctls, entry_ctls;
130
131 static uint64_t cr0_ones_mask, cr0_zeros_mask;
132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
133              &cr0_ones_mask, 0, NULL);
134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
135              &cr0_zeros_mask, 0, NULL);
136
137 static uint64_t cr4_ones_mask, cr4_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
139              &cr4_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
141              &cr4_zeros_mask, 0, NULL);
142
143 static int vmx_no_patmsr;
144
145 static int vmx_initialized;
146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
147            &vmx_initialized, 0, "Intel VMX initialized");
148
149 /*
150  * Optional capabilities
151  */
152 static int cap_halt_exit;
153 static int cap_pause_exit;
154 static int cap_unrestricted_guest;
155 static int cap_monitor_trap;
156 static int cap_invpcid;
157
158 static int virtual_interrupt_delivery;
159 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
160     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
161
162 static int posted_interrupts;
163 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
164     &posted_interrupts, 0, "APICv posted interrupt support");
165
166 static int pirvec;
167 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
168     &pirvec, 0, "APICv posted interrupt vector");
169
170 static struct unrhdr *vpid_unr;
171 static u_int vpid_alloc_failed;
172 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
173             &vpid_alloc_failed, 0, NULL);
174
175 /*
176  * Use the last page below 4GB as the APIC access address. This address is
177  * occupied by the boot firmware so it is guaranteed that it will not conflict
178  * with a page in system memory.
179  */
180 #define APIC_ACCESS_ADDRESS     0xFFFFF000
181
182 static void vmx_inject_pir(struct vlapic *vlapic);
183
184 #ifdef KTR
185 static const char *
186 exit_reason_to_str(int reason)
187 {
188         static char reasonbuf[32];
189
190         switch (reason) {
191         case EXIT_REASON_EXCEPTION:
192                 return "exception";
193         case EXIT_REASON_EXT_INTR:
194                 return "extint";
195         case EXIT_REASON_TRIPLE_FAULT:
196                 return "triplefault";
197         case EXIT_REASON_INIT:
198                 return "init";
199         case EXIT_REASON_SIPI:
200                 return "sipi";
201         case EXIT_REASON_IO_SMI:
202                 return "iosmi";
203         case EXIT_REASON_SMI:
204                 return "smi";
205         case EXIT_REASON_INTR_WINDOW:
206                 return "intrwindow";
207         case EXIT_REASON_NMI_WINDOW:
208                 return "nmiwindow";
209         case EXIT_REASON_TASK_SWITCH:
210                 return "taskswitch";
211         case EXIT_REASON_CPUID:
212                 return "cpuid";
213         case EXIT_REASON_GETSEC:
214                 return "getsec";
215         case EXIT_REASON_HLT:
216                 return "hlt";
217         case EXIT_REASON_INVD:
218                 return "invd";
219         case EXIT_REASON_INVLPG:
220                 return "invlpg";
221         case EXIT_REASON_RDPMC:
222                 return "rdpmc";
223         case EXIT_REASON_RDTSC:
224                 return "rdtsc";
225         case EXIT_REASON_RSM:
226                 return "rsm";
227         case EXIT_REASON_VMCALL:
228                 return "vmcall";
229         case EXIT_REASON_VMCLEAR:
230                 return "vmclear";
231         case EXIT_REASON_VMLAUNCH:
232                 return "vmlaunch";
233         case EXIT_REASON_VMPTRLD:
234                 return "vmptrld";
235         case EXIT_REASON_VMPTRST:
236                 return "vmptrst";
237         case EXIT_REASON_VMREAD:
238                 return "vmread";
239         case EXIT_REASON_VMRESUME:
240                 return "vmresume";
241         case EXIT_REASON_VMWRITE:
242                 return "vmwrite";
243         case EXIT_REASON_VMXOFF:
244                 return "vmxoff";
245         case EXIT_REASON_VMXON:
246                 return "vmxon";
247         case EXIT_REASON_CR_ACCESS:
248                 return "craccess";
249         case EXIT_REASON_DR_ACCESS:
250                 return "draccess";
251         case EXIT_REASON_INOUT:
252                 return "inout";
253         case EXIT_REASON_RDMSR:
254                 return "rdmsr";
255         case EXIT_REASON_WRMSR:
256                 return "wrmsr";
257         case EXIT_REASON_INVAL_VMCS:
258                 return "invalvmcs";
259         case EXIT_REASON_INVAL_MSR:
260                 return "invalmsr";
261         case EXIT_REASON_MWAIT:
262                 return "mwait";
263         case EXIT_REASON_MTF:
264                 return "mtf";
265         case EXIT_REASON_MONITOR:
266                 return "monitor";
267         case EXIT_REASON_PAUSE:
268                 return "pause";
269         case EXIT_REASON_MCE:
270                 return "mce";
271         case EXIT_REASON_TPR:
272                 return "tpr";
273         case EXIT_REASON_APIC_ACCESS:
274                 return "apic-access";
275         case EXIT_REASON_GDTR_IDTR:
276                 return "gdtridtr";
277         case EXIT_REASON_LDTR_TR:
278                 return "ldtrtr";
279         case EXIT_REASON_EPT_FAULT:
280                 return "eptfault";
281         case EXIT_REASON_EPT_MISCONFIG:
282                 return "eptmisconfig";
283         case EXIT_REASON_INVEPT:
284                 return "invept";
285         case EXIT_REASON_RDTSCP:
286                 return "rdtscp";
287         case EXIT_REASON_VMX_PREEMPT:
288                 return "vmxpreempt";
289         case EXIT_REASON_INVVPID:
290                 return "invvpid";
291         case EXIT_REASON_WBINVD:
292                 return "wbinvd";
293         case EXIT_REASON_XSETBV:
294                 return "xsetbv";
295         case EXIT_REASON_APIC_WRITE:
296                 return "apic-write";
297         default:
298                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
299                 return (reasonbuf);
300         }
301 }
302 #endif  /* KTR */
303
304 u_long
305 vmx_fix_cr0(u_long cr0)
306 {
307
308         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
309 }
310
311 u_long
312 vmx_fix_cr4(u_long cr4)
313 {
314
315         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
316 }
317
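/*
 * Worked example, using illustrative MSR values: if the low 32 bits of
 * MSR_VMX_CR0_FIXED0 read 0x80000021 (CR0_PG, CR0_NE and CR0_PE) and those
 * of MSR_VMX_CR0_FIXED1 read 0xffffffff, then vmx_init() below derives
 * cr0_ones_mask = fixed0 & fixed1 = 0x80000021 and leaves no low CR0 bits
 * in cr0_zeros_mask (before the CR0_NW/CR0_CD and unrestricted-guest
 * adjustments), so vmx_fix_cr0(CR0_ET) returns 0x80000031.
 */
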
318 static void
319 vpid_free(int vpid)
320 {
321         if (vpid < 0 || vpid > 0xffff)
322                 panic("vpid_free: invalid vpid %d", vpid);
323
324         /*
325          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
326          * the unit number allocator.
327          */
328
329         if (vpid > VM_MAXCPU)
330                 free_unr(vpid_unr, vpid);
331 }
332
333 static void
334 vpid_alloc(uint16_t *vpid, int num)
335 {
336         int i, x;
337
338         if (num <= 0 || num > VM_MAXCPU)
339                 panic("invalid number of vpids requested: %d", num);
340
341         /*
342          * If the "enable vpid" execution control is not enabled then the
343          * VPID is required to be 0 for all vcpus.
344          */
345         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
346                 for (i = 0; i < num; i++)
347                         vpid[i] = 0;
348                 return;
349         }
350
351         /*
352          * Allocate a unique VPID for each vcpu from the unit number allocator.
353          */
354         for (i = 0; i < num; i++) {
355                 x = alloc_unr(vpid_unr);
356                 if (x == -1)
357                         break;
358                 else
359                         vpid[i] = x;
360         }
361
362         if (i < num) {
363                 atomic_add_int(&vpid_alloc_failed, 1);
364
365                 /*
366                  * If the unit number allocator does not have enough unique
367                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
368                  *
369                  * These VPIDs are not unique across VMs but this does not
370                  * affect correctness because the combined mappings are also
371                  * tagged with the EP4TA which is unique for each VM.
372                  *
373                  * It is still sub-optimal because the invvpid will invalidate
374                  * combined mappings for a particular VPID across all EP4TAs.
375                  */
376                 while (i-- > 0)
377                         vpid_free(vpid[i]);
378
379                 for (i = 0; i < num; i++)
380                         vpid[i] = i + 1;
381         }
382 }
383
384 static void
385 vpid_init(void)
386 {
387         /*
388          * VPID 0 is required when the "enable VPID" execution control is
389          * disabled.
390          *
391          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
392          * unit number allocator does not have sufficient unique VPIDs to
393          * satisfy the allocation.
394          *
395          * The remaining VPIDs are managed by the unit number allocator.
396          */
397         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
398 }
399
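/*
 * For example, if VM_MAXCPU is 16 the allocator hands out VPIDs in the
 * range [17, 0xffff]; vpid_alloc() falls back to the shared overflow
 * range [1, 16] only when that pool is exhausted, and vpid_free() returns
 * only allocator-owned VPIDs (those above VM_MAXCPU) to the pool.
 */
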
400 static void
401 msr_save_area_init(struct msr_entry *g_area, int *g_count)
402 {
403         int cnt;
404
405         static struct msr_entry guest_msrs[] = {
406                 { MSR_KGSBASE, 0, 0 },
407         };
408
409         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
410         if (cnt > GUEST_MSR_MAX_ENTRIES)
411                 panic("guest msr save area overrun");
412         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
413         *g_count = cnt;
414 }
415
416 static void
417 vmx_disable(void *arg __unused)
418 {
419         struct invvpid_desc invvpid_desc = { 0 };
420         struct invept_desc invept_desc = { 0 };
421
422         if (vmxon_enabled[curcpu]) {
423                 /*
424                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
425                  *
426                  * VMXON and VMXOFF are not required to invalidate any TLB
427                  * caching structures, so flush all EPT and VPID mappings here
428                  * to avoid retaining stale entries between distinct VMX episodes.
429                  */
430                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
431                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
432                 vmxoff();
433         }
434         load_cr4(rcr4() & ~CR4_VMXE);
435 }
436
437 static int
438 vmx_cleanup(void)
439 {
440         
441         if (pirvec != 0)
442                 vmm_ipi_free(pirvec);
443
444         if (vpid_unr != NULL) {
445                 delete_unrhdr(vpid_unr);
446                 vpid_unr = NULL;
447         }
448
449         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
450
451         return (0);
452 }
453
454 static void
455 vmx_enable(void *arg __unused)
456 {
457         int error;
458
459         load_cr4(rcr4() | CR4_VMXE);
460
461         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
462         error = vmxon(vmxon_region[curcpu]);
463         if (error == 0)
464                 vmxon_enabled[curcpu] = 1;
465 }
466
467 static void
468 vmx_restore(void)
469 {
470
471         if (vmxon_enabled[curcpu])
472                 vmxon(vmxon_region[curcpu]);
473 }
474
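/*
 * The capability probing in vmx_init() below depends on the layout of the
 * VMX capability MSRs described in the Intel SDM: the low 32 bits report
 * control bits that must be 1 and the high 32 bits report control bits
 * that may be 1.  The following is a simplified, illustrative sketch of
 * that check; the real helper, vmx_set_ctlreg(), lives in vmx_msr.c and
 * additionally consults the "true" capability MSRs.
 */
static __inline int
vmx_ctl_check_sketch(uint64_t cap, uint32_t ones_mask, uint32_t zeros_mask,
    uint32_t *retval)
{
        uint32_t must_be_one, may_be_one;

        must_be_one = (uint32_t)cap;            /* allowed 0-settings */
        may_be_one = (uint32_t)(cap >> 32);     /* allowed 1-settings */

        /* Every bit the caller requires to be 1 must be settable to 1. */
        if ((ones_mask & may_be_one) != ones_mask)
                return (EINVAL);

        /* Every bit the caller requires to be 0 must be settable to 0. */
        if ((zeros_mask & must_be_one) != 0)
                return (EINVAL);

        /* Start from the mandatory-1 bits and add the requested ones. */
        *retval = must_be_one | ones_mask;
        return (0);
}
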
475 static int
476 vmx_init(int ipinum)
477 {
478         int error, use_tpr_shadow;
479         uint64_t fixed0, fixed1, feature_control;
480         uint32_t tmp, procbased2_vid_bits;
481
482         /* CPUID.1:ECX[bit 5] must be 1 for the processor to support VMX */
483         if (!(cpu_feature2 & CPUID2_VMX)) {
484                 printf("vmx_init: processor does not support VMX operation\n");
485                 return (ENXIO);
486         }
487
488         /*
489          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
490          * are set (bits 0 and 2 respectively).
491          */
492         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
493         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
494             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
495                 printf("vmx_init: VMX operation disabled by BIOS\n");
496                 return (ENXIO);
497         }
498
499         /* Check support for primary processor-based VM-execution controls */
500         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
501                                MSR_VMX_TRUE_PROCBASED_CTLS,
502                                PROCBASED_CTLS_ONE_SETTING,
503                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
504         if (error) {
505                 printf("vmx_init: processor does not support desired primary "
506                        "processor-based controls\n");
507                 return (error);
508         }
509
510         /* Clear the processor-based ctl bits that are set on demand */
511         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
512
513         /* Check support for secondary processor-based VM-execution controls */
514         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
515                                MSR_VMX_PROCBASED_CTLS2,
516                                PROCBASED_CTLS2_ONE_SETTING,
517                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
518         if (error) {
519                 printf("vmx_init: processor does not support desired secondary "
520                        "processor-based controls\n");
521                 return (error);
522         }
523
524         /* Check support for VPID */
525         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
526                                PROCBASED2_ENABLE_VPID, 0, &tmp);
527         if (error == 0)
528                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
529
530         /* Check support for pin-based VM-execution controls */
531         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
532                                MSR_VMX_TRUE_PINBASED_CTLS,
533                                PINBASED_CTLS_ONE_SETTING,
534                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
535         if (error) {
536                 printf("vmx_init: processor does not support desired "
537                        "pin-based controls\n");
538                 return (error);
539         }
540
541         /* Check support for VM-exit controls */
542         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
543                                VM_EXIT_CTLS_ONE_SETTING,
544                                VM_EXIT_CTLS_ZERO_SETTING,
545                                &exit_ctls);
546         if (error) {
547                 /* Try again without the PAT MSR bits */
548                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
549                                        MSR_VMX_TRUE_EXIT_CTLS,
550                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
551                                        VM_EXIT_CTLS_ZERO_SETTING,
552                                        &exit_ctls);
553                 if (error) {
554                         printf("vmx_init: processor does not support desired "
555                                "exit controls\n");
556                         return (error);
557                 } else {
558                         if (bootverbose)
559                                 printf("vmm: PAT MSR access not supported\n");
560                         guest_msr_valid(MSR_PAT);
561                         vmx_no_patmsr = 1;
562                 }
563         }
564
565         /* Check support for VM-entry controls */
566         if (!vmx_no_patmsr) {
567                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
568                                        MSR_VMX_TRUE_ENTRY_CTLS,
569                                        VM_ENTRY_CTLS_ONE_SETTING,
570                                        VM_ENTRY_CTLS_ZERO_SETTING,
571                                        &entry_ctls);
572         } else {
573                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
574                                        MSR_VMX_TRUE_ENTRY_CTLS,
575                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
576                                        VM_ENTRY_CTLS_ZERO_SETTING,
577                                        &entry_ctls);
578         }
579
580         if (error) {
581                 printf("vmx_init: processor does not support desired "
582                        "entry controls\n");
583                 return (error);
584         }
585
586         /*
587          * Check support for optional features by testing them
588          * as individual bits
589          */
590         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
591                                         MSR_VMX_TRUE_PROCBASED_CTLS,
592                                         PROCBASED_HLT_EXITING, 0,
593                                         &tmp) == 0);
594
595         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
596                                         MSR_VMX_PROCBASED_CTLS,
597                                         PROCBASED_MTF, 0,
598                                         &tmp) == 0);
599
600         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
601                                          MSR_VMX_TRUE_PROCBASED_CTLS,
602                                          PROCBASED_PAUSE_EXITING, 0,
603                                          &tmp) == 0);
604
605         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
606                                         MSR_VMX_PROCBASED_CTLS2,
607                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
608                                         &tmp) == 0);
609
610         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
611             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
612             &tmp) == 0);
613
614         /*
615          * Check support for virtual interrupt delivery.
616          */
617         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
618             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
619             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
620             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
621
622         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
623             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
624             &tmp) == 0);
625
626         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
627             procbased2_vid_bits, 0, &tmp);
628         if (error == 0 && use_tpr_shadow) {
629                 virtual_interrupt_delivery = 1;
630                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
631                     &virtual_interrupt_delivery);
632         }
633
634         if (virtual_interrupt_delivery) {
635                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
636                 procbased_ctls2 |= procbased2_vid_bits;
637                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
638
639                 /*
640                  * Check for Posted Interrupts only if Virtual Interrupt
641                  * Delivery is enabled.
642                  */
643                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
644                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
645                     &tmp);
646                 if (error == 0) {
647                         pirvec = vmm_ipi_alloc();
648                         if (pirvec == 0) {
649                                 if (bootverbose) {
650                                         printf("vmx_init: unable to allocate "
651                                             "posted interrupt vector\n");
652                                 }
653                         } else {
654                                 posted_interrupts = 1;
655                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
656                                     &posted_interrupts);
657                         }
658                 }
659         }
660
661         if (posted_interrupts)
662                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
663
664         /* Initialize EPT */
665         error = ept_init(ipinum);
666         if (error) {
667                 printf("vmx_init: ept initialization failed (%d)\n", error);
668                 return (error);
669         }
670
671         /*
672          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
673          */
674         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
675         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
676         cr0_ones_mask = fixed0 & fixed1;
677         cr0_zeros_mask = ~fixed0 & ~fixed1;
678
679         /*
680          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
681          * if unrestricted guest execution is allowed.
682          */
683         if (cap_unrestricted_guest)
684                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
685
686         /*
687          * Do not allow the guest to set CR0_NW or CR0_CD.
688          */
689         cr0_zeros_mask |= (CR0_NW | CR0_CD);
690
691         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
692         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
693         cr4_ones_mask = fixed0 & fixed1;
694         cr4_zeros_mask = ~fixed0 & ~fixed1;
695
696         vpid_init();
697
698         /* enable VMX operation */
699         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
700
701         vmx_initialized = 1;
702
703         return (0);
704 }
705
706 static void
707 vmx_trigger_hostintr(int vector)
708 {
709         uintptr_t func;
710         struct gate_descriptor *gd;
711
712         gd = &idt[vector];
713
714         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
715             "invalid vector %d", vector));
716         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
717             vector));
718         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
719             "has invalid type %d", vector, gd->gd_type));
720         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
721             "has invalid dpl %d", vector, gd->gd_dpl));
722         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
723             "for vector %d has invalid selector %d", vector, gd->gd_selector));
724         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
725             "IST %d", vector, gd->gd_ist));
726
727         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
728         vmx_call_isr(func);
729 }
730
731 static int
732 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
733 {
734         int error, mask_ident, shadow_ident;
735         uint64_t mask_value;
736
737         if (which != 0 && which != 4)
738                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
739
740         if (which == 0) {
741                 mask_ident = VMCS_CR0_MASK;
742                 mask_value = cr0_ones_mask | cr0_zeros_mask;
743                 shadow_ident = VMCS_CR0_SHADOW;
744         } else {
745                 mask_ident = VMCS_CR4_MASK;
746                 mask_value = cr4_ones_mask | cr4_zeros_mask;
747                 shadow_ident = VMCS_CR4_SHADOW;
748         }
749
750         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
751         if (error)
752                 return (error);
753
754         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
755         if (error)
756                 return (error);
757
758         return (0);
759 }
760 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
761 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
762
763 static void *
764 vmx_vminit(struct vm *vm, pmap_t pmap)
765 {
766         uint16_t vpid[VM_MAXCPU];
767         int i, error, guest_msr_count;
768         struct vmx *vmx;
769         struct vmcs *vmcs;
770
771         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
772         if ((uintptr_t)vmx & PAGE_MASK) {
773                 panic("malloc of struct vmx not aligned on %d byte boundary",
774                       PAGE_SIZE);
775         }
776         vmx->vm = vm;
777
778         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
779
780         /*
781          * Clean up EPTP-tagged guest physical and combined mappings
782          *
783          * VMX transitions are not required to invalidate any guest physical
784          * mappings. So, it may be possible for stale guest physical mappings
785          * to be present in the processor TLBs.
786          *
787          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
788          */
789         ept_invalidate_mappings(vmx->eptp);
790
791         msr_bitmap_initialize(vmx->msr_bitmap);
792
793         /*
794          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
795          * The guest FSBASE and GSBASE are saved and restored during
796          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
797          * always restored from the vmcs host state area on vm-exit.
798          *
799          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
800          * how they are saved/restored, so they can be directly accessed by the
801          * guest.
802          *
803          * Guest KGSBASE is saved and restored in the guest MSR save area.
804          * Host KGSBASE is restored before returning to userland from the pcb.
805          * There will be a window of time when we are executing in the host
806          * kernel context with a value of KGSBASE from the guest. This is ok
807          * because the value of KGSBASE is inconsequential in kernel context.
808          *
809          * MSR_EFER is saved and restored in the guest VMCS area on a
810          * VM exit and entry respectively. It is also restored from the
811          * host VMCS area on a VM exit.
812          */
813         if (guest_msr_rw(vmx, MSR_GSBASE) ||
814             guest_msr_rw(vmx, MSR_FSBASE) ||
815             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
816             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
817             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
818             guest_msr_rw(vmx, MSR_KGSBASE) ||
819             guest_msr_rw(vmx, MSR_EFER))
820                 panic("vmx_vminit: error setting guest msr access");
821
822         /*
823          * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
824          * and entry respectively. It is also restored from the host VMCS
825          * area on a VM exit. However, if running on a system with no
826          * MSR_PAT save/restore support, leave access disabled so accesses
827          * will be trapped.
828          */
829         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
830                 panic("vmx_vminit: error setting guest pat msr access");
831
832         vpid_alloc(vpid, VM_MAXCPU);
833
834         if (virtual_interrupt_delivery) {
835                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
836                     APIC_ACCESS_ADDRESS);
837                 /* XXX this should really return an error to the caller */
838                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
839         }
840
841         for (i = 0; i < VM_MAXCPU; i++) {
842                 vmcs = &vmx->vmcs[i];
843                 vmcs->identifier = vmx_revision();
844                 error = vmclear(vmcs);
845                 if (error != 0) {
846                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
847                               error, i);
848                 }
849
850                 error = vmcs_init(vmcs);
851                 KASSERT(error == 0, ("vmcs_init error %d", error));
852
853                 VMPTRLD(vmcs);
854                 error = 0;
855                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
856                 error += vmwrite(VMCS_EPTP, vmx->eptp);
857                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
858                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
859                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
860                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
861                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
862                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
863                 error += vmwrite(VMCS_VPID, vpid[i]);
864                 if (virtual_interrupt_delivery) {
865                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
866                         error += vmwrite(VMCS_VIRTUAL_APIC,
867                             vtophys(&vmx->apic_page[i]));
868                         error += vmwrite(VMCS_EOI_EXIT0, 0);
869                         error += vmwrite(VMCS_EOI_EXIT1, 0);
870                         error += vmwrite(VMCS_EOI_EXIT2, 0);
871                         error += vmwrite(VMCS_EOI_EXIT3, 0);
872                 }
873                 if (posted_interrupts) {
874                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
875                         error += vmwrite(VMCS_PIR_DESC,
876                             vtophys(&vmx->pir_desc[i]));
877                 }
878                 VMCLEAR(vmcs);
879                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
880
881                 vmx->cap[i].set = 0;
882                 vmx->cap[i].proc_ctls = procbased_ctls;
883                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
884
885                 vmx->state[i].lastcpu = -1;
886                 vmx->state[i].vpid = vpid[i];
887
888                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
889
890                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
891                     guest_msr_count);
892                 if (error != 0)
893                         panic("vmcs_set_msr_save error %d", error);
894
895                 /*
896                  * Set up the CR0/4 shadows, and init the read shadow
897                  * to the power-on register value from the Intel Sys Arch.
898                  *  CR0 - 0x60000010
899                  *  CR4 - 0
900                  */
901                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
902                 if (error != 0)
903                         panic("vmx_setup_cr0_shadow %d", error);
904
905                 error = vmx_setup_cr4_shadow(vmcs, 0);
906                 if (error != 0)
907                         panic("vmx_setup_cr4_shadow %d", error);
908
909                 vmx->ctx[i].pmap = pmap;
910         }
911
912         return (vmx);
913 }
914
915 static int
916 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
917 {
918         int handled, func;
919         
920         func = vmxctx->guest_rax;
921
922         handled = x86_emulate_cpuid(vm, vcpu,
923                                     (uint32_t*)(&vmxctx->guest_rax),
924                                     (uint32_t*)(&vmxctx->guest_rbx),
925                                     (uint32_t*)(&vmxctx->guest_rcx),
926                                     (uint32_t*)(&vmxctx->guest_rdx));
927         return (handled);
928 }
929
930 static __inline void
931 vmx_run_trace(struct vmx *vmx, int vcpu)
932 {
933 #ifdef KTR
934         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
935 #endif
936 }
937
938 static __inline void
939 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
940                int handled)
941 {
942 #ifdef KTR
943         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
944                  handled ? "handled" : "unhandled",
945                  exit_reason_to_str(exit_reason), rip);
946 #endif
947 }
948
949 static __inline void
950 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
951 {
952 #ifdef KTR
953         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
954 #endif
955 }
956
957 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
958
959 static void
960 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
961 {
962         struct vmxstate *vmxstate;
963         struct invvpid_desc invvpid_desc;
964
965         vmxstate = &vmx->state[vcpu];
966         if (vmxstate->lastcpu == curcpu)
967                 return;
968
969         vmxstate->lastcpu = curcpu;
970
971         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
972
973         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
974         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
975         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
976
977         /*
978          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
979          *
980          * We do this because this vcpu was executing on a different host
981          * cpu when it last ran. We do not track whether it invalidated
982          * mappings associated with its 'vpid' during that run. So we must
983          * assume that the mappings associated with 'vpid' on 'curcpu' are
984          * stale and invalidate them.
985          *
986          * Note that we incur this penalty only when the scheduler chooses to
987          * move the thread associated with this vcpu between host cpus.
988          *
989          * Note also that this will invalidate mappings tagged with 'vpid'
990          * for "all" EP4TAs.
991          */
992         if (vmxstate->vpid != 0) {
993                 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
994                         invvpid_desc._res1 = 0;
995                         invvpid_desc._res2 = 0;
996                         invvpid_desc.vpid = vmxstate->vpid;
997                         invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
998                 } else {
999                         /*
1000                          * The invvpid can be skipped if an invept is going to
1001                          * be performed before entering the guest. The invept
1002                          * will invalidate combined mappings tagged with
1003                          * 'vmx->eptp' for all vpids.
1004                          */
1005                         vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1006                 }
1007         }
1008 }
1009
1010 /*
1011  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1012  */
1013 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1014
1015 static void __inline
1016 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1017 {
1018
1019         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1020                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1021                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1022                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1023         }
1024 }
1025
1026 static void __inline
1027 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1028 {
1029
1030         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1031             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1032         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1033         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1034         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1035 }
1036
1037 static void __inline
1038 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1039 {
1040
1041         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1042                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1043                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1044                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1045         }
1046 }
1047
1048 static void __inline
1049 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1050 {
1051
1052         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1053             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1054         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1055         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1056         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1057 }
1058
1059 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1060                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1061 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1062                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1063
1064 static void
1065 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1066 {
1067         uint32_t gi, info;
1068
1069         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1070         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1071             "interruptibility-state %#x", gi));
1072
1073         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1074         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1075             "VM-entry interruption information %#x", info));
1076
1077         /*
1078          * Inject the virtual NMI. The vector must be the NMI IDT entry
1079          * or the VMCS entry check will fail.
1080          */
1081         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1082         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1083
1084         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1085
1086         /* Clear the request */
1087         vm_nmi_clear(vmx->vm, vcpu);
1088 }
1089
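/*
 * For reference, the VM-entry interruption-information field written above
 * uses the Intel-defined layout: vector in bits 7:0, interruption type in
 * bits 10:8 and the valid bit in bit 31.  For the virtual NMI this works
 * out to IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID == 0x80000202, i.e.
 * vector 2, type 2 (NMI), valid.
 */
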
1090 static void
1091 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1092 {
1093         int vector, need_nmi_exiting;
1094         uint64_t rflags;
1095         uint32_t gi, info;
1096
1097         if (vm_nmi_pending(vmx->vm, vcpu)) {
1098                 /*
1099                  * If there are no conditions blocking NMI injection then
1100                  * inject it directly here otherwise enable "NMI window
1101                  * exiting" to inject it as soon as we can.
1102                  *
1103                  * We also check for STI_BLOCKING because some implementations
1104                  * don't allow NMI injection in this case. If we are running
1105                  * on a processor that doesn't have this restriction it will
1106                  * immediately exit and the NMI will be injected in the
1107                  * "NMI window exiting" handler.
1108                  */
1109                 need_nmi_exiting = 1;
1110                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1111                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1112                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1113                         if ((info & VMCS_INTR_VALID) == 0) {
1114                                 vmx_inject_nmi(vmx, vcpu);
1115                                 need_nmi_exiting = 0;
1116                         } else {
1117                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1118                                     "due to VM-entry intr info %#x", info);
1119                         }
1120                 } else {
1121                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1122                             "Guest Interruptibility-state %#x", gi);
1123                 }
1124
1125                 if (need_nmi_exiting)
1126                         vmx_set_nmi_window_exiting(vmx, vcpu);
1127         }
1128
1129         if (virtual_interrupt_delivery) {
1130                 vmx_inject_pir(vlapic);
1131                 return;
1132         }
1133
1134         /*
1135          * If interrupt-window exiting is already in effect then don't bother
1136          * checking for pending interrupts. This is just an optimization and
1137          * not needed for correctness.
1138          */
1139         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1140                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1141                     "pending int_window_exiting");
1142                 return;
1143         }
1144
1145         /* Ask the local apic for a vector to inject */
1146         if (!vlapic_pending_intr(vlapic, &vector))
1147                 return;
1148
1149         KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1150
1151         /* Check RFLAGS.IF and the interruptibility state of the guest */
1152         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1153         if ((rflags & PSL_I) == 0) {
1154                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1155                     "rflags %#lx", vector, rflags);
1156                 goto cantinject;
1157         }
1158
1159         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1160         if (gi & HWINTR_BLOCKING) {
1161                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1162                     "Guest Interruptibility-state %#x", vector, gi);
1163                 goto cantinject;
1164         }
1165
1166         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1167         if (info & VMCS_INTR_VALID) {
1168                 /*
1169                  * This is expected and could happen for multiple reasons:
1170                  * - A vectoring VM-entry was aborted due to astpending
1171                  * - A VM-exit happened during event injection.
1172                  * - An NMI was injected above or after "NMI window exiting"
1173                  */
1174                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1175                     "VM-entry intr info %#x", vector, info);
1176                 goto cantinject;
1177         }
1178
1179         /* Inject the interrupt */
1180         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1181         info |= vector;
1182         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1183
1184         /* Update the Local APIC ISR */
1185         vlapic_intr_accepted(vlapic, vector);
1186
1187         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1188
1189         return;
1190
1191 cantinject:
1192         /*
1193          * Set the Interrupt Window Exiting execution control so we can inject
1194          * the interrupt as soon as the blocking condition goes away.
1195          */
1196         vmx_set_int_window_exiting(vmx, vcpu);
1197 }
1198
1199 /*
1200  * If the Virtual NMIs execution control is '1' then the logical processor
1201  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1202  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1203  * virtual-NMI blocking.
1204  *
1205  * This unblocking occurs even if the IRET causes a fault. In this case the
1206  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1207  */
1208 static void
1209 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1210 {
1211         uint32_t gi;
1212
1213         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1214         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1215         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1216         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1217 }
1218
1219 static void
1220 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1221 {
1222         uint32_t gi;
1223
1224         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1225         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1226         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1227         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1228 }
1229
1230 static int
1231 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1232 {
1233         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1234         uint64_t crval, regval, ones_mask, zeros_mask;
1235         const struct vmxctx *vmxctx;
1236
1237         /* We only handle mov to %cr0 or %cr4 at this time */
1238         if ((exitqual & 0xf0) != 0x00)
1239                 return (UNHANDLED);
1240
1241         cr = exitqual & 0xf;
1242         if (cr != 0 && cr != 4)
1243                 return (UNHANDLED);
1244
1245         vmxctx = &vmx->ctx[vcpu];
1246
1247         /*
1248          * We must use vmcs_write() directly here because vmcs_setreg() will
1249          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1250          */
1251         switch ((exitqual >> 8) & 0xf) {
1252         case 0:
1253                 regval = vmxctx->guest_rax;
1254                 break;
1255         case 1:
1256                 regval = vmxctx->guest_rcx;
1257                 break;
1258         case 2:
1259                 regval = vmxctx->guest_rdx;
1260                 break;
1261         case 3:
1262                 regval = vmxctx->guest_rbx;
1263                 break;
1264         case 4:
1265                 regval = vmcs_read(VMCS_GUEST_RSP);
1266                 break;
1267         case 5:
1268                 regval = vmxctx->guest_rbp;
1269                 break;
1270         case 6:
1271                 regval = vmxctx->guest_rsi;
1272                 break;
1273         case 7:
1274                 regval = vmxctx->guest_rdi;
1275                 break;
1276         case 8:
1277                 regval = vmxctx->guest_r8;
1278                 break;
1279         case 9:
1280                 regval = vmxctx->guest_r9;
1281                 break;
1282         case 10:
1283                 regval = vmxctx->guest_r10;
1284                 break;
1285         case 11:
1286                 regval = vmxctx->guest_r11;
1287                 break;
1288         case 12:
1289                 regval = vmxctx->guest_r12;
1290                 break;
1291         case 13:
1292                 regval = vmxctx->guest_r13;
1293                 break;
1294         case 14:
1295                 regval = vmxctx->guest_r14;
1296                 break;
1297         case 15:
1298                 regval = vmxctx->guest_r15;
1299                 break;
1300         }
1301
1302         if (cr == 0) {
1303                 ones_mask = cr0_ones_mask;
1304                 zeros_mask = cr0_zeros_mask;
1305                 vmcs_guest_cr = VMCS_GUEST_CR0;
1306                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1307         } else {
1308                 ones_mask = cr4_ones_mask;
1309                 zeros_mask = cr4_zeros_mask;
1310                 vmcs_guest_cr = VMCS_GUEST_CR4;
1311                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1312         }
1313         vmcs_write(vmcs_shadow_cr, regval);
1314
1315         crval = regval | ones_mask;
1316         crval &= ~zeros_mask;
1317         vmcs_write(vmcs_guest_cr, crval);
1318
1319         if (cr == 0 && regval & CR0_PG) {
1320                 uint64_t efer, entry_ctls;
1321
1322                 /*
1323                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1324                  * the "IA-32e mode guest" bit in VM-entry control must be
1325                  * equal.
1326                  */
1327                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1328                 if (efer & EFER_LME) {
1329                         efer |= EFER_LMA;
1330                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1331                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1332                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1333                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1334                 }
1335         }
1336
1337         return (HANDLED);
1338 }
1339
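/*
 * Example: a guest "mov %rbx, %cr4" yields an exit qualification of 0x304
 * in the layout decoded above: bits 3:0 = 4 (the control register),
 * bits 7:4 = 0 (a mov-to-CR access) and bits 11:8 = 3 (%rbx as the source
 * register), so the handler shadows %cr4 with the guest value and writes
 * the fixed-up value into VMCS_GUEST_CR4.
 */
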
1340 static int
1341 ept_fault_type(uint64_t ept_qual)
1342 {
1343         int fault_type;
1344
1345         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1346                 fault_type = VM_PROT_WRITE;
1347         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1348                 fault_type = VM_PROT_EXECUTE;
1349         else
1350                 fault_type = VM_PROT_READ;
1351
1352         return (fault_type);
1353 }
1354
1355 static boolean_t
1356 ept_emulation_fault(uint64_t ept_qual)
1357 {
1358         int read, write;
1359
1360         /* EPT fault on an instruction fetch doesn't make sense here */
1361         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1362                 return (FALSE);
1363
1364         /* EPT fault must be a read fault or a write fault */
1365         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1366         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1367         if ((read | write) == 0)
1368                 return (FALSE);
1369
1370         /*
1371          * The EPT violation must have been caused by accessing a
1372          * guest-physical address that is a translation of a guest-linear
1373          * address.
1374          */
1375         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1376             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1377                 return (FALSE);
1378         }
1379
1380         return (TRUE);
1381 }
1382
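/*
 * Example: an EPT violation whose exit qualification has
 * EPT_VIOLATION_DATA_WRITE, EPT_VIOLATION_GLA_VALID and
 * EPT_VIOLATION_XLAT_VALID set is classified by ept_fault_type() as
 * VM_PROT_WRITE and accepted by ept_emulation_fault() as a candidate for
 * instruction emulation.
 */
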
1383 static int
1384 vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1385 {
1386         int error, handled, offset;
1387         bool retu;
1388
1389         if (!virtual_interrupt_delivery)
1390                 return (UNHANDLED);
1391
1392         handled = 1;
1393         offset = APIC_WRITE_OFFSET(qual);
1394         switch (offset) {
1395         case APIC_OFFSET_ID:
1396                 vlapic_id_write_handler(vlapic);
1397                 break;
1398         case APIC_OFFSET_LDR:
1399                 vlapic_ldr_write_handler(vlapic);
1400                 break;
1401         case APIC_OFFSET_DFR:
1402                 vlapic_dfr_write_handler(vlapic);
1403                 break;
1404         case APIC_OFFSET_SVR:
1405                 vlapic_svr_write_handler(vlapic);
1406                 break;
1407         case APIC_OFFSET_ESR:
1408                 vlapic_esr_write_handler(vlapic);
1409                 break;
1410         case APIC_OFFSET_ICR_LOW:
1411                 retu = false;
1412                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1413                 if (error != 0 || retu)
1414                         handled = 0;
1415                 break;
1416         case APIC_OFFSET_CMCI_LVT:
1417         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1418                 vlapic_lvt_write_handler(vlapic, offset);
1419                 break;
1420         case APIC_OFFSET_TIMER_ICR:
1421                 vlapic_icrtmr_write_handler(vlapic);
1422                 break;
1423         case APIC_OFFSET_TIMER_DCR:
1424                 vlapic_dcr_write_handler(vlapic);
1425                 break;
1426         default:
1427                 handled = 0;
1428                 break;
1429         }
1430         return (handled);
1431 }
1432
1433 static bool
1434 apic_access_fault(uint64_t gpa)
1435 {
1436
1437         if (virtual_interrupt_delivery &&
1438             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1439                 return (true);
1440         else
1441                 return (false);
1442 }
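
/*
 * A minimal sketch combining the check above with the vm_mem_allocated()
 * call used later in vmx_exit_process(): an EPT fault whose gpa is backed
 * by guest memory or falls on the virtual-APIC page is treated as a nested
 * page fault, anything else as an MMIO emulation candidate. The helper name
 * is hypothetical.
 */
static __inline bool
vmx_gpa_is_backed(struct vm *vm, uint64_t gpa)
{

        return (vm_mem_allocated(vm, gpa) || apic_access_fault(gpa));
}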
1443
1444 static int
1445 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1446 {
1447         uint64_t qual;
1448         int access_type, offset, allowed;
1449
1450         if (!virtual_interrupt_delivery)
1451                 return (UNHANDLED);
1452
1453         qual = vmexit->u.vmx.exit_qualification;
1454         access_type = APIC_ACCESS_TYPE(qual);
1455         offset = APIC_ACCESS_OFFSET(qual);
1456
1457         allowed = 0;
1458         if (access_type == 0) {
1459                 /*
1460                  * Read data access to the following registers is expected.
1461                  */
1462                 switch (offset) {
1463                 case APIC_OFFSET_APR:
1464                 case APIC_OFFSET_PPR:
1465                 case APIC_OFFSET_RRR:
1466                 case APIC_OFFSET_CMCI_LVT:
1467                 case APIC_OFFSET_TIMER_CCR:
1468                         allowed = 1;
1469                         break;
1470                 default:
1471                         break;
1472                 }
1473         } else if (access_type == 1) {
1474                 /*
1475                  * Write data access to the following registers is expected.
1476                  */
1477                 switch (offset) {
1478                 case APIC_OFFSET_VER:
1479                 case APIC_OFFSET_APR:
1480                 case APIC_OFFSET_PPR:
1481                 case APIC_OFFSET_RRR:
1482                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1483                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1484                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1485                 case APIC_OFFSET_CMCI_LVT:
1486                 case APIC_OFFSET_TIMER_CCR:
1487                         allowed = 1;
1488                         break;
1489                 default:
1490                         break;
1491                 }
1492         }
1493
1494         if (allowed) {
1495                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1496                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1497                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1498                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1499         }
1500
1501         /*
1502          * Regardless of whether the APIC-access is allowed this handler
1503          * always returns UNHANDLED:
1504          * - if the access is allowed then it is handled by emulating the
1505          *   instruction that caused the VM-exit (outside the critical section)
1506          * - if the access is not allowed then it will be converted to an
1507          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1508          */
1509         return (UNHANDLED);
1510 }
1511
1512 static int
1513 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1514 {
1515         int error, handled;
1516         struct vmxctx *vmxctx;
1517         struct vlapic *vlapic;
1518         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1519         uint64_t qual, gpa;
1520         bool retu;
1521
1522         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1523         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1524
1525         handled = 0;
1526         vmxctx = &vmx->ctx[vcpu];
1527
1528         qual = vmexit->u.vmx.exit_qualification;
1529         reason = vmexit->u.vmx.exit_reason;
1530         vmexit->exitcode = VM_EXITCODE_BOGUS;
1531
1532         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1533
1534         /*
1535          * VM exits that could be triggered during event injection on the
1536          * previous VM entry need to be handled specially by re-injecting
1537          * the event.
1538          *
1539          * See "Information for VM Exits During Event Delivery" in Intel SDM
1540          * for details.
1541          */
1542         switch (reason) {
1543         case EXIT_REASON_EPT_FAULT:
1544         case EXIT_REASON_EPT_MISCONFIG:
1545         case EXIT_REASON_APIC_ACCESS:
1546         case EXIT_REASON_TASK_SWITCH:
1547         case EXIT_REASON_EXCEPTION:
1548                 idtvec_info = vmcs_idt_vectoring_info();
1549                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1550                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1551                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1552                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1553                                 idtvec_err = vmcs_idt_vectoring_err();
1554                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1555                                     idtvec_err);
1556                         }
1557                         /*
1558                          * If 'virtual NMIs' are being used and the VM-exit
1559                          * happened while injecting an NMI during the previous
1560                          * VM-entry, then clear "blocking by NMI" in the Guest
1561                          * Interruptibility-state.
1562                          */
1563                         if ((idtvec_info & VMCS_INTR_T_MASK) ==
1564                             VMCS_INTR_T_NMI) {
1565                                  vmx_clear_nmi_blocking(vmx, vcpu);
1566                         }
1567                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1568                 }
                     break;
1569         default:
1570                 idtvec_info = 0;
1571                 break;
1572         }
1573
1574         switch (reason) {
1575         case EXIT_REASON_CR_ACCESS:
1576                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1577                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1578                 break;
1579         case EXIT_REASON_RDMSR:
1580                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1581                 retu = false;
1582                 ecx = vmxctx->guest_rcx;
1583                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1584                 if (error) {
1585                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1586                         vmexit->u.msr.code = ecx;
1587                 } else if (!retu) {
1588                         handled = 1;
1589                 } else {
1590                         /* Return to userspace with a valid exitcode */
1591                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1592                             ("emulate_rdmsr retu with bogus exitcode"));
1593                 }
1594                 break;
1595         case EXIT_REASON_WRMSR:
1596                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1597                 retu = false;
1598                 eax = vmxctx->guest_rax;
1599                 ecx = vmxctx->guest_rcx;
1600                 edx = vmxctx->guest_rdx;
1601                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1602                     (uint64_t)edx << 32 | eax, &retu);
1603                 if (error) {
1604                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1605                         vmexit->u.msr.code = ecx;
1606                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1607                 } else if (!retu) {
1608                         handled = 1;
1609                 } else {
1610                         /* Return to userspace with a valid exitcode */
1611                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1612                             ("emulate_wrmsr retu with bogus exitcode"));
1613                 }
1614                 break;
1615         case EXIT_REASON_HLT:
1616                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1617                 vmexit->exitcode = VM_EXITCODE_HLT;
1618                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1619                 break;
1620         case EXIT_REASON_MTF:
1621                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1622                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1623                 break;
1624         case EXIT_REASON_PAUSE:
1625                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1626                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1627                 break;
1628         case EXIT_REASON_INTR_WINDOW:
1629                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1630                 vmx_clear_int_window_exiting(vmx, vcpu);
1631                 return (1);
1632         case EXIT_REASON_EXT_INTR:
1633                 /*
1634                  * External interrupts serve only to cause VM exits and allow
1635                  * the host interrupt handler to run.
1636                  *
1637                  * If this external interrupt triggers a virtual interrupt
1638                  * to a VM, then that state will be recorded by the
1639                  * host interrupt handler in the VM's softc. We will inject
1640                  * this virtual interrupt during the subsequent VM enter.
1641                  */
1642                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1643                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1644                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1645                     ("VM exit interruption info invalid: %#x", intr_info));
1646                 vmx_trigger_hostintr(intr_info & 0xff);
1647
1648                 /*
1649                  * This is special. We want to treat this as a 'handled'
1650                  * VM-exit but not increment the instruction pointer.
1651                  */
1652                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1653                 return (1);
1654         case EXIT_REASON_NMI_WINDOW:
1655                 /* Exit to allow the pending virtual NMI to be injected */
1656                 if (vm_nmi_pending(vmx->vm, vcpu))
1657                         vmx_inject_nmi(vmx, vcpu);
1658                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1659                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1660                 return (1);
1661         case EXIT_REASON_INOUT:
1662                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1663                 vmexit->exitcode = VM_EXITCODE_INOUT;
1664                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1665                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1666                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1667                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1668                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1669                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1670                 break;
1671         case EXIT_REASON_CPUID:
1672                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1673                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1674                 break;
1675         case EXIT_REASON_EXCEPTION:
1676                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1677                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1678                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1679                     ("VM exit interruption info invalid: %#x", intr_info));
1680
1681                 /*
1682                  * If Virtual NMIs control is 1 and the VM-exit is due to a
1683                  * fault encountered during the execution of IRET then we must
1684                  * restore the state of "virtual-NMI blocking" before resuming
1685                  * the guest.
1686                  *
1687                  * See "Resuming Guest Software after Handling an Exception".
1688                  */
1689                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1690                     (intr_info & 0xff) != IDT_DF &&
1691                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1692                         vmx_restore_nmi_blocking(vmx, vcpu);
1693
1694                 /*
1695                  * The NMI has already been handled in vmx_exit_handle_nmi().
1696                  */
1697                 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1698                         return (1);
1699                 break;
1700         case EXIT_REASON_EPT_FAULT:
1701                 /*
1702                  * If 'gpa' lies within the address space allocated to
1703                  * guest memory then this must be a nested page fault;
1704                  * otherwise it is an instruction that accesses MMIO space.
1705                  */
1706                 gpa = vmcs_gpa();
1707                 if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1708                         vmexit->exitcode = VM_EXITCODE_PAGING;
1709                         vmexit->u.paging.gpa = gpa;
1710                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1711                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1712                 } else if (ept_emulation_fault(qual)) {
1713                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1714                         vmexit->u.inst_emul.gpa = gpa;
1715                         vmexit->u.inst_emul.gla = vmcs_gla();
1716                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1717                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
1718                 }
1719                 /*
1720                  * If Virtual NMIs control is 1 and the VM-exit is due to an
1721                  * EPT fault during the execution of IRET then we must restore
1722                  * the state of "virtual-NMI blocking" before resuming.
1723                  *
1724                  * See description of "NMI unblocking due to IRET" in
1725                  * "Exit Qualification for EPT Violations".
1726                  */
1727                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1728                     (qual & EXIT_QUAL_NMIUDTI) != 0)
1729                         vmx_restore_nmi_blocking(vmx, vcpu);
1730                 break;
1731         case EXIT_REASON_VIRTUALIZED_EOI:
1732                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
1733                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
1734                 vmexit->inst_length = 0;        /* trap-like */
1735                 break;
1736         case EXIT_REASON_APIC_ACCESS:
1737                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1738                 break;
1739         case EXIT_REASON_APIC_WRITE:
1740                 /*
1741                  * APIC-write VM exit is trap-like so the %rip is already
1742                  * pointing to the next instruction.
1743                  */
1744                 vmexit->inst_length = 0;
1745                 vlapic = vm_lapic(vmx->vm, vcpu);
1746                 handled = vmx_handle_apic_write(vlapic, qual);
1747                 break;
1748         default:
1749                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1750                 break;
1751         }
1752
1753         if (handled) {
1754                 /*
1755                  * It is possible that control is returned to userland
1756                  * even though we were able to handle the VM exit in the
1757                  * kernel.
1758                  *
1759                  * In such a case we want to make sure that the userland
1760                  * restarts guest execution at the instruction *after*
1761                  * the one we just processed. Therefore we update the
1762                  * guest rip in the VMCS and in 'vmexit'.
1763                  */
1764                 vmexit->rip += vmexit->inst_length;
1765                 vmexit->inst_length = 0;
1766                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1767         } else {
1768                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1769                         /*
1770                          * If this VM exit was not claimed by anybody then
1771                          * treat it as a generic VMX exit.
1772                          */
1773                         vmexit->exitcode = VM_EXITCODE_VMX;
1774                         vmexit->u.vmx.status = VM_SUCCESS;
1775                         vmexit->u.vmx.inst_type = 0;
1776                         vmexit->u.vmx.inst_error = 0;
1777                 } else {
1778                         /*
1779                          * The exitcode and collateral have been populated.
1780                          * The VM exit will be processed further in userland.
1781                          */
1782                 }
1783         }
1784         return (handled);
1785 }
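
/*
 * A minimal sketch of the "advance past the handled instruction" step at
 * the end of vmx_exit_process() above, assuming the vcpu's VMCS is loaded
 * and that 'rip' and 'inst_length' were captured at VM-exit time; the
 * helper name is hypothetical.
 */
static __inline void
vmx_advance_guest_rip(struct vm_exit *vmexit)
{

        /* Resume the guest at the instruction after the one just handled. */
        vmexit->rip += vmexit->inst_length;
        vmexit->inst_length = 0;
        vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
}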
1786
1787 static __inline int
1788 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1789 {
1790
1791         vmexit->rip = vmcs_guest_rip();
1792         vmexit->inst_length = 0;
1793         vmexit->exitcode = VM_EXITCODE_BOGUS;
1794         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1795         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1796
1797         return (HANDLED);
1798 }
1799
1800 static __inline int
1801 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1802 {
1803
1804         vmexit->rip = vmcs_guest_rip();
1805         vmexit->inst_length = 0;
1806         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1807         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
1808
1809         return (UNHANDLED);
1810 }
1811
1812 static __inline int
1813 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1814 {
1815
1816         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1817             ("vmx_exit_inst_error: invalid inst_fail_status %d",
1818             vmxctx->inst_fail_status));
1819
1820         vmexit->inst_length = 0;
1821         vmexit->exitcode = VM_EXITCODE_VMX;
1822         vmexit->u.vmx.status = vmxctx->inst_fail_status;
1823         vmexit->u.vmx.inst_error = vmcs_instruction_error();
1824         vmexit->u.vmx.exit_reason = ~0;
1825         vmexit->u.vmx.exit_qualification = ~0;
1826
1827         switch (rc) {
1828         case VMX_VMRESUME_ERROR:
1829         case VMX_VMLAUNCH_ERROR:
1830         case VMX_INVEPT_ERROR:
1831                 vmexit->u.vmx.inst_type = rc;
1832                 break;
1833         default:
1834                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
1835         }
1836
1837         return (UNHANDLED);
1838 }
1839
1840 /*
1841  * If the NMI-exiting VM execution control is set to '1' then an NMI in
1842  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
1843  * sufficient to simply vector to the NMI handler via a software interrupt.
1844  * However, this must be done before maskable interrupts are enabled
1845  * otherwise the "iret" issued by an interrupt handler will incorrectly
1846  * clear NMI blocking.
1847  */
1848 static __inline void
1849 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1850 {
1851         uint32_t intr_info;
1852
1853         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
1854
1855         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
1856                 return;
1857
1858         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1859         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1860             ("VM exit interruption info invalid: %#x", intr_info));
1861
1862         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
1863                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
1864                     "to NMI has invalid vector: %#x", intr_info));
1865                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
1866                 __asm __volatile("int $2");
1867         }
1868 }
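
/*
 * Small decoding sketches for the exit interruption information used above,
 * assuming the Intel SDM layout behind the VMCS_INTR_* definitions already
 * relied upon in this file (vector in bits 7:0, type in bits 10:8, valid in
 * bit 31). The helper names are hypothetical.
 */
static __inline int
vmx_intr_info_vector(uint32_t intr_info)
{

        return (intr_info & 0xff);
}

static __inline bool
vmx_intr_info_is_nmi(uint32_t intr_info)
{

        return ((intr_info & VMCS_INTR_VALID) != 0 &&
            (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI);
}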
1869
1870 static int
1871 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
1872     void *rendezvous_cookie)
1873 {
1874         int rc, handled, launched;
1875         struct vmx *vmx;
1876         struct vm *vm;
1877         struct vmxctx *vmxctx;
1878         struct vmcs *vmcs;
1879         struct vm_exit *vmexit;
1880         struct vlapic *vlapic;
1881         uint64_t rip;
1882         uint32_t exit_reason;
1883
1884         vmx = arg;
1885         vm = vmx->vm;
1886         vmcs = &vmx->vmcs[vcpu];
1887         vmxctx = &vmx->ctx[vcpu];
1888         vlapic = vm_lapic(vm, vcpu);
1889         vmexit = vm_exitinfo(vm, vcpu);
1890         launched = 0;
1891
1892         KASSERT(vmxctx->pmap == pmap,
1893             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1894
1895         VMPTRLD(vmcs);
1896
1897         /*
1898          * XXX
1899          * We do this every time because we may setup the virtual machine
1900          * from a different process than the one that actually runs it.
1901          *
1902          * If the life of a virtual machine was spent entirely in the context
1903          * of a single process we could do this once in vmx_vminit().
1904          */
1905         vmcs_write(VMCS_HOST_CR3, rcr3());
1906
1907         vmcs_write(VMCS_GUEST_RIP, startrip);
1908         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
1909         do {
1910                 /*
1911                  * Interrupts are disabled from this point on until the
1912                  * guest starts executing. This is done for the following
1913                  * reasons:
1914                  *
1915                  * If an AST is asserted on this thread after the check below,
1916                  * then the IPI_AST notification will not be lost, because it
1917                  * will cause a VM exit due to external interrupt as soon as
1918                  * the guest state is loaded.
1919                  *
1920                  * A posted interrupt after 'vmx_inject_interrupts()' will
1921                  * not be "lost" because it will be held pending in the host
1922                  * APIC because interrupts are disabled. The pending interrupt
1923                  * will be recognized as soon as the guest state is loaded.
1924                  *
1925                  * The same reasoning applies to the IPI generated by
1926                  * pmap_invalidate_ept().
1927                  */
1928                 disable_intr();
1929                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1930                         enable_intr();
1931                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
1932                         break;
1933                 }
1934
1935                 if (vcpu_rendezvous_pending(rendezvous_cookie)) {
1936                         enable_intr();
1937                         handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
1938                         break;
1939                 }
1940
1941                 vmx_inject_interrupts(vmx, vcpu, vlapic);
1942                 vmx_run_trace(vmx, vcpu);
1943                 rc = vmx_enter_guest(vmxctx, vmx, launched);
1944
1945                 /* Collect some information for VM exit processing */
1946                 vmexit->rip = rip = vmcs_guest_rip();
1947                 vmexit->inst_length = vmexit_instruction_length();
1948                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1949                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1950
1951                 if (rc == VMX_GUEST_VMEXIT) {
1952                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
1953                         enable_intr();
1954                         handled = vmx_exit_process(vmx, vcpu, vmexit);
1955                 } else {
1956                         enable_intr();
1957                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
1958                 }
1959                 launched = 1;
1960                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1961         } while (handled);
1962
1963         /*
1964          * If a VM exit has been handled then the exitcode must be BOGUS.
1965          * If a VM exit is not handled then the exitcode must not be BOGUS.
1966          */
1967         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1968             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1969                 panic("Mismatch between handled (%d) and exitcode (%d)",
1970                       handled, vmexit->exitcode);
1971         }
1972
1973         if (!handled)
1974                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
1975
1976         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
1977             vmexit->exitcode);
1978
1979         VMCLEAR(vmcs);
1980         return (0);
1981 }
1982
1983 static void
1984 vmx_vmcleanup(void *arg)
1985 {
1986         int i, error;
1987         struct vmx *vmx = arg;
1988
1989         if (virtual_interrupt_delivery)
1990                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
1991
1992         for (i = 0; i < VM_MAXCPU; i++)
1993                 vpid_free(vmx->state[i].vpid);
1994
1995         /*
1996          * XXXSMP we also need to clear the VMCS active on the other vcpus.
1997          */
1998         error = vmclear(&vmx->vmcs[0]);
1999         if (error != 0)
2000                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
2001
2002         free(vmx, M_VMX);
2003
2004         return;
2005 }
2006
2007 static register_t *
2008 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2009 {
2010
2011         switch (reg) {
2012         case VM_REG_GUEST_RAX:
2013                 return (&vmxctx->guest_rax);
2014         case VM_REG_GUEST_RBX:
2015                 return (&vmxctx->guest_rbx);
2016         case VM_REG_GUEST_RCX:
2017                 return (&vmxctx->guest_rcx);
2018         case VM_REG_GUEST_RDX:
2019                 return (&vmxctx->guest_rdx);
2020         case VM_REG_GUEST_RSI:
2021                 return (&vmxctx->guest_rsi);
2022         case VM_REG_GUEST_RDI:
2023                 return (&vmxctx->guest_rdi);
2024         case VM_REG_GUEST_RBP:
2025                 return (&vmxctx->guest_rbp);
2026         case VM_REG_GUEST_R8:
2027                 return (&vmxctx->guest_r8);
2028         case VM_REG_GUEST_R9:
2029                 return (&vmxctx->guest_r9);
2030         case VM_REG_GUEST_R10:
2031                 return (&vmxctx->guest_r10);
2032         case VM_REG_GUEST_R11:
2033                 return (&vmxctx->guest_r11);
2034         case VM_REG_GUEST_R12:
2035                 return (&vmxctx->guest_r12);
2036         case VM_REG_GUEST_R13:
2037                 return (&vmxctx->guest_r13);
2038         case VM_REG_GUEST_R14:
2039                 return (&vmxctx->guest_r14);
2040         case VM_REG_GUEST_R15:
2041                 return (&vmxctx->guest_r15);
2042         default:
2043                 break;
2044         }
2045         return (NULL);
2046 }
2047
2048 static int
2049 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2050 {
2051         register_t *regp;
2052
2053         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2054                 *retval = *regp;
2055                 return (0);
2056         } else
2057                 return (EINVAL);
2058 }
2059
2060 static int
2061 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2062 {
2063         register_t *regp;
2064
2065         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2066                 *regp = val;
2067                 return (0);
2068         } else
2069                 return (EINVAL);
2070 }
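
/*
 * A usage sketch for the accessors above, assuming a vcpu that is not
 * currently running so its register file lives in the vmxctx: unknown
 * register identifiers yield EINVAL, known ones round-trip through the
 * saved state. The helper name and the test value are illustrative only.
 */
static __inline int
vmxctx_reg_roundtrip(struct vmxctx *vmxctx)
{
        uint64_t val;
        int error;

        error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, 0x1234);
        if (error == 0)
                error = vmxctx_getreg(vmxctx, VM_REG_GUEST_RAX, &val);
        return (error);
}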
2071
2072 static int
2073 vmx_shadow_reg(int reg)
2074 {
2075         int shreg;
2076
2077         shreg = -1;
2078
2079         switch (reg) {
2080         case VM_REG_GUEST_CR0:
2081                 shreg = VMCS_CR0_SHADOW;
2082                 break;
2083         case VM_REG_GUEST_CR4:
2084                 shreg = VMCS_CR4_SHADOW;
2085                 break;
2086         default:
2087                 break;
2088         }
2089
2090         return (shreg);
2091 }
2092
2093 static int
2094 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2095 {
2096         int running, hostcpu;
2097         struct vmx *vmx = arg;
2098
2099         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2100         if (running && hostcpu != curcpu)
2101                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2102
2103         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2104                 return (0);
2105
2106         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2107 }
2108
2109 static int
2110 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2111 {
2112         int error, hostcpu, running, shadow;
2113         uint64_t ctls;
2114         struct vmx *vmx = arg;
2115
2116         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2117         if (running && hostcpu != curcpu)
2118                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2119
2120         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2121                 return (0);
2122
2123         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2124
2125         if (error == 0) {
2126                 /*
2127                  * If the "load EFER" VM-entry control is 1 then the
2128                  * value of EFER.LMA must be identical to "IA-32e mode guest"
2129                  * bit in the VM-entry control.
2130                  */
2131                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2132                     (reg == VM_REG_GUEST_EFER)) {
2133                         vmcs_getreg(&vmx->vmcs[vcpu], running,
2134                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2135                         if (val & EFER_LMA)
2136                                 ctls |= VM_ENTRY_GUEST_LMA;
2137                         else
2138                                 ctls &= ~VM_ENTRY_GUEST_LMA;
2139                         vmcs_setreg(&vmx->vmcs[vcpu], running,
2140                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2141                 }
2142
2143                 shadow = vmx_shadow_reg(reg);
2144                 if (shadow > 0) {
2145                         /*
2146                          * Store the unmodified value in the shadow.
2147                          */
2148                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2149                                     VMCS_IDENT(shadow), val);
2150                 }
2151         }
2152
2153         return (error);
2154 }
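
/*
 * A minimal sketch of the EFER.LMA / "IA-32e mode guest" coupling handled
 * above, assuming only the EFER_LMA and VM_ENTRY_GUEST_LMA definitions
 * already used in this file; the helper name is hypothetical.
 */
static __inline uint64_t
vmx_entry_ctls_for_efer(uint64_t entry_ctls, uint64_t efer)
{

        /* Keep the VM-entry control in sync with the guest's EFER.LMA. */
        if (efer & EFER_LMA)
                entry_ctls |= VM_ENTRY_GUEST_LMA;
        else
                entry_ctls &= ~VM_ENTRY_GUEST_LMA;
        return (entry_ctls);
}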
2155
2156 static int
2157 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2158 {
2159         struct vmx *vmx = arg;
2160
2161         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2162 }
2163
2164 static int
2165 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2166 {
2167         struct vmx *vmx = arg;
2168
2169         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2170 }
2171
2172 static int
2173 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
2174            int code_valid)
2175 {
2176         int error;
2177         uint64_t info;
2178         struct vmx *vmx = arg;
2179         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2180
2181         static uint32_t type_map[VM_EVENT_MAX] = {
2182                 0x1,            /* VM_EVENT_NONE */
2183                 0x0,            /* VM_HW_INTR */
2184                 0x2,            /* VM_NMI */
2185                 0x3,            /* VM_HW_EXCEPTION */
2186                 0x4,            /* VM_SW_INTR */
2187                 0x5,            /* VM_PRIV_SW_EXCEPTION */
2188                 0x6,            /* VM_SW_EXCEPTION */
2189         };
2190
2191         /*
2192          * If there is already an exception pending to be delivered to the
2193          * vcpu then just return.
2194          */
2195         error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
2196         if (error)
2197                 return (error);
2198
2199         if (info & VMCS_INTR_VALID)
2200                 return (EAGAIN);
2201
2202         info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
2203         info |= VMCS_INTR_VALID;
2204         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
2205         if (error != 0)
2206                 return (error);
2207
2208         if (code_valid) {
2209                 error = vmcs_setreg(vmcs, 0,
2210                                     VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
2211                                     code);
2212         }
2213         return (error);
2214 }
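
/*
 * A layout sketch for the VM-entry interruption information constructed in
 * vmx_inject() above, assuming the Intel SDM encoding (vector in bits 7:0,
 * type in bits 10:8, deliver-error-code in bit 11, valid in bit 31); the
 * helper name is hypothetical.
 */
static __inline uint64_t
vmx_entry_intr_info(int vector, uint32_t type, int code_valid)
{
        uint64_t info;

        info = vector | (type << 8) | (code_valid ? 1 << 11 : 0);
        info |= VMCS_INTR_VALID;
        return (info);
}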
2215
2216 static int
2217 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2218 {
2219         struct vmx *vmx = arg;
2220         int vcap;
2221         int ret;
2222
2223         ret = ENOENT;
2224
2225         vcap = vmx->cap[vcpu].set;
2226
2227         switch (type) {
2228         case VM_CAP_HALT_EXIT:
2229                 if (cap_halt_exit)
2230                         ret = 0;
2231                 break;
2232         case VM_CAP_PAUSE_EXIT:
2233                 if (cap_pause_exit)
2234                         ret = 0;
2235                 break;
2236         case VM_CAP_MTRAP_EXIT:
2237                 if (cap_monitor_trap)
2238                         ret = 0;
2239                 break;
2240         case VM_CAP_UNRESTRICTED_GUEST:
2241                 if (cap_unrestricted_guest)
2242                         ret = 0;
2243                 break;
2244         case VM_CAP_ENABLE_INVPCID:
2245                 if (cap_invpcid)
2246                         ret = 0;
2247                 break;
2248         default:
2249                 break;
2250         }
2251
2252         if (ret == 0)
2253                 *retval = (vcap & (1 << type)) ? 1 : 0;
2254
2255         return (ret);
2256 }
2257
2258 static int
2259 vmx_setcap(void *arg, int vcpu, int type, int val)
2260 {
2261         struct vmx *vmx = arg;
2262         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2263         uint32_t baseval;
2264         uint32_t *pptr;
2265         int error;
2266         int flag;
2267         int reg;
2268         int retval;
2269
2270         retval = ENOENT;
2271         pptr = NULL;
2272
2273         switch (type) {
2274         case VM_CAP_HALT_EXIT:
2275                 if (cap_halt_exit) {
2276                         retval = 0;
2277                         pptr = &vmx->cap[vcpu].proc_ctls;
2278                         baseval = *pptr;
2279                         flag = PROCBASED_HLT_EXITING;
2280                         reg = VMCS_PRI_PROC_BASED_CTLS;
2281                 }
2282                 break;
2283         case VM_CAP_MTRAP_EXIT:
2284                 if (cap_monitor_trap) {
2285                         retval = 0;
2286                         pptr = &vmx->cap[vcpu].proc_ctls;
2287                         baseval = *pptr;
2288                         flag = PROCBASED_MTF;
2289                         reg = VMCS_PRI_PROC_BASED_CTLS;
2290                 }
2291                 break;
2292         case VM_CAP_PAUSE_EXIT:
2293                 if (cap_pause_exit) {
2294                         retval = 0;
2295                         pptr = &vmx->cap[vcpu].proc_ctls;
2296                         baseval = *pptr;
2297                         flag = PROCBASED_PAUSE_EXITING;
2298                         reg = VMCS_PRI_PROC_BASED_CTLS;
2299                 }
2300                 break;
2301         case VM_CAP_UNRESTRICTED_GUEST:
2302                 if (cap_unrestricted_guest) {
2303                         retval = 0;
2304                         pptr = &vmx->cap[vcpu].proc_ctls2;
2305                         baseval = *pptr;
2306                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2307                         reg = VMCS_SEC_PROC_BASED_CTLS;
2308                 }
2309                 break;
2310         case VM_CAP_ENABLE_INVPCID:
2311                 if (cap_invpcid) {
2312                         retval = 0;
2313                         pptr = &vmx->cap[vcpu].proc_ctls2;
2314                         baseval = *pptr;
2315                         flag = PROCBASED2_ENABLE_INVPCID;
2316                         reg = VMCS_SEC_PROC_BASED_CTLS;
2317                 }
2318                 break;
2319         default:
2320                 break;
2321         }
2322
2323         if (retval == 0) {
2324                 if (val) {
2325                         baseval |= flag;
2326                 } else {
2327                         baseval &= ~flag;
2328                 }
2329                 VMPTRLD(vmcs);
2330                 error = vmwrite(reg, baseval);
2331                 VMCLEAR(vmcs);
2332
2333                 if (error) {
2334                         retval = error;
2335                 } else {
2336                         /*
2337                          * Update optional stored flags, and record
2338                          * setting
2339                          */
2340                         if (pptr != NULL) {
2341                                 *pptr = baseval;
2342                         }
2343
2344                         if (val) {
2345                                 vmx->cap[vcpu].set |= (1 << type);
2346                         } else {
2347                                 vmx->cap[vcpu].set &= ~(1 << type);
2348                         }
2349                 }
2350         }
2351
2352         return (retval);
2353 }
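
/*
 * A read-modify-write sketch in the style of vmx_setcap() above, assuming
 * the VMCS must be made current around vmwrite() when the vcpu is not
 * running; the helper name is hypothetical.
 */
static __inline int
vmx_update_ctl(struct vmcs *vmcs, int reg, uint32_t baseval, uint32_t flag,
    int enable)
{
        int error;

        if (enable)
                baseval |= flag;
        else
                baseval &= ~flag;

        /* Load the VMCS, update the control field and unload it again. */
        VMPTRLD(vmcs);
        error = vmwrite(reg, baseval);
        VMCLEAR(vmcs);

        return (error);
}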
2354
2355 struct vlapic_vtx {
2356         struct vlapic   vlapic;
2357         struct pir_desc *pir_desc;
2358         struct vmx      *vmx;
2359 };
2360
2361 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2362 do {                                                                    \
2363         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2364             level ? "level" : "edge", vector);                          \
2365         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2366         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2367         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2368         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2369         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2370 } while (0)
2371
2372 /*
2373  * vlapic->ops handlers that utilize the APICv hardware assist described in
2374  * Chapter 29 of the Intel SDM.
2375  */
2376 static int
2377 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2378 {
2379         struct vlapic_vtx *vlapic_vtx;
2380         struct pir_desc *pir_desc;
2381         uint64_t mask;
2382         int idx, notify;
2383
2384         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2385         pir_desc = vlapic_vtx->pir_desc;
2386
2387         /*
2388          * Keep track of interrupt requests in the PIR descriptor. This is
2389          * because the virtual APIC page pointed to by the VMCS cannot be
2390          * modified if the vcpu is running.
2391          */
2392         idx = vector / 64;
2393         mask = 1UL << (vector % 64);
2394         atomic_set_long(&pir_desc->pir[idx], mask);
2395         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2396
2397         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2398             level, "vmx_set_intr_ready");
2399         return (notify);
2400 }
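
/*
 * A worked sketch of the PIR indexing above, assuming 256 vectors spread
 * across the four 64-bit words of the posted interrupt descriptor: vector
 * 65, for example, lands in pir[1] bit 1. The helper name is hypothetical.
 */
static __inline void
pir_set_vector(struct pir_desc *pir_desc, int vector)
{

        atomic_set_long(&pir_desc->pir[vector / 64], 1UL << (vector % 64));
}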
2401
2402 static int
2403 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2404 {
2405         struct vlapic_vtx *vlapic_vtx;
2406         struct pir_desc *pir_desc;
2407         struct LAPIC *lapic;
2408         uint64_t pending, pirval;
2409         uint32_t ppr, vpr;
2410         int i;
2411
2412         /*
2413          * This function is only expected to be called from the 'HLT' exit
2414          * handler which does not care about the vector that is pending.
2415          */
2416         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2417
2418         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2419         pir_desc = vlapic_vtx->pir_desc;
2420
2421         pending = atomic_load_acq_long(&pir_desc->pending);
2422         if (!pending)
2423                 return (0);     /* common case */
2424
2425         /*
2426          * If there is an interrupt pending then it will be recognized only
2427          * if its priority is greater than the processor priority.
2428          *
2429          * Special case: if the processor priority is zero then any pending
2430          * interrupt will be recognized.
2431          */
2432         lapic = vlapic->apic_page;
2433         ppr = lapic->ppr & 0xf0;
2434         if (ppr == 0)
2435                 return (1);
2436
2437         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2438             lapic->ppr);
2439
2440         for (i = 3; i >= 0; i--) {
2441                 pirval = pir_desc->pir[i];
2442                 if (pirval != 0) {
2443                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2444                         return (vpr > ppr);
2445                 }
2446         }
2447         return (0);
2448 }
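
/*
 * A small sketch of the priority comparison above, assuming flsl() returns
 * the 1-based index of the highest set bit: the highest pending vector is
 * found by scanning the PIR words from the top, and its upper nibble is the
 * priority class compared against the PPR. The helper name is hypothetical.
 */
static __inline int
pir_highest_vector(struct pir_desc *pir_desc)
{
        uint64_t pirval;
        int i;

        for (i = 3; i >= 0; i--) {
                pirval = pir_desc->pir[i];
                if (pirval != 0)
                        return (i * 64 + flsl(pirval) - 1);
        }
        return (-1);
}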
2449
2450 static void
2451 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2452 {
2453
2454         panic("vmx_intr_accepted: not expected to be called");
2455 }
2456
2457 static void
2458 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2459 {
2460         struct vlapic_vtx *vlapic_vtx;
2461         struct vmx *vmx;
2462         struct vmcs *vmcs;
2463         uint64_t mask, val;
2464
2465         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2466         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2467             ("vmx_set_tmr: vcpu cannot be running"));
2468
2469         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2470         vmx = vlapic_vtx->vmx;
2471         vmcs = &vmx->vmcs[vlapic->vcpuid];
2472         mask = 1UL << (vector % 64);
2473
2474         VMPTRLD(vmcs);
2475         val = vmcs_read(VMCS_EOI_EXIT(vector));
2476         if (level)
2477                 val |= mask;
2478         else
2479                 val &= ~mask;
2480         vmcs_write(VMCS_EOI_EXIT(vector), val);
2481         VMCLEAR(vmcs);
2482 }
2483
2484 static void
2485 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2486 {
2487
2488         ipi_cpu(hostcpu, pirvec);
2489 }
2490
2491 /*
2492  * Transfer the pending interrupts in the PIR descriptor to the IRR
2493  * in the virtual APIC page.
2494  */
2495 static void
2496 vmx_inject_pir(struct vlapic *vlapic)
2497 {
2498         struct vlapic_vtx *vlapic_vtx;
2499         struct pir_desc *pir_desc;
2500         struct LAPIC *lapic;
2501         uint64_t val, pirval;
2502         int rvi, pirbase;
2503         uint16_t intr_status_old, intr_status_new;
2504
2505         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2506         pir_desc = vlapic_vtx->pir_desc;
2507         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2508                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2509                     "no posted interrupt pending");
2510                 return;
2511         }
2512
2513         pirval = 0;
2514         lapic = vlapic->apic_page;
2515
2516         val = atomic_readandclear_long(&pir_desc->pir[0]);
2517         if (val != 0) {
2518                 lapic->irr0 |= val;
2519                 lapic->irr1 |= val >> 32;
2520                 pirbase = 0;
2521                 pirval = val;
2522         }
2523
2524         val = atomic_readandclear_long(&pir_desc->pir[1]);
2525         if (val != 0) {
2526                 lapic->irr2 |= val;
2527                 lapic->irr3 |= val >> 32;
2528                 pirbase = 64;
2529                 pirval = val;
2530         }
2531
2532         val = atomic_readandclear_long(&pir_desc->pir[2]);
2533         if (val != 0) {
2534                 lapic->irr4 |= val;
2535                 lapic->irr5 |= val >> 32;
2536                 pirbase = 128;
2537                 pirval = val;
2538         }
2539
2540         val = atomic_readandclear_long(&pir_desc->pir[3]);
2541         if (val != 0) {
2542                 lapic->irr6 |= val;
2543                 lapic->irr7 |= val >> 32;
2544                 pirbase = 192;
2545                 pirval = val;
2546         }
2547         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2548
2549         /*
2550          * Update RVI so the processor can evaluate pending virtual
2551          * interrupts on VM-entry.
2552          */
2553         if (pirval != 0) {
2554                 rvi = pirbase + flsl(pirval) - 1;
2555                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2556                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2557                 if (intr_status_new > intr_status_old) {
2558                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2559                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2560                             "guest_intr_status changed from 0x%04x to 0x%04x",
2561                             intr_status_old, intr_status_new);
2562                 }
2563         }
2564 }
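
/*
 * A minimal sketch of the RVI update above, assuming the low byte of the
 * guest interrupt status holds the requesting virtual interrupt and that a
 * larger vector means a higher-priority pending interrupt; the helper name
 * is hypothetical.
 */
static __inline uint16_t
vmx_merge_rvi(uint16_t intr_status, int rvi)
{

        /* Only raise RVI; never lower it below an already pending vector. */
        if ((intr_status & 0xFF) < rvi)
                intr_status = (intr_status & 0xFF00) | rvi;
        return (intr_status);
}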
2565
2566 static struct vlapic *
2567 vmx_vlapic_init(void *arg, int vcpuid)
2568 {
2569         struct vmx *vmx;
2570         struct vlapic *vlapic;
2571         struct vlapic_vtx *vlapic_vtx;
2572         
2573         vmx = arg;
2574
2575         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2576         vlapic->vm = vmx->vm;
2577         vlapic->vcpuid = vcpuid;
2578         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2579
2580         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2581         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2582         vlapic_vtx->vmx = vmx;
2583
2584         if (virtual_interrupt_delivery) {
2585                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2586                 vlapic->ops.pending_intr = vmx_pending_intr;
2587                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2588                 vlapic->ops.set_tmr = vmx_set_tmr;
2589         }
2590
2591         if (posted_interrupts)
2592                 vlapic->ops.post_intr = vmx_post_intr;
2593
2594         vlapic_init(vlapic);
2595
2596         return (vlapic);
2597 }
2598
2599 static void
2600 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2601 {
2602
2603         vlapic_cleanup(vlapic);
2604         free(vlapic, M_VLAPIC);
2605 }
2606
2607 struct vmm_ops vmm_ops_intel = {
2608         vmx_init,
2609         vmx_cleanup,
2610         vmx_restore,
2611         vmx_vminit,
2612         vmx_run,
2613         vmx_vmcleanup,
2614         vmx_getreg,
2615         vmx_setreg,
2616         vmx_getdesc,
2617         vmx_setdesc,
2618         vmx_inject,
2619         vmx_getcap,
2620         vmx_setcap,
2621         ept_vmspace_alloc,
2622         ept_vmspace_free,
2623         vmx_vlapic_init,
2624         vmx_vlapic_cleanup,
2625 };