1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include "vmm_host.h"
54 #include "vmm_ipi.h"
55 #include "vmm_msr.h"
56 #include "vmm_ktr.h"
57 #include "vmm_stat.h"
58 #include "vlapic.h"
59 #include "vlapic_priv.h"
60
61 #include "vmx_msr.h"
62 #include "ept.h"
63 #include "vmx_cpufunc.h"
64 #include "vmx.h"
65 #include "x86.h"
66 #include "vmx_controls.h"
67
68 #define PINBASED_CTLS_ONE_SETTING                                       \
69         (PINBASED_EXTINT_EXITING        |                               \
70          PINBASED_NMI_EXITING           |                               \
71          PINBASED_VIRTUAL_NMI)
72 #define PINBASED_CTLS_ZERO_SETTING      0
73
74 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
75         (PROCBASED_INT_WINDOW_EXITING   |                               \
76          PROCBASED_NMI_WINDOW_EXITING)
77
78 #define PROCBASED_CTLS_ONE_SETTING                                      \
79         (PROCBASED_SECONDARY_CONTROLS   |                               \
80          PROCBASED_IO_EXITING           |                               \
81          PROCBASED_MSR_BITMAPS          |                               \
82          PROCBASED_CTLS_WINDOW_SETTING)
83 #define PROCBASED_CTLS_ZERO_SETTING     \
84         (PROCBASED_CR3_LOAD_EXITING |   \
85         PROCBASED_CR3_STORE_EXITING |   \
86         PROCBASED_IO_BITMAPS)
87
88 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
89 #define PROCBASED_CTLS2_ZERO_SETTING    0
90
91 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
92         (VM_EXIT_HOST_LMA                       |                       \
93         VM_EXIT_SAVE_EFER                       |                       \
94         VM_EXIT_LOAD_EFER)
95
96 #define VM_EXIT_CTLS_ONE_SETTING                                        \
97         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
98         VM_EXIT_ACKNOWLEDGE_INTERRUPT           |                       \
99         VM_EXIT_SAVE_PAT                        |                       \
100         VM_EXIT_LOAD_PAT)
101 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
102
103 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
104
105 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
106         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
107         VM_ENTRY_LOAD_PAT)
108 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
109         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
110         VM_ENTRY_INTO_SMM                       |                       \
111         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
112
113 #define guest_msr_rw(vmx, msr) \
114         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
115
116 #define HANDLED         1
117 #define UNHANDLED       0
118
119 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
120 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
121
122 SYSCTL_DECL(_hw_vmm);
123 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
124
125 int vmxon_enabled[MAXCPU];
126 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
127
128 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
129 static uint32_t exit_ctls, entry_ctls;
130
131 static uint64_t cr0_ones_mask, cr0_zeros_mask;
132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
133              &cr0_ones_mask, 0, NULL);
134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
135              &cr0_zeros_mask, 0, NULL);
136
137 static uint64_t cr4_ones_mask, cr4_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
139              &cr4_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
141              &cr4_zeros_mask, 0, NULL);
142
143 static int vmx_no_patmsr;
144
145 static int vmx_initialized;
146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
147            &vmx_initialized, 0, "Intel VMX initialized");
148
149 /*
150  * Optional capabilities
151  */
152 static int cap_halt_exit;
153 static int cap_pause_exit;
154 static int cap_unrestricted_guest;
155 static int cap_monitor_trap;
156 static int cap_invpcid;
157
158 static int virtual_interrupt_delivery;
159 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
160     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
161
162 static int posted_interrupts;
163 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
164     &posted_interrupts, 0, "APICv posted interrupt support");
165
166 static int pirvec;
167 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
168     &pirvec, 0, "APICv posted interrupt vector");
169
170 static struct unrhdr *vpid_unr;
171 static u_int vpid_alloc_failed;
172 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
173             &vpid_alloc_failed, 0, NULL);
174
175 /*
176  * Use the last page below 4GB as the APIC access address. This address is
177  * occupied by the boot firmware so it is guaranteed that it will not conflict
178  * with a page in system memory.
179  */
180 #define APIC_ACCESS_ADDRESS     0xFFFFF000
181
182 static void vmx_inject_pir(struct vlapic *vlapic);
183
184 #ifdef KTR
185 static const char *
186 exit_reason_to_str(int reason)
187 {
188         static char reasonbuf[32];
189
190         switch (reason) {
191         case EXIT_REASON_EXCEPTION:
192                 return "exception";
193         case EXIT_REASON_EXT_INTR:
194                 return "extint";
195         case EXIT_REASON_TRIPLE_FAULT:
196                 return "triplefault";
197         case EXIT_REASON_INIT:
198                 return "init";
199         case EXIT_REASON_SIPI:
200                 return "sipi";
201         case EXIT_REASON_IO_SMI:
202                 return "iosmi";
203         case EXIT_REASON_SMI:
204                 return "smi";
205         case EXIT_REASON_INTR_WINDOW:
206                 return "intrwindow";
207         case EXIT_REASON_NMI_WINDOW:
208                 return "nmiwindow";
209         case EXIT_REASON_TASK_SWITCH:
210                 return "taskswitch";
211         case EXIT_REASON_CPUID:
212                 return "cpuid";
213         case EXIT_REASON_GETSEC:
214                 return "getsec";
215         case EXIT_REASON_HLT:
216                 return "hlt";
217         case EXIT_REASON_INVD:
218                 return "invd";
219         case EXIT_REASON_INVLPG:
220                 return "invlpg";
221         case EXIT_REASON_RDPMC:
222                 return "rdpmc";
223         case EXIT_REASON_RDTSC:
224                 return "rdtsc";
225         case EXIT_REASON_RSM:
226                 return "rsm";
227         case EXIT_REASON_VMCALL:
228                 return "vmcall";
229         case EXIT_REASON_VMCLEAR:
230                 return "vmclear";
231         case EXIT_REASON_VMLAUNCH:
232                 return "vmlaunch";
233         case EXIT_REASON_VMPTRLD:
234                 return "vmptrld";
235         case EXIT_REASON_VMPTRST:
236                 return "vmptrst";
237         case EXIT_REASON_VMREAD:
238                 return "vmread";
239         case EXIT_REASON_VMRESUME:
240                 return "vmresume";
241         case EXIT_REASON_VMWRITE:
242                 return "vmwrite";
243         case EXIT_REASON_VMXOFF:
244                 return "vmxoff";
245         case EXIT_REASON_VMXON:
246                 return "vmxon";
247         case EXIT_REASON_CR_ACCESS:
248                 return "craccess";
249         case EXIT_REASON_DR_ACCESS:
250                 return "draccess";
251         case EXIT_REASON_INOUT:
252                 return "inout";
253         case EXIT_REASON_RDMSR:
254                 return "rdmsr";
255         case EXIT_REASON_WRMSR:
256                 return "wrmsr";
257         case EXIT_REASON_INVAL_VMCS:
258                 return "invalvmcs";
259         case EXIT_REASON_INVAL_MSR:
260                 return "invalmsr";
261         case EXIT_REASON_MWAIT:
262                 return "mwait";
263         case EXIT_REASON_MTF:
264                 return "mtf";
265         case EXIT_REASON_MONITOR:
266                 return "monitor";
267         case EXIT_REASON_PAUSE:
268                 return "pause";
269         case EXIT_REASON_MCE:
270                 return "mce";
271         case EXIT_REASON_TPR:
272                 return "tpr";
273         case EXIT_REASON_APIC_ACCESS:
274                 return "apic-access";
275         case EXIT_REASON_GDTR_IDTR:
276                 return "gdtridtr";
277         case EXIT_REASON_LDTR_TR:
278                 return "ldtrtr";
279         case EXIT_REASON_EPT_FAULT:
280                 return "eptfault";
281         case EXIT_REASON_EPT_MISCONFIG:
282                 return "eptmisconfig";
283         case EXIT_REASON_INVEPT:
284                 return "invept";
285         case EXIT_REASON_RDTSCP:
286                 return "rdtscp";
287         case EXIT_REASON_VMX_PREEMPT:
288                 return "vmxpreempt";
289         case EXIT_REASON_INVVPID:
290                 return "invvpid";
291         case EXIT_REASON_WBINVD:
292                 return "wbinvd";
293         case EXIT_REASON_XSETBV:
294                 return "xsetbv";
295         case EXIT_REASON_APIC_WRITE:
296                 return "apic-write";
297         default:
298                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
299                 return (reasonbuf);
300         }
301 }
302 #endif  /* KTR */
303
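/*
 * Apply the CR0 (and, below, CR4) fixed-bit masks computed in vmx_init()
 * from the MSR_VMX_CR{0,4}_FIXED{0,1} MSRs so that the resulting value is
 * acceptable in VMX operation.
 */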
304 u_long
305 vmx_fix_cr0(u_long cr0)
306 {
307
308         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
309 }
310
311 u_long
312 vmx_fix_cr4(u_long cr4)
313 {
314
315         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
316 }
317
318 static void
319 vpid_free(int vpid)
320 {
321         if (vpid < 0 || vpid > 0xffff)
322                 panic("vpid_free: invalid vpid %d", vpid);
323
324         /*
325          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
326          * the unit number allocator.
327          */
328
329         if (vpid > VM_MAXCPU)
330                 free_unr(vpid_unr, vpid);
331 }
332
333 static void
334 vpid_alloc(uint16_t *vpid, int num)
335 {
336         int i, x;
337
338         if (num <= 0 || num > VM_MAXCPU)
339                 panic("invalid number of vpids requested: %d", num);
340
341         /*
342          * If the "enable vpid" execution control is not enabled then the
343          * VPID is required to be 0 for all vcpus.
344          */
345         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
346                 for (i = 0; i < num; i++)
347                         vpid[i] = 0;
348                 return;
349         }
350
351         /*
352          * Allocate a unique VPID for each vcpu from the unit number allocator.
353          */
354         for (i = 0; i < num; i++) {
355                 x = alloc_unr(vpid_unr);
356                 if (x == -1)
357                         break;
358                 else
359                         vpid[i] = x;
360         }
361
362         if (i < num) {
363                 atomic_add_int(&vpid_alloc_failed, 1);
364
365                 /*
366                  * If the unit number allocator does not have enough unique
367                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
368                  *
369                  * These VPIDs are not unique across VMs but this does not
370                  * affect correctness because the combined mappings are also
371                  * tagged with the EP4TA which is unique for each VM.
372                  *
373                  * It is still sub-optimal because the invvpid will invalidate
374                  * combined mappings for a particular VPID across all EP4TAs.
375                  */
376                 while (i-- > 0)
377                         vpid_free(vpid[i]);
378
379                 for (i = 0; i < num; i++)
380                         vpid[i] = i + 1;
381         }
382 }
383
384 static void
385 vpid_init(void)
386 {
387         /*
388          * VPID 0 is required when the "enable VPID" execution control is
389          * disabled.
390          *
391          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
392          * unit number allocator does not have sufficient unique VPIDs to
393          * satisfy the allocation.
394          *
395          * The remaining VPIDs are managed by the unit number allocator.
396          */
397         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
398 }
399
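/*
 * Populate the guest MSR save area for a vcpu. The MSRs listed in
 * guest_msrs[] (currently only MSR_KGSBASE) are saved and restored across
 * VM exits and entries via the save area that vmcs_set_msr_save() later
 * installs in the VMCS.
 */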
400 static void
401 msr_save_area_init(struct msr_entry *g_area, int *g_count)
402 {
403         int cnt;
404
405         static struct msr_entry guest_msrs[] = {
406                 { MSR_KGSBASE, 0, 0 },
407         };
408
409         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
410         if (cnt > GUEST_MSR_MAX_ENTRIES)
411                 panic("guest msr save area overrun");
412         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
413         *g_count = cnt;
414 }
415
416 static void
417 vmx_disable(void *arg __unused)
418 {
419         struct invvpid_desc invvpid_desc = { 0 };
420         struct invept_desc invept_desc = { 0 };
421
422         if (vmxon_enabled[curcpu]) {
423                 /*
424                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
425                  *
426                  * VMXON and VMXOFF are not required to invalidate any TLB
427                  * caching structures, so invalidate them explicitly here to
428                  * prevent retention of cached information between VMX episodes.
429                  */
430                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
431                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
432                 vmxoff();
433         }
434         load_cr4(rcr4() & ~CR4_VMXE);
435 }
436
437 static int
438 vmx_cleanup(void)
439 {
440         
441         if (pirvec != 0)
442                 vmm_ipi_free(pirvec);
443
444         if (vpid_unr != NULL) {
445                 delete_unrhdr(vpid_unr);
446                 vpid_unr = NULL;
447         }
448
449         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
450
451         return (0);
452 }
453
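/*
 * Per-cpu counterpart of vmx_disable(): set CR4.VMXE, write the VMCS
 * revision identifier into this cpu's VMXON region and enter VMX root
 * operation with VMXON.
 */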
454 static void
455 vmx_enable(void *arg __unused)
456 {
457         int error;
458
459         load_cr4(rcr4() | CR4_VMXE);
460
461         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
462         error = vmxon(vmxon_region[curcpu]);
463         if (error == 0)
464                 vmxon_enabled[curcpu] = 1;
465 }
466
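/*
 * Re-execute VMXON on a cpu where VMX operation was previously enabled
 * (likely used when restoring state after a host suspend/resume; the
 * caller is outside this file).
 */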
467 static void
468 vmx_restore(void)
469 {
470
471         if (vmxon_enabled[curcpu])
472                 vmxon(vmxon_region[curcpu]);
473 }
474
475 static int
476 vmx_init(int ipinum)
477 {
478         int error, use_tpr_shadow;
479         uint64_t fixed0, fixed1, feature_control;
480         uint32_t tmp, procbased2_vid_bits;
481
482         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
483         if (!(cpu_feature2 & CPUID2_VMX)) {
484                 printf("vmx_init: processor does not support VMX operation\n");
485                 return (ENXIO);
486         }
487
488         /*
489          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
490          * are set (bits 0 and 2 respectively).
491          */
492         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
493         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
494             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
495                 printf("vmx_init: VMX operation disabled by BIOS\n");
496                 return (ENXIO);
497         }
498
499         /* Check support for primary processor-based VM-execution controls */
500         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
501                                MSR_VMX_TRUE_PROCBASED_CTLS,
502                                PROCBASED_CTLS_ONE_SETTING,
503                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
504         if (error) {
505                 printf("vmx_init: processor does not support desired primary "
506                        "processor-based controls\n");
507                 return (error);
508         }
509
510         /* Clear the processor-based ctl bits that are set on demand */
511         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
512
513         /* Check support for secondary processor-based VM-execution controls */
514         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
515                                MSR_VMX_PROCBASED_CTLS2,
516                                PROCBASED_CTLS2_ONE_SETTING,
517                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
518         if (error) {
519                 printf("vmx_init: processor does not support desired secondary "
520                        "processor-based controls\n");
521                 return (error);
522         }
523
524         /* Check support for VPID */
525         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
526                                PROCBASED2_ENABLE_VPID, 0, &tmp);
527         if (error == 0)
528                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
529
530         /* Check support for pin-based VM-execution controls */
531         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
532                                MSR_VMX_TRUE_PINBASED_CTLS,
533                                PINBASED_CTLS_ONE_SETTING,
534                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
535         if (error) {
536                 printf("vmx_init: processor does not support desired "
537                        "pin-based controls\n");
538                 return (error);
539         }
540
541         /* Check support for VM-exit controls */
542         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
543                                VM_EXIT_CTLS_ONE_SETTING,
544                                VM_EXIT_CTLS_ZERO_SETTING,
545                                &exit_ctls);
546         if (error) {
547                 /* Try again without the PAT MSR bits */
548                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
549                                        MSR_VMX_TRUE_EXIT_CTLS,
550                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
551                                        VM_EXIT_CTLS_ZERO_SETTING,
552                                        &exit_ctls);
553                 if (error) {
554                         printf("vmx_init: processor does not support desired "
555                                "exit controls\n");
556                         return (error);
557                 } else {
558                         if (bootverbose)
559                                 printf("vmm: PAT MSR access not supported\n");
560                         guest_msr_valid(MSR_PAT);
561                         vmx_no_patmsr = 1;
562                 }
563         }
564
565         /* Check support for VM-entry controls */
566         if (!vmx_no_patmsr) {
567                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
568                                        MSR_VMX_TRUE_ENTRY_CTLS,
569                                        VM_ENTRY_CTLS_ONE_SETTING,
570                                        VM_ENTRY_CTLS_ZERO_SETTING,
571                                        &entry_ctls);
572         } else {
573                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
574                                        MSR_VMX_TRUE_ENTRY_CTLS,
575                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
576                                        VM_ENTRY_CTLS_ZERO_SETTING,
577                                        &entry_ctls);
578         }
579
580         if (error) {
581                 printf("vmx_init: processor does not support desired "
582                        "entry controls\n");
583                 return (error);
584         }
585
586         /*
587          * Check support for optional features by testing them
588          * as individual bits
589          */
590         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
591                                         MSR_VMX_TRUE_PROCBASED_CTLS,
592                                         PROCBASED_HLT_EXITING, 0,
593                                         &tmp) == 0);
594
595         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
596                                         MSR_VMX_PROCBASED_CTLS,
597                                         PROCBASED_MTF, 0,
598                                         &tmp) == 0);
599
600         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
601                                          MSR_VMX_TRUE_PROCBASED_CTLS,
602                                          PROCBASED_PAUSE_EXITING, 0,
603                                          &tmp) == 0);
604
605         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
606                                         MSR_VMX_PROCBASED_CTLS2,
607                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
608                                         &tmp) == 0);
609
610         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
611             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
612             &tmp) == 0);
613
614         /*
615          * Check support for virtual interrupt delivery.
616          */
617         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
618             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
619             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
620             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
621
622         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
623             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
624             &tmp) == 0);
625
626         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
627             procbased2_vid_bits, 0, &tmp);
628         if (error == 0 && use_tpr_shadow) {
629                 virtual_interrupt_delivery = 1;
630                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
631                     &virtual_interrupt_delivery);
632         }
633
634         if (virtual_interrupt_delivery) {
635                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
636                 procbased_ctls2 |= procbased2_vid_bits;
637                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
638
639                 /*
640                  * Check for Posted Interrupts only if Virtual Interrupt
641                  * Delivery is enabled.
642                  */
643                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
644                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
645                     &tmp);
646                 if (error == 0) {
647                         pirvec = vmm_ipi_alloc();
648                         if (pirvec == 0) {
649                                 if (bootverbose) {
650                                         printf("vmx_init: unable to allocate "
651                                             "posted interrupt vector\n");
652                                 }
653                         } else {
654                                 posted_interrupts = 1;
655                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
656                                     &posted_interrupts);
657                         }
658                 }
659         }
660
661         if (posted_interrupts)
662                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
663
664         /* Initialize EPT */
665         error = ept_init(ipinum);
666         if (error) {
667                 printf("vmx_init: ept initialization failed (%d)\n", error);
668                 return (error);
669         }
670
671         /*
672          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
673          */
674         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
675         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
676         cr0_ones_mask = fixed0 & fixed1;
677         cr0_zeros_mask = ~fixed0 & ~fixed1;
678
679         /*
680          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
681          * if unrestricted guest execution is allowed.
682          */
683         if (cap_unrestricted_guest)
684                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
685
686         /*
687          * Do not allow the guest to set CR0_NW or CR0_CD.
688          */
689         cr0_zeros_mask |= (CR0_NW | CR0_CD);
690
691         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
692         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
693         cr4_ones_mask = fixed0 & fixed1;
694         cr4_zeros_mask = ~fixed0 & ~fixed1;
695
696         vpid_init();
697
698         /* enable VMX operation */
699         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
700
701         vmx_initialized = 1;
702
703         return (0);
704 }
705
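/*
 * Deliver host interrupt 'vector' by looking up its IDT gate, sanity
 * checking the gate descriptor and calling the handler directly via
 * vmx_call_isr().
 */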
706 static void
707 vmx_trigger_hostintr(int vector)
708 {
709         uintptr_t func;
710         struct gate_descriptor *gd;
711
712         gd = &idt[vector];
713
714         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
715             "invalid vector %d", vector));
716         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
717             vector));
718         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
719             "has invalid type %d", vector, gd->gd_type));
720         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
721             "has invalid dpl %d", vector, gd->gd_dpl));
722         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
723             "for vector %d has invalid selector %d", vector, gd->gd_selector));
724         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
725             "IST %d", vector, gd->gd_ist));
726
727         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
728         vmx_call_isr(func);
729 }
730
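/*
 * Program the CR0/CR4 guest/host mask and read shadow in the VMCS. Bits
 * covered by cr{0,4}_ones_mask/cr{0,4}_zeros_mask are host-owned, so guest
 * accesses to them are virtualized through the read shadow, which is
 * initialized to 'initial'.
 */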
731 static int
732 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
733 {
734         int error, mask_ident, shadow_ident;
735         uint64_t mask_value;
736
737         if (which != 0 && which != 4)
738                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
739
740         if (which == 0) {
741                 mask_ident = VMCS_CR0_MASK;
742                 mask_value = cr0_ones_mask | cr0_zeros_mask;
743                 shadow_ident = VMCS_CR0_SHADOW;
744         } else {
745                 mask_ident = VMCS_CR4_MASK;
746                 mask_value = cr4_ones_mask | cr4_zeros_mask;
747                 shadow_ident = VMCS_CR4_SHADOW;
748         }
749
750         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
751         if (error)
752                 return (error);
753
754         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
755         if (error)
756                 return (error);
757
758         return (0);
759 }
760 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
761 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
762
763 static void *
764 vmx_vminit(struct vm *vm, pmap_t pmap)
765 {
766         uint16_t vpid[VM_MAXCPU];
767         int i, error, guest_msr_count;
768         struct vmx *vmx;
769         struct vmcs *vmcs;
770
771         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
772         if ((uintptr_t)vmx & PAGE_MASK) {
773                 panic("malloc of struct vmx not aligned on %d byte boundary",
774                       PAGE_SIZE);
775         }
776         vmx->vm = vm;
777
778         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
779
780         /*
781          * Clean up EPTP-tagged guest physical and combined mappings
782          *
783          * VMX transitions are not required to invalidate any guest physical
784          * mappings. So, it may be possible for stale guest physical mappings
785          * to be present in the processor TLBs.
786          *
787          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
788          */
789         ept_invalidate_mappings(vmx->eptp);
790
791         msr_bitmap_initialize(vmx->msr_bitmap);
792
793         /*
794          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
795          * The guest FSBASE and GSBASE are saved and restored during
796          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
797          * always restored from the vmcs host state area on vm-exit.
798          *
799          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
800          * how they are saved/restored so can be directly accessed by the
801          * how they are saved/restored, so they can be directly accessed
802          * by the guest.
803          * Guest KGSBASE is saved and restored in the guest MSR save area.
804          * Host KGSBASE is restored before returning to userland from the pcb.
805          * There will be a window of time when we are executing in the host
806          * kernel context with a value of KGSBASE from the guest. This is ok
807          * because the value of KGSBASE is inconsequential in kernel context.
808          *
809          * MSR_EFER is saved and restored in the guest VMCS area on a
810          * VM exit and entry respectively. It is also restored from the
811          * host VMCS area on a VM exit.
812          */
813         if (guest_msr_rw(vmx, MSR_GSBASE) ||
814             guest_msr_rw(vmx, MSR_FSBASE) ||
815             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
816             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
817             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
818             guest_msr_rw(vmx, MSR_KGSBASE) ||
819             guest_msr_rw(vmx, MSR_EFER))
820                 panic("vmx_vminit: error setting guest msr access");
821
822         /*
823          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
824          * and entry respectively. It is also restored from the host VMCS
825          * area on a VM exit. However, if running on a system with no
826          * MSR_PAT save/restore support, leave access disabled so accesses
827          * will be trapped.
828          */
829         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
830                 panic("vmx_vminit: error setting guest pat msr access");
831
832         vpid_alloc(vpid, VM_MAXCPU);
833
834         if (virtual_interrupt_delivery) {
835                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
836                     APIC_ACCESS_ADDRESS);
837                 /* XXX this should really return an error to the caller */
838                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
839         }
840
841         for (i = 0; i < VM_MAXCPU; i++) {
842                 vmcs = &vmx->vmcs[i];
843                 vmcs->identifier = vmx_revision();
844                 error = vmclear(vmcs);
845                 if (error != 0) {
846                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
847                               error, i);
848                 }
849
850                 error = vmcs_init(vmcs);
851                 KASSERT(error == 0, ("vmcs_init error %d", error));
852
853                 VMPTRLD(vmcs);
854                 error = 0;
855                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
856                 error += vmwrite(VMCS_EPTP, vmx->eptp);
857                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
858                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
859                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
860                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
861                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
862                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
863                 error += vmwrite(VMCS_VPID, vpid[i]);
864                 if (virtual_interrupt_delivery) {
865                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
866                         error += vmwrite(VMCS_VIRTUAL_APIC,
867                             vtophys(&vmx->apic_page[i]));
868                         error += vmwrite(VMCS_EOI_EXIT0, 0);
869                         error += vmwrite(VMCS_EOI_EXIT1, 0);
870                         error += vmwrite(VMCS_EOI_EXIT2, 0);
871                         error += vmwrite(VMCS_EOI_EXIT3, 0);
872                 }
873                 if (posted_interrupts) {
874                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
875                         error += vmwrite(VMCS_PIR_DESC,
876                             vtophys(&vmx->pir_desc[i]));
877                 }
878                 VMCLEAR(vmcs);
879                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
880
881                 vmx->cap[i].set = 0;
882                 vmx->cap[i].proc_ctls = procbased_ctls;
883                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
884
885                 vmx->state[i].lastcpu = -1;
886                 vmx->state[i].vpid = vpid[i];
887                 vmx->state[i].user_event.intr_info = 0;
888
889                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
890
891                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
892                     guest_msr_count);
893                 if (error != 0)
894                         panic("vmcs_set_msr_save error %d", error);
895
896                 /*
897                  * Set up the CR0/4 shadows, and init the read shadow
898                  * to the power-on register value from the Intel Sys Arch.
899                  *  CR0 - 0x60000010
900                  *  CR4 - 0
901                  */
902                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
903                 if (error != 0)
904                         panic("vmx_setup_cr0_shadow %d", error);
905
906                 error = vmx_setup_cr4_shadow(vmcs, 0);
907                 if (error != 0)
908                         panic("vmx_setup_cr4_shadow %d", error);
909
910                 vmx->ctx[i].pmap = pmap;
911         }
912
913         return (vmx);
914 }
915
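/*
 * Emulate the CPUID instruction by handing the guest's %rax/%rbx/%rcx/%rdx
 * to x86_emulate_cpuid(), which rewrites them with the virtualized leaf.
 */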
916 static int
917 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
918 {
919         int handled, func;
920         
921         func = vmxctx->guest_rax;
922
923         handled = x86_emulate_cpuid(vm, vcpu,
924                                     (uint32_t*)(&vmxctx->guest_rax),
925                                     (uint32_t*)(&vmxctx->guest_rbx),
926                                     (uint32_t*)(&vmxctx->guest_rcx),
927                                     (uint32_t*)(&vmxctx->guest_rdx));
928         return (handled);
929 }
930
931 static __inline void
932 vmx_run_trace(struct vmx *vmx, int vcpu)
933 {
934 #ifdef KTR
935         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
936 #endif
937 }
938
939 static __inline void
940 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
941                int handled)
942 {
943 #ifdef KTR
944         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
945                  handled ? "handled" : "unhandled",
946                  exit_reason_to_str(exit_reason), rip);
947 #endif
948 }
949
950 static __inline void
951 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
952 {
953 #ifdef KTR
954         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
955 #endif
956 }
957
958 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
959
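/*
 * Refresh host state in the VMCS (TR, GDTR and GS bases) and flush stale
 * VPID-tagged TLB entries when a vcpu runs on a different host cpu than it
 * did the last time it ran.
 */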
960 static void
961 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
962 {
963         struct vmxstate *vmxstate;
964         struct invvpid_desc invvpid_desc;
965
966         vmxstate = &vmx->state[vcpu];
967         if (vmxstate->lastcpu == curcpu)
968                 return;
969
970         vmxstate->lastcpu = curcpu;
971
972         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
973
974         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
975         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
976         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
977
978         /*
979          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
980          *
981          * We do this because this vcpu was executing on a different host
982          * cpu when it last ran. We do not track whether it invalidated
983          * mappings associated with its 'vpid' during that run. So we must
984          * assume that the mappings associated with 'vpid' on 'curcpu' are
985          * stale and invalidate them.
986          *
987          * Note that we incur this penalty only when the scheduler chooses to
988          * move the thread associated with this vcpu between host cpus.
989          *
990          * Note also that this will invalidate mappings tagged with 'vpid'
991          * for "all" EP4TAs.
992          */
993         if (vmxstate->vpid != 0) {
994                 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
995                         invvpid_desc._res1 = 0;
996                         invvpid_desc._res2 = 0;
997                         invvpid_desc.vpid = vmxstate->vpid;
998                         invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
999                 } else {
1000                         /*
1001                          * The invvpid can be skipped if an invept is going to
1002                          * be performed before entering the guest. The invept
1003                          * will invalidate combined mappings tagged with
1004                          * 'vmx->eptp' for all vpids.
1005                          */
1006                         vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1007                 }
1008         }
1009 }
1010
1011 /*
1012  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1013  */
1014 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1015
1016 static void __inline
1017 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1018 {
1019
1020         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1021                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1022                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1023                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1024         }
1025 }
1026
1027 static void __inline
1028 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1029 {
1030
1031         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1032             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1033         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1034         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1035         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1036 }
1037
1038 static void __inline
1039 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1040 {
1041
1042         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1043                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1044                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1045                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1046         }
1047 }
1048
1049 static void __inline
1050 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1051 {
1052
1053         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1054             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1055         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1056         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1057         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1058 }
1059
1060 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1061                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1062 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1063                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1064
1065 static void
1066 vmx_inject_user_event(struct vmx *vmx, int vcpu)
1067 {
1068         struct vmxevent *user_event;
1069         uint32_t info;
1070
1071         user_event = &vmx->state[vcpu].user_event;
1072         
1073         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1074         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_user_event: invalid "
1075             "VM-entry interruption information %#x", info));
1076
1077         vmcs_write(VMCS_ENTRY_INTR_INFO, user_event->intr_info);
1078         if (user_event->intr_info & VMCS_INTR_DEL_ERRCODE)
1079                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, user_event->error_code);
1080         user_event->intr_info = 0;
1081 }
1082
1083 static void
1084 vmx_inject_exception(struct vmx *vmx, int vcpu, struct vm_exit *vmexit,
1085     int fault, int errvalid, int errcode)
1086 {
1087         uint32_t info;
1088
1089         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1090         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_exception: invalid "
1091             "VM-entry interruption information %#x", info));
1092
1093         /*
1094          * Although INTR_T_HWEXCEPTION does not advance %rip, vmx_run()
1095          * always advances it, so we clear the instruction length to zero
1096          * explicitly.
1097          */
1098         vmexit->inst_length = 0;
1099         info = fault | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
1100         if (errvalid) {
1101                 info |= VMCS_INTR_DEL_ERRCODE;
1102                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, errcode);
1103         }
1104         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1105
1106         VCPU_CTR2(vmx->vm, vcpu, "Injecting fault %d (errcode %d)", fault,
1107             errcode);
1108 }
1109
1110 /* All GP# faults VMM injects use an error code of 0. */
1111 static void
1112 vmx_inject_gp(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1113 {
1114
1115         vmx_inject_exception(vmx, vcpu, vmexit, IDT_GP, 1, 0);
1116 }
1117
1118 static void
1119 vmx_inject_ud(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1120 {
1121
1122         vmx_inject_exception(vmx, vcpu, vmexit, IDT_UD, 0, 0);
1123 }
1124
1125 static void
1126 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1127 {
1128         uint32_t gi, info;
1129
1130         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1131         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1132             "interruptibility-state %#x", gi));
1133
1134         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1135         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1136             "VM-entry interruption information %#x", info));
1137
1138         /*
1139          * Inject the virtual NMI. The vector must be the NMI IDT entry
1140          * or the VMCS entry check will fail.
1141          */
1142         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1143         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1144
1145         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1146
1147         /* Clear the request */
1148         vm_nmi_clear(vmx->vm, vcpu);
1149 }
1150
1151 static void
1152 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1153 {
1154         int vector, need_nmi_exiting;
1155         uint64_t rflags;
1156         uint32_t gi, info;
1157
1158         if (vm_nmi_pending(vmx->vm, vcpu)) {
1159                 /*
1160                  * If there are no conditions blocking NMI injection then
1161                  * inject it directly here otherwise enable "NMI window
1162                  * exiting" to inject it as soon as we can.
1163                  *
1164                  * We also check for STI_BLOCKING because some implementations
1165                  * don't allow NMI injection in this case. If we are running
1166                  * on a processor that doesn't have this restriction it will
1167                  * immediately exit and the NMI will be injected in the
1168                  * "NMI window exiting" handler.
1169                  */
1170                 need_nmi_exiting = 1;
1171                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1172                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1173                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1174                         if ((info & VMCS_INTR_VALID) == 0) {
1175                                 vmx_inject_nmi(vmx, vcpu);
1176                                 need_nmi_exiting = 0;
1177                         } else {
1178                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1179                                     "due to VM-entry intr info %#x", info);
1180                         }
1181                 } else {
1182                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1183                             "Guest Interruptibility-state %#x", gi);
1184                 }
1185
1186                 if (need_nmi_exiting)
1187                         vmx_set_nmi_window_exiting(vmx, vcpu);
1188         }
1189
1190         /*
1191          * If there is a user injection event pending and there isn't
1192          * an interrupt queued already, inject the user event.
1193          */
1194         if (vmx->state[vcpu].user_event.intr_info & VMCS_INTR_VALID) {
1195                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1196                 if ((info & VMCS_INTR_VALID) == 0) {
1197                         vmx_inject_user_event(vmx, vcpu);
1198                 } else {
1199                         /*
1200                          * XXX: Do we need to force an exit so this can
1201                          * be injected?
1202                          */
1203                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject user event "
1204                             "due to VM-entry intr info %#x", info);
1205                 }
1206         }
1207          
1208         if (virtual_interrupt_delivery) {
1209                 vmx_inject_pir(vlapic);
1210                 return;
1211         }
1212
1213         /*
1214          * If interrupt-window exiting is already in effect then don't bother
1215          * checking for pending interrupts. This is just an optimization and
1216          * not needed for correctness.
1217          */
1218         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1219                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1220                     "pending int_window_exiting");
1221                 return;
1222         }
1223
1224         /* Ask the local apic for a vector to inject */
1225         if (!vlapic_pending_intr(vlapic, &vector))
1226                 return;
1227
1228         KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1229
1230         /* Check RFLAGS.IF and the interruptibility state of the guest */
1231         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1232         if ((rflags & PSL_I) == 0) {
1233                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1234                     "rflags %#lx", vector, rflags);
1235                 goto cantinject;
1236         }
1237
1238         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1239         if (gi & HWINTR_BLOCKING) {
1240                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1241                     "Guest Interruptibility-state %#x", vector, gi);
1242                 goto cantinject;
1243         }
1244
1245         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1246         if (info & VMCS_INTR_VALID) {
1247                 /*
1248                  * This is expected and could happen for multiple reasons:
1249                  * - A vectoring VM-entry was aborted due to astpending
1250                  * - A VM-exit happened during event injection
1251                  * - An NMI was injected above or after "NMI window exiting"
1252                  */
1253                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1254                     "VM-entry intr info %#x", vector, info);
1255                 goto cantinject;
1256         }
1257
1258         /* Inject the interrupt */
1259         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1260         info |= vector;
1261         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1262
1263         /* Update the Local APIC ISR */
1264         vlapic_intr_accepted(vlapic, vector);
1265
1266         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1267
1268         return;
1269
1270 cantinject:
1271         /*
1272          * Set the Interrupt Window Exiting execution control so we can inject
1273          * the interrupt as soon as the blocking condition goes away.
1274          */
1275         vmx_set_int_window_exiting(vmx, vcpu);
1276 }
1277
1278 /*
1279  * If the Virtual NMIs execution control is '1' then the logical processor
1280  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1281  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1282  * virtual-NMI blocking.
1283  *
1284  * This unblocking occurs even if the IRET causes a fault. In this case the
1285  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1286  */
1287 static void
1288 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1289 {
1290         uint32_t gi;
1291
1292         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1293         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1294         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1295         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1296 }
1297
1298 static void
1299 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1300 {
1301         uint32_t gi;
1302
1303         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1304         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1305         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1306         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1307 }
1308
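/*
 * Emulate XSETBV on behalf of the guest: verify that only %xcr0 is being
 * written and that the requested feature bits are permitted before loading
 * the value into the guest's %xcr0 (the guest FPU state is live here).
 */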
1309 static int
1310 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1311 {
1312         struct vmxctx *vmxctx;
1313         uint64_t xcrval;
1314         const struct xsave_limits *limits;
1315
1316         vmxctx = &vmx->ctx[vcpu];
1317         limits = vmm_get_xsave_limits();
1318
1319         /*
1320          * Note that the processor raises a GP# fault on its own if
1321          * xsetbv is executed for CPL != 0, so we do not have to
1322          * emulate that fault here.
1323          */
1324
1325         /* Only xcr0 is supported. */
1326         if (vmxctx->guest_rcx != 0) {
1327                 vmx_inject_gp(vmx, vcpu, vmexit);
1328                 return (HANDLED);
1329         }
1330
1331         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1332         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1333                 vmx_inject_ud(vmx, vcpu, vmexit);
1334                 return (HANDLED);
1335         }
1336
1337         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1338         if ((xcrval & ~limits->xcr0_allowed) != 0) {
1339                 vmx_inject_gp(vmx, vcpu, vmexit);
1340                 return (HANDLED);
1341         }
1342
1343         if (!(xcrval & XFEATURE_ENABLED_X87)) {
1344                 vmx_inject_gp(vmx, vcpu, vmexit);
1345                 return (HANDLED);
1346         }
1347
1348         if ((xcrval & (XFEATURE_ENABLED_AVX | XFEATURE_ENABLED_SSE)) ==
1349             XFEATURE_ENABLED_AVX) {
1350                 vmx_inject_gp(vmx, vcpu, vmexit);
1351                 return (HANDLED);
1352         }
1353
1354         /*
1355          * This runs "inside" vmrun() with the guest's FPU state, so
1356          * modifying xcr0 directly modifies the guest's xcr0, not the
1357          * host's.
1358          */
1359         load_xcr(0, xcrval);
1360         return (HANDLED);
1361 }
1362
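/*
 * Emulate a "mov to %cr0/%cr4" exit: fetch the source register from the
 * exit qualification, update the read shadow with the guest-visible value
 * and write the real control register with the fixed bits applied.
 */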
1363 static int
1364 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1365 {
1366         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1367         uint64_t crval, regval, ones_mask, zeros_mask;
1368         const struct vmxctx *vmxctx;
1369
1370         /* We only handle mov to %cr0 or %cr4 at this time */
1371         if ((exitqual & 0xf0) != 0x00)
1372                 return (UNHANDLED);
1373
1374         cr = exitqual & 0xf;
1375         if (cr != 0 && cr != 4)
1376                 return (UNHANDLED);
1377
1378         regval = 0; /* silence gcc */
1379         vmxctx = &vmx->ctx[vcpu];
1380
1381         /*
1382          * We must use vmcs_write() directly here because vmcs_setreg() will
1383          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1384          */
1385         switch ((exitqual >> 8) & 0xf) {
1386         case 0:
1387                 regval = vmxctx->guest_rax;
1388                 break;
1389         case 1:
1390                 regval = vmxctx->guest_rcx;
1391                 break;
1392         case 2:
1393                 regval = vmxctx->guest_rdx;
1394                 break;
1395         case 3:
1396                 regval = vmxctx->guest_rbx;
1397                 break;
1398         case 4:
1399                 regval = vmcs_read(VMCS_GUEST_RSP);
1400                 break;
1401         case 5:
1402                 regval = vmxctx->guest_rbp;
1403                 break;
1404         case 6:
1405                 regval = vmxctx->guest_rsi;
1406                 break;
1407         case 7:
1408                 regval = vmxctx->guest_rdi;
1409                 break;
1410         case 8:
1411                 regval = vmxctx->guest_r8;
1412                 break;
1413         case 9:
1414                 regval = vmxctx->guest_r9;
1415                 break;
1416         case 10:
1417                 regval = vmxctx->guest_r10;
1418                 break;
1419         case 11:
1420                 regval = vmxctx->guest_r11;
1421                 break;
1422         case 12:
1423                 regval = vmxctx->guest_r12;
1424                 break;
1425         case 13:
1426                 regval = vmxctx->guest_r13;
1427                 break;
1428         case 14:
1429                 regval = vmxctx->guest_r14;
1430                 break;
1431         case 15:
1432                 regval = vmxctx->guest_r15;
1433                 break;
1434         }
1435
1436         if (cr == 0) {
1437                 ones_mask = cr0_ones_mask;
1438                 zeros_mask = cr0_zeros_mask;
1439                 vmcs_guest_cr = VMCS_GUEST_CR0;
1440                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1441         } else {
1442                 ones_mask = cr4_ones_mask;
1443                 zeros_mask = cr4_zeros_mask;
1444                 vmcs_guest_cr = VMCS_GUEST_CR4;
1445                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1446         }
1447         vmcs_write(vmcs_shadow_cr, regval);
1448
1449         crval = regval | ones_mask;
1450         crval &= ~zeros_mask;
1451         vmcs_write(vmcs_guest_cr, crval);
1452
1453         if (cr == 0 && regval & CR0_PG) {
1454                 uint64_t efer, entry_ctls;
1455
1456                 /*
1457                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1458                  * the "IA-32e mode guest" bit in VM-entry control must be
1459                  * equal.
1460                  */
1461                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1462                 if (efer & EFER_LME) {
1463                         efer |= EFER_LMA;
1464                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1465                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1466                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1467                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1468                 }
1469         }
1470
1471         return (HANDLED);
1472 }
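
/*
 * For reference, a minimal standalone sketch (not part of this file) of how
 * the mov-to-CR exit qualification is decoded above: bits 3:0 give the
 * control register number, bits 5:4 the access type (0 is "mov to CR", the
 * only type handled here) and bits 11:8 the source general purpose register
 * (Intel SDM, "Exit Qualification for Control-Register Accesses").
 */
#include <stdint.h>

struct cr_access {
	int	cr;	/* control register number */
	int	type;	/* 0 = mov to CR, 1 = mov from CR, 2 = clts, 3 = lmsw */
	int	gpr;	/* encoded general purpose register, 0..15 */
};

static struct cr_access
cr_access_decode(uint64_t exitqual)
{
	struct cr_access ca;

	ca.cr = exitqual & 0xf;
	ca.type = (exitqual >> 4) & 0x3;
	ca.gpr = (exitqual >> 8) & 0xf;
	return (ca);
}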
1473
1474 static enum vie_cpu_mode
1475 vmx_cpu_mode(void)
1476 {
1477
1478         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1479                 return (CPU_MODE_64BIT);
1480         else
1481                 return (CPU_MODE_COMPATIBILITY);
1482 }
1483
1484 static enum vie_paging_mode
1485 vmx_paging_mode(void)
1486 {
1487
1488         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1489                 return (PAGING_MODE_FLAT);
1490         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1491                 return (PAGING_MODE_32);
1492         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1493                 return (PAGING_MODE_64);
1494         else
1495                 return (PAGING_MODE_PAE);
1496 }
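
/*
 * For reference, the paging mode decision above follows the standard
 * IA-32/IA-32e rules:
 *
 *	CR0.PG	CR4.PAE	EFER.LME	guest paging mode
 *	  0	   -	   -		flat (paging disabled)
 *	  1	   0	   -		32-bit
 *	  1	   1	   0		PAE
 *	  1	   1	   1		64-bit (long mode)
 */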
1497
1498 static int
1499 ept_fault_type(uint64_t ept_qual)
1500 {
1501         int fault_type;
1502
1503         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1504                 fault_type = VM_PROT_WRITE;
1505         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1506                 fault_type = VM_PROT_EXECUTE;
1507         else
1508                 fault_type = VM_PROT_READ;
1509
1510         return (fault_type);
1511 }
1512
1513 static boolean_t
1514 ept_emulation_fault(uint64_t ept_qual)
1515 {
1516         int read, write;
1517
1518         /* EPT fault on an instruction fetch doesn't make sense here */
1519         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1520                 return (FALSE);
1521
1522         /* EPT fault must be a read fault or a write fault */
1523         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1524         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1525         if ((read | write) == 0)
1526                 return (FALSE);
1527
1528         /*
1529          * The EPT violation must have been caused by accessing a
1530          * guest-physical address that is a translation of a guest-linear
1531          * address.
1532          */
1533         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1534             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1535                 return (FALSE);
1536         }
1537
1538         return (TRUE);
1539 }
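
/*
 * For reference, the EPT violation exit qualification bits consulted above
 * (Intel SDM, "Exit Qualification for EPT Violations"): bit 0 is set for a
 * data read, bit 1 for a data write, bit 2 for an instruction fetch, bit 7
 * when the guest-linear address field is valid and bit 8 when the access was
 * to the translation of that linear address.
 */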
1540
1541 static int
1542 vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1543 {
1544         int error, handled, offset;
1545         bool retu;
1546
1547         if (!virtual_interrupt_delivery)
1548                 return (UNHANDLED);
1549
1550         handled = HANDLED;
1551         offset = APIC_WRITE_OFFSET(qual);
1552         switch (offset) {
1553         case APIC_OFFSET_ID:
1554                 vlapic_id_write_handler(vlapic);
1555                 break;
1556         case APIC_OFFSET_LDR:
1557                 vlapic_ldr_write_handler(vlapic);
1558                 break;
1559         case APIC_OFFSET_DFR:
1560                 vlapic_dfr_write_handler(vlapic);
1561                 break;
1562         case APIC_OFFSET_SVR:
1563                 vlapic_svr_write_handler(vlapic);
1564                 break;
1565         case APIC_OFFSET_ESR:
1566                 vlapic_esr_write_handler(vlapic);
1567                 break;
1568         case APIC_OFFSET_ICR_LOW:
1569                 retu = false;
1570                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1571                 if (error != 0 || retu)
1572                         handled = UNHANDLED;
1573                 break;
1574         case APIC_OFFSET_CMCI_LVT:
1575         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1576                 vlapic_lvt_write_handler(vlapic, offset);
1577                 break;
1578         case APIC_OFFSET_TIMER_ICR:
1579                 vlapic_icrtmr_write_handler(vlapic);
1580                 break;
1581         case APIC_OFFSET_TIMER_DCR:
1582                 vlapic_dcr_write_handler(vlapic);
1583                 break;
1584         default:
1585                 handled = UNHANDLED;
1586                 break;
1587         }
1588         return (handled);
1589 }
1590
1591 static bool
1592 apic_access_fault(uint64_t gpa)
1593 {
1594
1595         if (virtual_interrupt_delivery &&
1596             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1597                 return (true);
1598         else
1599                 return (false);
1600 }
1601
1602 static int
1603 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1604 {
1605         uint64_t qual;
1606         int access_type, offset, allowed;
1607
1608         if (!virtual_interrupt_delivery)
1609                 return (UNHANDLED);
1610
1611         qual = vmexit->u.vmx.exit_qualification;
1612         access_type = APIC_ACCESS_TYPE(qual);
1613         offset = APIC_ACCESS_OFFSET(qual);
1614
1615         allowed = 0;
1616         if (access_type == 0) {
1617                 /*
1618                  * Read data access to the following registers is expected.
1619                  */
1620                 switch (offset) {
1621                 case APIC_OFFSET_APR:
1622                 case APIC_OFFSET_PPR:
1623                 case APIC_OFFSET_RRR:
1624                 case APIC_OFFSET_CMCI_LVT:
1625                 case APIC_OFFSET_TIMER_CCR:
1626                         allowed = 1;
1627                         break;
1628                 default:
1629                         break;
1630                 }
1631         } else if (access_type == 1) {
1632                 /*
1633                  * Write data access to the following registers is expected.
1634                  */
1635                 switch (offset) {
1636                 case APIC_OFFSET_VER:
1637                 case APIC_OFFSET_APR:
1638                 case APIC_OFFSET_PPR:
1639                 case APIC_OFFSET_RRR:
1640                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1641                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1642                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1643                 case APIC_OFFSET_CMCI_LVT:
1644                 case APIC_OFFSET_TIMER_CCR:
1645                         allowed = 1;
1646                         break;
1647                 default:
1648                         break;
1649                 }
1650         }
1651
1652         if (allowed) {
1653                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1654                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1655                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1656                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1657                 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1658                 vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1659         }
1660
1661         /*
1662          * Regardless of whether the APIC access is allowed, this handler
1663          * always returns UNHANDLED:
1664          * - if the access is allowed then it is handled by emulating the
1665          *   instruction that caused the VM-exit (outside the critical section)
1666          * - if the access is not allowed then it will be converted to an
1667          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1668          */
1669         return (UNHANDLED);
1670 }
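
/*
 * For reference, a minimal standalone sketch (not part of this file) of the
 * APIC-access exit qualification fields used above: bits 11:0 hold the offset
 * into the APIC page and bits 15:12 the access type, where 0 is a linear
 * access for a data read and 1 a linear access for a data write (Intel SDM,
 * "Exit Qualification for APIC-Access VM Exits").
 */
#include <stdint.h>

static void
apic_access_decode(uint64_t qual, int *offset, int *access_type)
{
	*offset = qual & 0xfff;
	*access_type = (qual >> 12) & 0xf;
}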
1671
1672 static int
1673 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1674 {
1675         int error, handled;
1676         struct vmxctx *vmxctx;
1677         struct vlapic *vlapic;
1678         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1679         uint64_t qual, gpa;
1680         bool retu;
1681
1682         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1683         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1684
1685         handled = UNHANDLED;
1686         vmxctx = &vmx->ctx[vcpu];
1687
1688         qual = vmexit->u.vmx.exit_qualification;
1689         reason = vmexit->u.vmx.exit_reason;
1690         vmexit->exitcode = VM_EXITCODE_BOGUS;
1691
1692         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1693
1694         /*
1695          * VM exits that could be triggered during event injection on the
1696          * previous VM entry need to be handled specially by re-injecting
1697          * the event.
1698          *
1699          * See "Information for VM Exits During Event Delivery" in Intel SDM
1700          * for details.
1701          */
1702         switch (reason) {
1703         case EXIT_REASON_EPT_FAULT:
1704         case EXIT_REASON_EPT_MISCONFIG:
1705         case EXIT_REASON_APIC_ACCESS:
1706         case EXIT_REASON_TASK_SWITCH:
1707         case EXIT_REASON_EXCEPTION:
1708                 idtvec_info = vmcs_idt_vectoring_info();
1709                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1710                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1711                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1712                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1713                                 idtvec_err = vmcs_idt_vectoring_err();
1714                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1715                                     idtvec_err);
1716                         }
1717                         /*
1718                          * If 'virtual NMIs' are being used and the VM-exit
1719                          * happened while injecting an NMI during the previous
1720                          * VM-entry, then clear "blocking by NMI" in the Guest
1721                          * Interruptibility-state.
1722                          */
1723                         if ((idtvec_info & VMCS_INTR_T_MASK) ==
1724                             VMCS_INTR_T_NMI) {
1725                                  vmx_clear_nmi_blocking(vmx, vcpu);
1726                         }
1727                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1728                 }
1729         default:
1730                 idtvec_info = 0;
1731                 break;
1732         }
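
        /*
         * For reference, the IDT-vectoring information field re-injected
         * above has the same layout as the VM-entry interruption field:
         * bits 7:0 hold the vector, bits 10:8 the interruption type
         * (2 denotes NMI), bit 11 says whether an error code is delivered
         * and bit 31 marks the field as valid.
         */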
1733
1734         switch (reason) {
1735         case EXIT_REASON_CR_ACCESS:
1736                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1737                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1738                 break;
1739         case EXIT_REASON_RDMSR:
1740                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1741                 retu = false;
1742                 ecx = vmxctx->guest_rcx;
1743                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1744                 if (error) {
1745                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1746                         vmexit->u.msr.code = ecx;
1747                 } else if (!retu) {
1748                         handled = HANDLED;
1749                 } else {
1750                         /* Return to userspace with a valid exitcode */
1751                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1752                             ("emulate_rdmsr retu with bogus exitcode"));
1753                 }
1754                 break;
1755         case EXIT_REASON_WRMSR:
1756                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1757                 retu = false;
1758                 eax = vmxctx->guest_rax;
1759                 ecx = vmxctx->guest_rcx;
1760                 edx = vmxctx->guest_rdx;
1761                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1762                     (uint64_t)edx << 32 | eax, &retu);
1763                 if (error) {
1764                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1765                         vmexit->u.msr.code = ecx;
1766                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1767                 } else if (!retu) {
1768                         handled = HANDLED;
1769                 } else {
1770                         /* Return to userspace with a valid exitcode */
1771                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1772                             ("emulate_wrmsr retu with bogus exitcode"));
1773                 }
1774                 break;
1775         case EXIT_REASON_HLT:
1776                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1777                 vmexit->exitcode = VM_EXITCODE_HLT;
1778                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1779                 break;
1780         case EXIT_REASON_MTF:
1781                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1782                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1783                 break;
1784         case EXIT_REASON_PAUSE:
1785                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1786                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1787                 break;
1788         case EXIT_REASON_INTR_WINDOW:
1789                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1790                 vmx_clear_int_window_exiting(vmx, vcpu);
1791                 return (1);
1792         case EXIT_REASON_EXT_INTR:
1793                 /*
1794                  * External interrupts serve only to cause VM exits and allow
1795                  * the host interrupt handler to run.
1796                  *
1797                  * If this external interrupt triggers a virtual interrupt
1798                  * to a VM, then that state will be recorded by the
1799                  * host interrupt handler in the VM's softc. We will inject
1800                  * this virtual interrupt during the subsequent VM enter.
1801                  */
1802                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1803                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1804                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1805                     ("VM exit interruption info invalid: %#x", intr_info));
1806                 vmx_trigger_hostintr(intr_info & 0xff);
1807
1808                 /*
1809                  * This is special. We want to treat this as a 'handled'
1810                  * VM-exit but not increment the instruction pointer.
1811                  */
1812                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1813                 return (1);
1814         case EXIT_REASON_NMI_WINDOW:
1815                 /* Exit to allow the pending virtual NMI to be injected */
1816                 if (vm_nmi_pending(vmx->vm, vcpu))
1817                         vmx_inject_nmi(vmx, vcpu);
1818                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1819                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1820                 return (1);
1821         case EXIT_REASON_INOUT:
1822                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1823                 vmexit->exitcode = VM_EXITCODE_INOUT;
1824                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1825                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1826                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1827                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1828                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1829                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1830                 break;
1831         case EXIT_REASON_CPUID:
1832                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1833                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1834                 break;
1835         case EXIT_REASON_EXCEPTION:
1836                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1837                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1838                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1839                     ("VM exit interruption info invalid: %#x", intr_info));
1840
1841                 /*
1842                  * If the 'virtual NMIs' control is 1 and the VM-exit is due to
1843                  * a fault encountered during the execution of IRET, then we must
1844                  * restore the state of "virtual-NMI blocking" before resuming
1845                  * the guest.
1846                  *
1847                  * See "Resuming Guest Software after Handling an Exception".
1848                  */
1849                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1850                     (intr_info & 0xff) != IDT_DF &&
1851                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1852                         vmx_restore_nmi_blocking(vmx, vcpu);
1853
1854                 /*
1855                  * The NMI has already been handled in vmx_exit_handle_nmi().
1856                  */
1857                 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1858                         return (1);
1859                 break;
1860         case EXIT_REASON_EPT_FAULT:
1861                 /*
1862                  * If 'gpa' lies within the address space allocated to
1863                  * memory, then this must be a nested page fault; otherwise
1864                  * it must be an instruction that accesses MMIO space.
1865                  */
1866                 gpa = vmcs_gpa();
1867                 if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1868                         vmexit->exitcode = VM_EXITCODE_PAGING;
1869                         vmexit->u.paging.gpa = gpa;
1870                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1871                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1872                 } else if (ept_emulation_fault(qual)) {
1873                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1874                         vmexit->u.inst_emul.gpa = gpa;
1875                         vmexit->u.inst_emul.gla = vmcs_gla();
1876                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1877                         vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1878                         vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1879                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
1880                 }
1881                 /*
1882                  * If the 'virtual NMIs' control is 1 and the VM-exit is due
1883                  * to an EPT fault during the execution of IRET, then we must
1884                  * restore the state of "virtual-NMI blocking" before resuming.
1885                  *
1886                  * See description of "NMI unblocking due to IRET" in
1887                  * "Exit Qualification for EPT Violations".
1888                  */
1889                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1890                     (qual & EXIT_QUAL_NMIUDTI) != 0)
1891                         vmx_restore_nmi_blocking(vmx, vcpu);
1892                 break;
1893         case EXIT_REASON_VIRTUALIZED_EOI:
1894                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
1895                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
1896                 vmexit->inst_length = 0;        /* trap-like */
1897                 break;
1898         case EXIT_REASON_APIC_ACCESS:
1899                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1900                 break;
1901         case EXIT_REASON_APIC_WRITE:
1902                 /*
1903                  * APIC-write VM exit is trap-like so the %rip is already
1904                  * pointing to the next instruction.
1905                  */
1906                 vmexit->inst_length = 0;
1907                 vlapic = vm_lapic(vmx->vm, vcpu);
1908                 handled = vmx_handle_apic_write(vlapic, qual);
1909                 break;
1910         case EXIT_REASON_XSETBV:
1911                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
1912                 break;
1913         default:
1914                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1915                 break;
1916         }
1917
1918         if (handled) {
1919                 /*
1920                  * It is possible that control is returned to userland
1921                  * even though we were able to handle the VM exit in the
1922                  * kernel.
1923                  *
1924                  * In such a case we want to make sure that the userland
1925                  * restarts guest execution at the instruction *after*
1926                  * the one we just processed. Therefore we update the
1927                  * guest rip in the VMCS and in 'vmexit'.
1928                  */
1929                 vmexit->rip += vmexit->inst_length;
1930                 vmexit->inst_length = 0;
1931                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1932         } else {
1933                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1934                         /*
1935                          * If this VM exit was not claimed by anybody then
1936                          * treat it as a generic VMX exit.
1937                          */
1938                         vmexit->exitcode = VM_EXITCODE_VMX;
1939                         vmexit->u.vmx.status = VM_SUCCESS;
1940                         vmexit->u.vmx.inst_type = 0;
1941                         vmexit->u.vmx.inst_error = 0;
1942                 } else {
1943                         /*
1944                          * The exitcode and collateral have been populated.
1945                          * The VM exit will be processed further in userland.
1946                          */
1947                 }
1948         }
1949         return (handled);
1950 }
1951
1952 static __inline int
1953 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1954 {
1955
1956         vmexit->rip = vmcs_guest_rip();
1957         vmexit->inst_length = 0;
1958         vmexit->exitcode = VM_EXITCODE_BOGUS;
1959         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1960         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1961
1962         return (HANDLED);
1963 }
1964
1965 static __inline int
1966 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1967 {
1968
1969         vmexit->rip = vmcs_guest_rip();
1970         vmexit->inst_length = 0;
1971         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1972         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
1973
1974         return (UNHANDLED);
1975 }
1976
1977 static __inline int
1978 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1979 {
1980
1981         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1982             ("vmx_exit_inst_error: invalid inst_fail_status %d",
1983             vmxctx->inst_fail_status));
1984
1985         vmexit->inst_length = 0;
1986         vmexit->exitcode = VM_EXITCODE_VMX;
1987         vmexit->u.vmx.status = vmxctx->inst_fail_status;
1988         vmexit->u.vmx.inst_error = vmcs_instruction_error();
1989         vmexit->u.vmx.exit_reason = ~0;
1990         vmexit->u.vmx.exit_qualification = ~0;
1991
1992         switch (rc) {
1993         case VMX_VMRESUME_ERROR:
1994         case VMX_VMLAUNCH_ERROR:
1995         case VMX_INVEPT_ERROR:
1996                 vmexit->u.vmx.inst_type = rc;
1997                 break;
1998         default:
1999                 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2000         }
2001
2002         return (UNHANDLED);
2003 }
2004
2005 /*
2006  * If the NMI-exiting VM execution control is set to '1' then an NMI in
2007  * non-root operation causes a VM-exit. NMI blocking is in effect, so it is
2008  * sufficient to simply vector to the NMI handler via a software interrupt.
2009  * However, this must be done before maskable interrupts are enabled;
2010  * otherwise the "iret" issued by an interrupt handler will incorrectly
2011  * clear NMI blocking.
2012  */
2013 static __inline void
2014 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2015 {
2016         uint32_t intr_info;
2017
2018         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2019
2020         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2021                 return;
2022
2023         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2024         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2025             ("VM exit interruption info invalid: %#x", intr_info));
2026
2027         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2028                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2029                     "to NMI has invalid vector: %#x", intr_info));
2030                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
2031                 __asm __volatile("int $2");
2032         }
2033 }
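
/*
 * For reference, a minimal standalone sketch (not part of this file) of the
 * VM-exit interruption information test above: bits 7:0 carry the vector
 * (2 for NMI), bits 10:8 the interruption type (2 denotes NMI) and bit 31
 * the valid bit.  Compiles on its own with <stdint.h> and <stdbool.h>.
 */
#include <stdint.h>
#include <stdbool.h>

static bool
exit_intr_info_is_nmi(uint32_t intr_info)
{
	const uint32_t valid = 1u << 31;
	const uint32_t type_mask = 0x7u << 8;
	const uint32_t type_nmi = 0x2u << 8;

	return ((intr_info & valid) != 0 &&
	    (intr_info & type_mask) == type_nmi &&
	    (intr_info & 0xff) == 2);		/* vector 2 == IDT_NMI */
}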
2034
2035 static int
2036 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
2037     void *rendezvous_cookie)
2038 {
2039         int rc, handled, launched;
2040         struct vmx *vmx;
2041         struct vm *vm;
2042         struct vmxctx *vmxctx;
2043         struct vmcs *vmcs;
2044         struct vm_exit *vmexit;
2045         struct vlapic *vlapic;
2046         uint64_t rip;
2047         uint32_t exit_reason;
2048
2049         vmx = arg;
2050         vm = vmx->vm;
2051         vmcs = &vmx->vmcs[vcpu];
2052         vmxctx = &vmx->ctx[vcpu];
2053         vlapic = vm_lapic(vm, vcpu);
2054         vmexit = vm_exitinfo(vm, vcpu);
2055         launched = 0;
2056
2057         KASSERT(vmxctx->pmap == pmap,
2058             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2059
2060         VMPTRLD(vmcs);
2061
2062         /*
2063          * XXX
2064          * We do this every time because we may set up the virtual machine
2065          * from a different process than the one that actually runs it.
2066          *
2067          * If the life of a virtual machine was spent entirely in the context
2068          * of a single process, we could do this once in vmx_vminit().
2069          */
2070         vmcs_write(VMCS_HOST_CR3, rcr3());
2071
2072         vmcs_write(VMCS_GUEST_RIP, startrip);
2073         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2074         do {
2075                 /*
2076                  * Interrupts are disabled from this point on until the
2077                  * guest starts executing. This is done for the following
2078                  * reasons:
2079                  *
2080                  * If an AST is asserted on this thread after the check below,
2081                  * then the IPI_AST notification will not be lost, because it
2082                  * will cause a VM exit due to external interrupt as soon as
2083                  * the guest state is loaded.
2084                  *
2085                  * A posted interrupt after 'vmx_inject_interrupts()' will
2086                  * not be "lost" because it will be held pending in the host
2087                  * APIC because interrupts are disabled. The pending interrupt
2088                  * will be recognized as soon as the guest state is loaded.
2089                  *
2090                  * The same reasoning applies to the IPI generated by
2091                  * pmap_invalidate_ept().
2092                  */
2093                 disable_intr();
2094                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
2095                         enable_intr();
2096                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
2097                         break;
2098                 }
2099
2100                 if (vcpu_rendezvous_pending(rendezvous_cookie)) {
2101                         enable_intr();
2102                         handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
2103                         break;
2104                 }
2105
2106                 vmx_inject_interrupts(vmx, vcpu, vlapic);
2107                 vmx_run_trace(vmx, vcpu);
2108                 rc = vmx_enter_guest(vmxctx, vmx, launched);
2109
2110                 /* Collect some information for VM exit processing */
2111                 vmexit->rip = rip = vmcs_guest_rip();
2112                 vmexit->inst_length = vmexit_instruction_length();
2113                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2114                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2115
2116                 if (rc == VMX_GUEST_VMEXIT) {
2117                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2118                         enable_intr();
2119                         handled = vmx_exit_process(vmx, vcpu, vmexit);
2120                 } else {
2121                         enable_intr();
2122                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
2123                 }
2124                 launched = 1;
2125                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2126         } while (handled);
2127
2128         /*
2129          * If a VM exit has been handled then the exitcode must be BOGUS;
2130          * if a VM exit is not handled then the exitcode must not be BOGUS.
2131          */
2132         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2133             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2134                 panic("Mismatch between handled (%d) and exitcode (%d)",
2135                       handled, vmexit->exitcode);
2136         }
2137
2138         if (!handled)
2139                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2140
2141         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2142             vmexit->exitcode);
2143
2144         VMCLEAR(vmcs);
2145         return (0);
2146 }
2147
2148 static void
2149 vmx_vmcleanup(void *arg)
2150 {
2151         int i, error;
2152         struct vmx *vmx = arg;
2153
2154         if (virtual_interrupt_delivery)
2155                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2156
2157         for (i = 0; i < VM_MAXCPU; i++)
2158                 vpid_free(vmx->state[i].vpid);
2159
2160         /*
2161          * XXXSMP we also need to clear the VMCS active on the other vcpus.
2162          */
2163         error = vmclear(&vmx->vmcs[0]);
2164         if (error != 0)
2165                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
2166
2167         free(vmx, M_VMX);
2168
2169         return;
2170 }
2171
2172 static register_t *
2173 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2174 {
2175
2176         switch (reg) {
2177         case VM_REG_GUEST_RAX:
2178                 return (&vmxctx->guest_rax);
2179         case VM_REG_GUEST_RBX:
2180                 return (&vmxctx->guest_rbx);
2181         case VM_REG_GUEST_RCX:
2182                 return (&vmxctx->guest_rcx);
2183         case VM_REG_GUEST_RDX:
2184                 return (&vmxctx->guest_rdx);
2185         case VM_REG_GUEST_RSI:
2186                 return (&vmxctx->guest_rsi);
2187         case VM_REG_GUEST_RDI:
2188                 return (&vmxctx->guest_rdi);
2189         case VM_REG_GUEST_RBP:
2190                 return (&vmxctx->guest_rbp);
2191         case VM_REG_GUEST_R8:
2192                 return (&vmxctx->guest_r8);
2193         case VM_REG_GUEST_R9:
2194                 return (&vmxctx->guest_r9);
2195         case VM_REG_GUEST_R10:
2196                 return (&vmxctx->guest_r10);
2197         case VM_REG_GUEST_R11:
2198                 return (&vmxctx->guest_r11);
2199         case VM_REG_GUEST_R12:
2200                 return (&vmxctx->guest_r12);
2201         case VM_REG_GUEST_R13:
2202                 return (&vmxctx->guest_r13);
2203         case VM_REG_GUEST_R14:
2204                 return (&vmxctx->guest_r14);
2205         case VM_REG_GUEST_R15:
2206                 return (&vmxctx->guest_r15);
2207         default:
2208                 break;
2209         }
2210         return (NULL);
2211 }
2212
2213 static int
2214 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2215 {
2216         register_t *regp;
2217
2218         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2219                 *retval = *regp;
2220                 return (0);
2221         } else
2222                 return (EINVAL);
2223 }
2224
2225 static int
2226 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2227 {
2228         register_t *regp;
2229
2230         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2231                 *regp = val;
2232                 return (0);
2233         } else
2234                 return (EINVAL);
2235 }
2236
2237 static int
2238 vmx_shadow_reg(int reg)
2239 {
2240         int shreg;
2241
2242         shreg = -1;
2243
2244         switch (reg) {
2245         case VM_REG_GUEST_CR0:
2246                 shreg = VMCS_CR0_SHADOW;
2247                 break;
2248         case VM_REG_GUEST_CR4:
2249                 shreg = VMCS_CR4_SHADOW;
2250                 break;
2251         default:
2252                 break;
2253         }
2254
2255         return (shreg);
2256 }
2257
2258 static int
2259 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2260 {
2261         int running, hostcpu;
2262         struct vmx *vmx = arg;
2263
2264         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2265         if (running && hostcpu != curcpu)
2266                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2267
2268         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2269                 return (0);
2270
2271         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2272 }
2273
2274 static int
2275 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2276 {
2277         int error, hostcpu, running, shadow;
2278         uint64_t ctls;
2279         struct vmx *vmx = arg;
2280
2281         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2282         if (running && hostcpu != curcpu)
2283                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2284
2285         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2286                 return (0);
2287
2288         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2289
2290         if (error == 0) {
2291                 /*
2292                  * If the "load EFER" VM-entry control is 1 then the
2293                  * value of EFER.LMA must be identical to the "IA-32e mode
2294                  * guest" bit in the VM-entry controls.
2295                  */
2296                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2297                     (reg == VM_REG_GUEST_EFER)) {
2298                         vmcs_getreg(&vmx->vmcs[vcpu], running,
2299                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2300                         if (val & EFER_LMA)
2301                                 ctls |= VM_ENTRY_GUEST_LMA;
2302                         else
2303                                 ctls &= ~VM_ENTRY_GUEST_LMA;
2304                         vmcs_setreg(&vmx->vmcs[vcpu], running,
2305                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2306                 }
2307
2308                 shadow = vmx_shadow_reg(reg);
2309                 if (shadow > 0) {
2310                         /*
2311                          * Store the unmodified value in the shadow
2312                          */                     
2313                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2314                                     VMCS_IDENT(shadow), val);
2315                 }
2316         }
2317
2318         return (error);
2319 }
2320
2321 static int
2322 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2323 {
2324         struct vmx *vmx = arg;
2325
2326         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2327 }
2328
2329 static int
2330 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2331 {
2332         struct vmx *vmx = arg;
2333
2334         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2335 }
2336
2337 static int
2338 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
2339            int code_valid)
2340 {
2341         struct vmx *vmx = arg;
2342         struct vmxevent *user_event = &vmx->state[vcpu].user_event;
2343
2344         static uint32_t type_map[VM_EVENT_MAX] = {
2345                 0x1,            /* VM_EVENT_NONE */
2346                 0x0,            /* VM_HW_INTR */
2347                 0x2,            /* VM_NMI */
2348                 0x3,            /* VM_HW_EXCEPTION */
2349                 0x4,            /* VM_SW_INTR */
2350                 0x5,            /* VM_PRIV_SW_EXCEPTION */
2351                 0x6,            /* VM_SW_EXCEPTION */
2352         };
2353
2354         /*
2355          * If there is already an exception pending to be delivered to the
2356          * vcpu then just return.
2357          */
2358         if (user_event->intr_info & VMCS_INTR_VALID)
2359                 return (EAGAIN);
2360
2361         user_event->intr_info = vector | (type_map[type] << 8) | VMCS_INTR_VALID;
2362         if (code_valid) {
2363                 user_event->intr_info |= VMCS_INTR_DEL_ERRCODE;
2364                 user_event->error_code = code;
2365         }
2366         return (0);
2367 }
2368
2369 static int
2370 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2371 {
2372         struct vmx *vmx = arg;
2373         int vcap;
2374         int ret;
2375
2376         ret = ENOENT;
2377
2378         vcap = vmx->cap[vcpu].set;
2379
2380         switch (type) {
2381         case VM_CAP_HALT_EXIT:
2382                 if (cap_halt_exit)
2383                         ret = 0;
2384                 break;
2385         case VM_CAP_PAUSE_EXIT:
2386                 if (cap_pause_exit)
2387                         ret = 0;
2388                 break;
2389         case VM_CAP_MTRAP_EXIT:
2390                 if (cap_monitor_trap)
2391                         ret = 0;
2392                 break;
2393         case VM_CAP_UNRESTRICTED_GUEST:
2394                 if (cap_unrestricted_guest)
2395                         ret = 0;
2396                 break;
2397         case VM_CAP_ENABLE_INVPCID:
2398                 if (cap_invpcid)
2399                         ret = 0;
2400                 break;
2401         default:
2402                 break;
2403         }
2404
2405         if (ret == 0)
2406                 *retval = (vcap & (1 << type)) ? 1 : 0;
2407
2408         return (ret);
2409 }
2410
2411 static int
2412 vmx_setcap(void *arg, int vcpu, int type, int val)
2413 {
2414         struct vmx *vmx = arg;
2415         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2416         uint32_t baseval;
2417         uint32_t *pptr;
2418         int error;
2419         int flag;
2420         int reg;
2421         int retval;
2422
2423         retval = ENOENT;
2424         pptr = NULL;
2425
2426         switch (type) {
2427         case VM_CAP_HALT_EXIT:
2428                 if (cap_halt_exit) {
2429                         retval = 0;
2430                         pptr = &vmx->cap[vcpu].proc_ctls;
2431                         baseval = *pptr;
2432                         flag = PROCBASED_HLT_EXITING;
2433                         reg = VMCS_PRI_PROC_BASED_CTLS;
2434                 }
2435                 break;
2436         case VM_CAP_MTRAP_EXIT:
2437                 if (cap_monitor_trap) {
2438                         retval = 0;
2439                         pptr = &vmx->cap[vcpu].proc_ctls;
2440                         baseval = *pptr;
2441                         flag = PROCBASED_MTF;
2442                         reg = VMCS_PRI_PROC_BASED_CTLS;
2443                 }
2444                 break;
2445         case VM_CAP_PAUSE_EXIT:
2446                 if (cap_pause_exit) {
2447                         retval = 0;
2448                         pptr = &vmx->cap[vcpu].proc_ctls;
2449                         baseval = *pptr;
2450                         flag = PROCBASED_PAUSE_EXITING;
2451                         reg = VMCS_PRI_PROC_BASED_CTLS;
2452                 }
2453                 break;
2454         case VM_CAP_UNRESTRICTED_GUEST:
2455                 if (cap_unrestricted_guest) {
2456                         retval = 0;
2457                         pptr = &vmx->cap[vcpu].proc_ctls2;
2458                         baseval = *pptr;
2459                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2460                         reg = VMCS_SEC_PROC_BASED_CTLS;
2461                 }
2462                 break;
2463         case VM_CAP_ENABLE_INVPCID:
2464                 if (cap_invpcid) {
2465                         retval = 0;
2466                         pptr = &vmx->cap[vcpu].proc_ctls2;
2467                         baseval = *pptr;
2468                         flag = PROCBASED2_ENABLE_INVPCID;
2469                         reg = VMCS_SEC_PROC_BASED_CTLS;
2470                 }
2471                 break;
2472         default:
2473                 break;
2474         }
2475
2476         if (retval == 0) {
2477                 if (val) {
2478                         baseval |= flag;
2479                 } else {
2480                         baseval &= ~flag;
2481                 }
2482                 VMPTRLD(vmcs);
2483                 error = vmwrite(reg, baseval);
2484                 VMCLEAR(vmcs);
2485
2486                 if (error) {
2487                         retval = error;
2488                 } else {
2489                         /*
2490                          * Update the optional stored flags and record
2491                          * the new setting.
2492                          */
2493                         if (pptr != NULL) {
2494                                 *pptr = baseval;
2495                         }
2496
2497                         if (val) {
2498                                 vmx->cap[vcpu].set |= (1 << type);
2499                         } else {
2500                                 vmx->cap[vcpu].set &= ~(1 << type);
2501                         }
2502                 }
2503         }
2504
2505         return (retval);
2506 }
2507
2508 struct vlapic_vtx {
2509         struct vlapic   vlapic;
2510         struct pir_desc *pir_desc;
2511         struct vmx      *vmx;
2512 };
2513
2514 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2515 do {                                                                    \
2516         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2517             level ? "level" : "edge", vector);                          \
2518         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2519         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2520         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2521         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2522         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2523 } while (0)
2524
2525 /*
2526  * vlapic->ops handlers that utilize the APICv hardware assist described in
2527  * Chapter 29 of the Intel SDM.
2528  */
2529 static int
2530 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2531 {
2532         struct vlapic_vtx *vlapic_vtx;
2533         struct pir_desc *pir_desc;
2534         uint64_t mask;
2535         int idx, notify;
2536
2537         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2538         pir_desc = vlapic_vtx->pir_desc;
2539
2540         /*
2541          * Keep track of interrupt requests in the PIR descriptor. This is
2542          * because the virtual APIC page pointed to by the VMCS cannot be
2543          * modified if the vcpu is running.
2544          */
2545         idx = vector / 64;
2546         mask = 1UL << (vector % 64);
2547         atomic_set_long(&pir_desc->pir[idx], mask);
2548         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2549
2550         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2551             level, "vmx_set_intr_ready");
2552         return (notify);
2553 }
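
/*
 * For reference, a minimal standalone sketch (not part of this file) of the
 * posted interrupt request (PIR) indexing used above: the 256 possible
 * vectors map onto four 64-bit words, so vector / 64 selects the word and
 * vector % 64 the bit within it.
 */
#include <stdint.h>

static void
pir_set_vector(uint64_t pir[4], int vector)
{
	pir[vector / 64] |= 1ULL << (vector % 64);
}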
2554
2555 static int
2556 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2557 {
2558         struct vlapic_vtx *vlapic_vtx;
2559         struct pir_desc *pir_desc;
2560         struct LAPIC *lapic;
2561         uint64_t pending, pirval;
2562         uint32_t ppr, vpr;
2563         int i;
2564
2565         /*
2566          * This function is only expected to be called from the 'HLT' exit
2567          * handler, which does not care about the vector that is pending.
2568          */
2569         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2570
2571         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2572         pir_desc = vlapic_vtx->pir_desc;
2573
2574         pending = atomic_load_acq_long(&pir_desc->pending);
2575         if (!pending)
2576                 return (0);     /* common case */
2577
2578         /*
2579          * If there is an interrupt pending then it will be recognized only
2580          * if its priority is greater than the processor priority.
2581          *
2582          * Special case: if the processor priority is zero then any pending
2583          * interrupt will be recognized.
2584          */
2585         lapic = vlapic->apic_page;
2586         ppr = lapic->ppr & 0xf0;
2587         if (ppr == 0)
2588                 return (1);
2589
2590         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2591             lapic->ppr);
2592
2593         for (i = 3; i >= 0; i--) {
2594                 pirval = pir_desc->pir[i];
2595                 if (pirval != 0) {
2596                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2597                         return (vpr > ppr);
2598                 }
2599         }
2600         return (0);
2601 }
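
/*
 * For reference, a minimal standalone sketch (not part of this file) of the
 * priority comparison above: a pending interrupt is recognized only when the
 * priority class (vector & 0xf0) of the highest-numbered pending vector
 * exceeds the processor priority class, with a PPR class of zero accepting
 * anything.  A portable highest-set-bit loop stands in for flsl().
 */
#include <stdint.h>
#include <stdbool.h>

static bool
pir_intr_recognized(const uint64_t pir[4], uint32_t ppr)
{
	int i, bit;

	for (i = 3; i >= 0; i--) {
		if (pir[i] == 0)
			continue;
		if ((ppr & 0xf0) == 0)
			return (true);
		for (bit = 63; bit >= 0; bit--) {
			if (pir[i] & (1ULL << bit))
				break;
		}
		return (((i * 64 + bit) & 0xf0) > (ppr & 0xf0));
	}
	return (false);		/* nothing pending */
}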
2602
2603 static void
2604 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2605 {
2606
2607         panic("vmx_intr_accepted: not expected to be called");
2608 }
2609
2610 static void
2611 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2612 {
2613         struct vlapic_vtx *vlapic_vtx;
2614         struct vmx *vmx;
2615         struct vmcs *vmcs;
2616         uint64_t mask, val;
2617
2618         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2619         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2620             ("vmx_set_tmr: vcpu cannot be running"));
2621
2622         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2623         vmx = vlapic_vtx->vmx;
2624         vmcs = &vmx->vmcs[vlapic->vcpuid];
2625         mask = 1UL << (vector % 64);
2626
2627         VMPTRLD(vmcs);
2628         val = vmcs_read(VMCS_EOI_EXIT(vector));
2629         if (level)
2630                 val |= mask;
2631         else
2632                 val &= ~mask;
2633         vmcs_write(VMCS_EOI_EXIT(vector), val);
2634         VMCLEAR(vmcs);
2635 }
2636
2637 static void
2638 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2639 {
2640
2641         ipi_cpu(hostcpu, pirvec);
2642 }
2643
2644 /*
2645  * Transfer the pending interrupts in the PIR descriptor to the IRR
2646  * in the virtual APIC page.
2647  */
2648 static void
2649 vmx_inject_pir(struct vlapic *vlapic)
2650 {
2651         struct vlapic_vtx *vlapic_vtx;
2652         struct pir_desc *pir_desc;
2653         struct LAPIC *lapic;
2654         uint64_t val, pirval;
2655         int rvi, pirbase;
2656         uint16_t intr_status_old, intr_status_new;
2657
2658         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2659         pir_desc = vlapic_vtx->pir_desc;
2660         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2661                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2662                     "no posted interrupt pending");
2663                 return;
2664         }
2665
2666         pirval = 0;
2667         lapic = vlapic->apic_page;
2668
2669         val = atomic_readandclear_long(&pir_desc->pir[0]);
2670         if (val != 0) {
2671                 lapic->irr0 |= val;
2672                 lapic->irr1 |= val >> 32;
2673                 pirbase = 0;
2674                 pirval = val;
2675         }
2676
2677         val = atomic_readandclear_long(&pir_desc->pir[1]);
2678         if (val != 0) {
2679                 lapic->irr2 |= val;
2680                 lapic->irr3 |= val >> 32;
2681                 pirbase = 64;
2682                 pirval = val;
2683         }
2684
2685         val = atomic_readandclear_long(&pir_desc->pir[2]);
2686         if (val != 0) {
2687                 lapic->irr4 |= val;
2688                 lapic->irr5 |= val >> 32;
2689                 pirbase = 128;
2690                 pirval = val;
2691         }
2692
2693         val = atomic_readandclear_long(&pir_desc->pir[3]);
2694         if (val != 0) {
2695                 lapic->irr6 |= val;
2696                 lapic->irr7 |= val >> 32;
2697                 pirbase = 192;
2698                 pirval = val;
2699         }
2700         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2701
2702         /*
2703          * Update RVI so the processor can evaluate pending virtual
2704          * interrupts on VM-entry.
2705          */
2706         if (pirval != 0) {
2707                 rvi = pirbase + flsl(pirval) - 1;
2708                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2709                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2710                 if (intr_status_new > intr_status_old) {
2711                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2712                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2713                             "guest_intr_status changed from 0x%04x to 0x%04x",
2714                             intr_status_old, intr_status_new);
2715                 }
2716         }
2717 }
2718
2719 static struct vlapic *
2720 vmx_vlapic_init(void *arg, int vcpuid)
2721 {
2722         struct vmx *vmx;
2723         struct vlapic *vlapic;
2724         struct vlapic_vtx *vlapic_vtx;
2725         
2726         vmx = arg;
2727
2728         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2729         vlapic->vm = vmx->vm;
2730         vlapic->vcpuid = vcpuid;
2731         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2732
2733         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2734         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2735         vlapic_vtx->vmx = vmx;
2736
2737         if (virtual_interrupt_delivery) {
2738                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2739                 vlapic->ops.pending_intr = vmx_pending_intr;
2740                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2741                 vlapic->ops.set_tmr = vmx_set_tmr;
2742         }
2743
2744         if (posted_interrupts)
2745                 vlapic->ops.post_intr = vmx_post_intr;
2746
2747         vlapic_init(vlapic);
2748
2749         return (vlapic);
2750 }
2751
2752 static void
2753 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2754 {
2755
2756         vlapic_cleanup(vlapic);
2757         free(vlapic, M_VLAPIC);
2758 }
2759
2760 struct vmm_ops vmm_ops_intel = {
2761         vmx_init,
2762         vmx_cleanup,
2763         vmx_restore,
2764         vmx_vminit,
2765         vmx_run,
2766         vmx_vmcleanup,
2767         vmx_getreg,
2768         vmx_setreg,
2769         vmx_getdesc,
2770         vmx_setdesc,
2771         vmx_inject,
2772         vmx_getcap,
2773         vmx_setcap,
2774         ept_vmspace_alloc,
2775         ept_vmspace_free,
2776         vmx_vlapic_init,
2777         vmx_vlapic_cleanup,
2778 };