1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include <machine/vmm_dev.h>
54 #include <machine/vmm_instruction_emul.h>
55 #include "vmm_lapic.h"
56 #include "vmm_host.h"
57 #include "vmm_ioport.h"
58 #include "vmm_ipi.h"
59 #include "vmm_ktr.h"
60 #include "vmm_stat.h"
61 #include "vatpic.h"
62 #include "vlapic.h"
63 #include "vlapic_priv.h"
64
65 #include "ept.h"
66 #include "vmx_cpufunc.h"
67 #include "vmx.h"
68 #include "vmx_msr.h"
69 #include "x86.h"
70 #include "vmx_controls.h"
71
72 #define PINBASED_CTLS_ONE_SETTING                                       \
73         (PINBASED_EXTINT_EXITING        |                               \
74          PINBASED_NMI_EXITING           |                               \
75          PINBASED_VIRTUAL_NMI)
76 #define PINBASED_CTLS_ZERO_SETTING      0
77
78 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
79         (PROCBASED_INT_WINDOW_EXITING   |                               \
80          PROCBASED_NMI_WINDOW_EXITING)
81
82 #define PROCBASED_CTLS_ONE_SETTING                                      \
83         (PROCBASED_SECONDARY_CONTROLS   |                               \
84          PROCBASED_MWAIT_EXITING        |                               \
85          PROCBASED_MONITOR_EXITING      |                               \
86          PROCBASED_IO_EXITING           |                               \
87          PROCBASED_MSR_BITMAPS          |                               \
88          PROCBASED_CTLS_WINDOW_SETTING  |                               \
89          PROCBASED_CR8_LOAD_EXITING     |                               \
90          PROCBASED_CR8_STORE_EXITING)
91 #define PROCBASED_CTLS_ZERO_SETTING     \
92         (PROCBASED_CR3_LOAD_EXITING |   \
93         PROCBASED_CR3_STORE_EXITING |   \
94         PROCBASED_IO_BITMAPS)
95
96 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
97 #define PROCBASED_CTLS2_ZERO_SETTING    0
98
99 #define VM_EXIT_CTLS_ONE_SETTING                                        \
100         (VM_EXIT_HOST_LMA                       |                       \
101         VM_EXIT_SAVE_EFER                       |                       \
102         VM_EXIT_LOAD_EFER                       |                       \
103         VM_EXIT_ACKNOWLEDGE_INTERRUPT)
104
105 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
106
107 #define VM_ENTRY_CTLS_ONE_SETTING       (VM_ENTRY_LOAD_EFER)
108
109 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
110         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
111         VM_ENTRY_INTO_SMM                       |                       \
112         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
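
/*
 * Editor's note (not in the original source): in each pair above, the
 * *_ONE_SETTING mask names control bits that are required to be 1 and the
 * *_ZERO_SETTING mask names bits that are required to be 0.  vmx_init()
 * below feeds these masks to vmx_set_ctlreg() to validate them against the
 * VMX capability MSRs and to compute the pin-based/proc-based/exit/entry
 * control values that vmx_vminit() later writes into each vcpu's VMCS.
 */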
113
114 #define HANDLED         1
115 #define UNHANDLED       0
116
117 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
118 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
119
120 SYSCTL_DECL(_hw_vmm);
121 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
122
123 int vmxon_enabled[MAXCPU];
124 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
125
126 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
127 static uint32_t exit_ctls, entry_ctls;
128
129 static uint64_t cr0_ones_mask, cr0_zeros_mask;
130 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
131              &cr0_ones_mask, 0, NULL);
132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
133              &cr0_zeros_mask, 0, NULL);
134
135 static uint64_t cr4_ones_mask, cr4_zeros_mask;
136 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
137              &cr4_ones_mask, 0, NULL);
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
139              &cr4_zeros_mask, 0, NULL);
140
141 static int vmx_initialized;
142 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
143            &vmx_initialized, 0, "Intel VMX initialized");
144
145 /*
146  * Optional capabilities
147  */
148 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
149
150 static int cap_halt_exit;
151 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
152     "HLT triggers a VM-exit");
153
154 static int cap_pause_exit;
155 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
156     0, "PAUSE triggers a VM-exit");
157
158 static int cap_unrestricted_guest;
159 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
160     &cap_unrestricted_guest, 0, "Unrestricted guests");
161
162 static int cap_monitor_trap;
163 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
164     &cap_monitor_trap, 0, "Monitor trap flag");
165
166 static int cap_invpcid;
167 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
168     0, "Guests are allowed to use INVPCID");
169
170 static int virtual_interrupt_delivery;
171 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
172     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
173
174 static int posted_interrupts;
175 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
176     &posted_interrupts, 0, "APICv posted interrupt support");
177
178 static int pirvec;
179 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
180     &pirvec, 0, "APICv posted interrupt vector");
181
182 static struct unrhdr *vpid_unr;
183 static u_int vpid_alloc_failed;
184 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
185             &vpid_alloc_failed, 0, NULL);
186
187 /*
188  * Use the last page below 4GB as the APIC access address. This address is
189  * occupied by the boot firmware so it is guaranteed that it will not conflict
190  * with a page in system memory.
191  */
192 #define APIC_ACCESS_ADDRESS     0xFFFFF000
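
/*
 * Editor's note (not in the original source): when virtual interrupt
 * delivery is enabled, vmx_vminit() maps the guest physical page at
 * DEFAULT_APIC_BASE to APIC_ACCESS_ADDRESS via vm_map_mmio() and programs
 * the same value into VMCS_APIC_ACCESS for every vcpu.
 */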
193
194 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
195 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
196 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
197 static void vmx_inject_pir(struct vlapic *vlapic);
198
199 #ifdef KTR
200 static const char *
201 exit_reason_to_str(int reason)
202 {
203         static char reasonbuf[32];
204
205         switch (reason) {
206         case EXIT_REASON_EXCEPTION:
207                 return "exception";
208         case EXIT_REASON_EXT_INTR:
209                 return "extint";
210         case EXIT_REASON_TRIPLE_FAULT:
211                 return "triplefault";
212         case EXIT_REASON_INIT:
213                 return "init";
214         case EXIT_REASON_SIPI:
215                 return "sipi";
216         case EXIT_REASON_IO_SMI:
217                 return "iosmi";
218         case EXIT_REASON_SMI:
219                 return "smi";
220         case EXIT_REASON_INTR_WINDOW:
221                 return "intrwindow";
222         case EXIT_REASON_NMI_WINDOW:
223                 return "nmiwindow";
224         case EXIT_REASON_TASK_SWITCH:
225                 return "taskswitch";
226         case EXIT_REASON_CPUID:
227                 return "cpuid";
228         case EXIT_REASON_GETSEC:
229                 return "getsec";
230         case EXIT_REASON_HLT:
231                 return "hlt";
232         case EXIT_REASON_INVD:
233                 return "invd";
234         case EXIT_REASON_INVLPG:
235                 return "invlpg";
236         case EXIT_REASON_RDPMC:
237                 return "rdpmc";
238         case EXIT_REASON_RDTSC:
239                 return "rdtsc";
240         case EXIT_REASON_RSM:
241                 return "rsm";
242         case EXIT_REASON_VMCALL:
243                 return "vmcall";
244         case EXIT_REASON_VMCLEAR:
245                 return "vmclear";
246         case EXIT_REASON_VMLAUNCH:
247                 return "vmlaunch";
248         case EXIT_REASON_VMPTRLD:
249                 return "vmptrld";
250         case EXIT_REASON_VMPTRST:
251                 return "vmptrst";
252         case EXIT_REASON_VMREAD:
253                 return "vmread";
254         case EXIT_REASON_VMRESUME:
255                 return "vmresume";
256         case EXIT_REASON_VMWRITE:
257                 return "vmwrite";
258         case EXIT_REASON_VMXOFF:
259                 return "vmxoff";
260         case EXIT_REASON_VMXON:
261                 return "vmxon";
262         case EXIT_REASON_CR_ACCESS:
263                 return "craccess";
264         case EXIT_REASON_DR_ACCESS:
265                 return "draccess";
266         case EXIT_REASON_INOUT:
267                 return "inout";
268         case EXIT_REASON_RDMSR:
269                 return "rdmsr";
270         case EXIT_REASON_WRMSR:
271                 return "wrmsr";
272         case EXIT_REASON_INVAL_VMCS:
273                 return "invalvmcs";
274         case EXIT_REASON_INVAL_MSR:
275                 return "invalmsr";
276         case EXIT_REASON_MWAIT:
277                 return "mwait";
278         case EXIT_REASON_MTF:
279                 return "mtf";
280         case EXIT_REASON_MONITOR:
281                 return "monitor";
282         case EXIT_REASON_PAUSE:
283                 return "pause";
284         case EXIT_REASON_MCE_DURING_ENTRY:
285                 return "mce-during-entry";
286         case EXIT_REASON_TPR:
287                 return "tpr";
288         case EXIT_REASON_APIC_ACCESS:
289                 return "apic-access";
290         case EXIT_REASON_GDTR_IDTR:
291                 return "gdtridtr";
292         case EXIT_REASON_LDTR_TR:
293                 return "ldtrtr";
294         case EXIT_REASON_EPT_FAULT:
295                 return "eptfault";
296         case EXIT_REASON_EPT_MISCONFIG:
297                 return "eptmisconfig";
298         case EXIT_REASON_INVEPT:
299                 return "invept";
300         case EXIT_REASON_RDTSCP:
301                 return "rdtscp";
302         case EXIT_REASON_VMX_PREEMPT:
303                 return "vmxpreempt";
304         case EXIT_REASON_INVVPID:
305                 return "invvpid";
306         case EXIT_REASON_WBINVD:
307                 return "wbinvd";
308         case EXIT_REASON_XSETBV:
309                 return "xsetbv";
310         case EXIT_REASON_APIC_WRITE:
311                 return "apic-write";
312         default:
313                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
314                 return (reasonbuf);
315         }
316 }
317 #endif  /* KTR */
318
319 static int
320 vmx_allow_x2apic_msrs(struct vmx *vmx)
321 {
322         int i, error;
323
324         error = 0;
325
326         /*
327          * Allow readonly access to the following x2APIC MSRs from the guest.
328          */
329         error += guest_msr_ro(vmx, MSR_APIC_ID);
330         error += guest_msr_ro(vmx, MSR_APIC_VERSION);
331         error += guest_msr_ro(vmx, MSR_APIC_LDR);
332         error += guest_msr_ro(vmx, MSR_APIC_SVR);
333
334         for (i = 0; i < 8; i++)
335                 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
336
337         for (i = 0; i < 8; i++)
338                 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
339         
340         for (i = 0; i < 8; i++)
341                 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
342
343         error += guest_msr_ro(vmx, MSR_APIC_ESR);
344         error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
345         error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
346         error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
347         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
348         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
349         error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
350         error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
351         error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
352         error += guest_msr_ro(vmx, MSR_APIC_ICR);
353
354         /*
355          * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
356          *
357          * These registers get special treatment described in the section
358          * "Virtualizing MSR-Based APIC Accesses".
359          */
360         error += guest_msr_rw(vmx, MSR_APIC_TPR);
361         error += guest_msr_rw(vmx, MSR_APIC_EOI);
362         error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
363
364         return (error);
365 }
366
367 u_long
368 vmx_fix_cr0(u_long cr0)
369 {
370
371         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
372 }
373
374 u_long
375 vmx_fix_cr4(u_long cr4)
376 {
377
378         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
379 }
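
/*
 * Editor's note (not in the original source): the ones/zeros masks consumed
 * by vmx_fix_cr0() and vmx_fix_cr4() are computed in vmx_init() from the
 * MSR_VMX_CR{0,4}_FIXED{0,1} MSRs, roughly:
 *
 *	ones_mask  =  fixed0 &  fixed1;		bits that must be 1
 *	zeros_mask = ~fixed0 & ~fixed1;		bits that must be 0
 *
 * so these helpers force the must-be-1 bits on and the must-be-0 bits off
 * before a CR0/CR4 value is used in VMX operation.
 */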
380
381 static void
382 vpid_free(int vpid)
383 {
384         if (vpid < 0 || vpid > 0xffff)
385                 panic("vpid_free: invalid vpid %d", vpid);
386
387         /*
388          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
389          * the unit number allocator.
390          */
391
392         if (vpid > VM_MAXCPU)
393                 free_unr(vpid_unr, vpid);
394 }
395
396 static void
397 vpid_alloc(uint16_t *vpid, int num)
398 {
399         int i, x;
400
401         if (num <= 0 || num > VM_MAXCPU)
402                 panic("invalid number of vpids requested: %d", num);
403
404         /*
405          * If the "enable vpid" execution control is not enabled then the
406          * VPID is required to be 0 for all vcpus.
407          */
408         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
409                 for (i = 0; i < num; i++)
410                         vpid[i] = 0;
411                 return;
412         }
413
414         /*
415          * Allocate a unique VPID for each vcpu from the unit number allocator.
416          */
417         for (i = 0; i < num; i++) {
418                 x = alloc_unr(vpid_unr);
419                 if (x == -1)
420                         break;
421                 else
422                         vpid[i] = x;
423         }
424
425         if (i < num) {
426                 atomic_add_int(&vpid_alloc_failed, 1);
427
428                 /*
429                  * If the unit number allocator does not have enough unique
430                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
431                  *
432                  * These VPIDs are not unique across VMs but this does not
433                  * affect correctness because the combined mappings are also
434                  * tagged with the EP4TA which is unique for each VM.
435                  *
436                  * It is still sub-optimal because the invvpid will invalidate
437                  * combined mappings for a particular VPID across all EP4TAs.
438                  */
439                 while (i-- > 0)
440                         vpid_free(vpid[i]);
441
442                 for (i = 0; i < num; i++)
443                         vpid[i] = i + 1;
444         }
445 }
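
/*
 * Editor's note (not in the original source): vmx_vminit() calls
 * vpid_alloc(vpid, VM_MAXCPU) once per VM, writes each value into the
 * per-vcpu VMCS_VPID field and caches it in vmx->state[i].vpid, which
 * vmx_invvpid() later uses to invalidate TLB entries tagged with that
 * vcpu's VPID.
 */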
446
447 static void
448 vpid_init(void)
449 {
450         /*
451          * VPID 0 is required when the "enable VPID" execution control is
452          * disabled.
453          *
454          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
455          * unit number allocator does not have sufficient unique VPIDs to
456          * satisfy the allocation.
457          *
458          * The remaining VPIDs are managed by the unit number allocator.
459          */
460         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
461 }
462
463 static void
464 vmx_disable(void *arg __unused)
465 {
466         struct invvpid_desc invvpid_desc = { 0 };
467         struct invept_desc invept_desc = { 0 };
468
469         if (vmxon_enabled[curcpu]) {
470                 /*
471                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
472                  *
473                  * VMXON and VMXOFF are not required to invalidate any TLB
474                  * caching structures, so invalidate everything explicitly here
475                  * to avoid retaining cached information across VMX episodes.
476                  */
477                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
478                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
479                 vmxoff();
480         }
481         load_cr4(rcr4() & ~CR4_VMXE);
482 }
483
484 static int
485 vmx_cleanup(void)
486 {
487         
488         if (pirvec != 0)
489                 vmm_ipi_free(pirvec);
490
491         if (vpid_unr != NULL) {
492                 delete_unrhdr(vpid_unr);
493                 vpid_unr = NULL;
494         }
495
496         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
497
498         return (0);
499 }
500
501 static void
502 vmx_enable(void *arg __unused)
503 {
504         int error;
505         uint64_t feature_control;
506
507         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
508         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
509             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
510                 wrmsr(MSR_IA32_FEATURE_CONTROL,
511                     feature_control | IA32_FEATURE_CONTROL_VMX_EN |
512                     IA32_FEATURE_CONTROL_LOCK);
513         }
514
515         load_cr4(rcr4() | CR4_VMXE);
516
517         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
518         error = vmxon(vmxon_region[curcpu]);
519         if (error == 0)
520                 vmxon_enabled[curcpu] = 1;
521 }
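
/*
 * Editor's note (not in the original source): the store of vmx_revision()
 * above satisfies the requirement that the first 32 bits of the VMXON
 * region hold the VMCS revision identifier; the region itself is the
 * page-aligned per-cpu vmxon_region[] buffer declared near the top of
 * this file.
 */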
522
523 static void
524 vmx_restore(void)
525 {
526
527         if (vmxon_enabled[curcpu])
528                 vmxon(vmxon_region[curcpu]);
529 }
530
531 static int
532 vmx_init(int ipinum)
533 {
534         int error, use_tpr_shadow;
535         uint64_t basic, fixed0, fixed1, feature_control;
536         uint32_t tmp, procbased2_vid_bits;
537
538         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
539         if (!(cpu_feature2 & CPUID2_VMX)) {
540                 printf("vmx_init: processor does not support VMX operation\n");
541                 return (ENXIO);
542         }
543
544         /*
545          * Verify that the BIOS has not locked MSR_IA32_FEATURE_CONTROL with
546          * VMX disabled (the lock and VMXON enable bits are bits 0 and 2).
547          */
548         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
549         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
550             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
551                 printf("vmx_init: VMX operation disabled by BIOS\n");
552                 return (ENXIO);
553         }
554
555         /*
556          * Verify capabilities MSR_VMX_BASIC:
557          * - bit 54 indicates support for INS/OUTS decoding
558          */
559         basic = rdmsr(MSR_VMX_BASIC);
560         if ((basic & (1UL << 54)) == 0) {
561                 printf("vmx_init: processor does not support desired basic "
562                     "capabilities\n");
563                 return (EINVAL);
564         }
565
566         /* Check support for primary processor-based VM-execution controls */
567         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
568                                MSR_VMX_TRUE_PROCBASED_CTLS,
569                                PROCBASED_CTLS_ONE_SETTING,
570                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
571         if (error) {
572                 printf("vmx_init: processor does not support desired primary "
573                        "processor-based controls\n");
574                 return (error);
575         }
576
577         /* Clear the processor-based ctl bits that are set on demand */
578         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
579
580         /* Check support for secondary processor-based VM-execution controls */
581         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
582                                MSR_VMX_PROCBASED_CTLS2,
583                                PROCBASED_CTLS2_ONE_SETTING,
584                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
585         if (error) {
586                 printf("vmx_init: processor does not support desired secondary "
587                        "processor-based controls\n");
588                 return (error);
589         }
590
591         /* Check support for VPID */
592         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
593                                PROCBASED2_ENABLE_VPID, 0, &tmp);
594         if (error == 0)
595                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
596
597         /* Check support for pin-based VM-execution controls */
598         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
599                                MSR_VMX_TRUE_PINBASED_CTLS,
600                                PINBASED_CTLS_ONE_SETTING,
601                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
602         if (error) {
603                 printf("vmx_init: processor does not support desired "
604                        "pin-based controls\n");
605                 return (error);
606         }
607
608         /* Check support for VM-exit controls */
609         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
610                                VM_EXIT_CTLS_ONE_SETTING,
611                                VM_EXIT_CTLS_ZERO_SETTING,
612                                &exit_ctls);
613         if (error) {
614                 printf("vmx_init: processor does not support desired "
615                     "exit controls\n");
616                 return (error);
617         }
618
619         /* Check support for VM-entry controls */
620         error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
621             VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
622             &entry_ctls);
623         if (error) {
624                 printf("vmx_init: processor does not support desired "
625                     "entry controls\n");
626                 return (error);
627         }
628
629         /*
630          * Check support for optional features by testing them
631          * as individual bits
632          */
633         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
634                                         MSR_VMX_TRUE_PROCBASED_CTLS,
635                                         PROCBASED_HLT_EXITING, 0,
636                                         &tmp) == 0);
637
638         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
639                                         MSR_VMX_PROCBASED_CTLS,
640                                         PROCBASED_MTF, 0,
641                                         &tmp) == 0);
642
643         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
644                                          MSR_VMX_TRUE_PROCBASED_CTLS,
645                                          PROCBASED_PAUSE_EXITING, 0,
646                                          &tmp) == 0);
647
648         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
649                                         MSR_VMX_PROCBASED_CTLS2,
650                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
651                                         &tmp) == 0);
652
653         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
654             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
655             &tmp) == 0);
656
657         /*
658          * Check support for virtual interrupt delivery.
659          */
660         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
661             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
662             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
663             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
664
665         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
666             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
667             &tmp) == 0);
668
669         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
670             procbased2_vid_bits, 0, &tmp);
671         if (error == 0 && use_tpr_shadow) {
672                 virtual_interrupt_delivery = 1;
673                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
674                     &virtual_interrupt_delivery);
675         }
676
677         if (virtual_interrupt_delivery) {
678                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
679                 procbased_ctls2 |= procbased2_vid_bits;
680                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
681
682                 /*
683                  * No need to emulate accesses to %CR8 if virtual
684                  * interrupt delivery is enabled.
685                  */
686                 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
687                 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
688
689                 /*
690                  * Check for Posted Interrupts only if Virtual Interrupt
691                  * Delivery is enabled.
692                  */
693                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
694                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
695                     &tmp);
696                 if (error == 0) {
697                         pirvec = vmm_ipi_alloc();
698                         if (pirvec == 0) {
699                                 if (bootverbose) {
700                                         printf("vmx_init: unable to allocate "
701                                             "posted interrupt vector\n");
702                                 }
703                         } else {
704                                 posted_interrupts = 1;
705                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
706                                     &posted_interrupts);
707                         }
708                 }
709         }
710
711         if (posted_interrupts)
712                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
713
714         /* Initialize EPT */
715         error = ept_init(ipinum);
716         if (error) {
717                 printf("vmx_init: ept initialization failed (%d)\n", error);
718                 return (error);
719         }
720
721         /*
722          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
723          */
724         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
725         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
726         cr0_ones_mask = fixed0 & fixed1;
727         cr0_zeros_mask = ~fixed0 & ~fixed1;
728
729         /*
730          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
731          * if unrestricted guest execution is allowed.
732          */
733         if (cap_unrestricted_guest)
734                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
735
736         /*
737          * Do not allow the guest to set CR0_NW or CR0_CD.
738          */
739         cr0_zeros_mask |= (CR0_NW | CR0_CD);
740
741         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
742         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
743         cr4_ones_mask = fixed0 & fixed1;
744         cr4_zeros_mask = ~fixed0 & ~fixed1;
745
746         vpid_init();
747
748         vmx_msr_init();
749
750         /* enable VMX operation */
751         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
752
753         vmx_initialized = 1;
754
755         return (0);
756 }
757
758 static void
759 vmx_trigger_hostintr(int vector)
760 {
761         uintptr_t func;
762         struct gate_descriptor *gd;
763
764         gd = &idt[vector];
765
766         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
767             "invalid vector %d", vector));
768         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
769             vector));
770         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
771             "has invalid type %d", vector, gd->gd_type));
772         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
773             "has invalid dpl %d", vector, gd->gd_dpl));
774         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
775             "for vector %d has invalid selector %d", vector, gd->gd_selector));
776         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
777             "IST %d", vector, gd->gd_ist));
778
779         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
780         vmx_call_isr(func);
781 }
782
783 static int
784 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
785 {
786         int error, mask_ident, shadow_ident;
787         uint64_t mask_value;
788
789         if (which != 0 && which != 4)
790                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
791
792         if (which == 0) {
793                 mask_ident = VMCS_CR0_MASK;
794                 mask_value = cr0_ones_mask | cr0_zeros_mask;
795                 shadow_ident = VMCS_CR0_SHADOW;
796         } else {
797                 mask_ident = VMCS_CR4_MASK;
798                 mask_value = cr4_ones_mask | cr4_zeros_mask;
799                 shadow_ident = VMCS_CR4_SHADOW;
800         }
801
802         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
803         if (error)
804                 return (error);
805
806         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
807         if (error)
808                 return (error);
809
810         return (0);
811 }
812 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
813 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
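
/*
 * Editor's note (not in the original source): programming VMCS_CR0_MASK and
 * VMCS_CR4_MASK with the combined ones|zeros masks means a guest write that
 * attempts to give a masked bit a value different from its read shadow causes
 * a CR-access VM-exit, while guest reads of the masked bits observe the
 * read-shadow values initialized here (the power-on values passed in by
 * vmx_vminit()).
 */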
814
815 static void *
816 vmx_vminit(struct vm *vm, pmap_t pmap)
817 {
818         uint16_t vpid[VM_MAXCPU];
819         int i, error;
820         struct vmx *vmx;
821         struct vmcs *vmcs;
822         uint32_t exc_bitmap;
823
824         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
825         if ((uintptr_t)vmx & PAGE_MASK) {
826                 panic("malloc of struct vmx not aligned on %d byte boundary",
827                       PAGE_SIZE);
828         }
829         vmx->vm = vm;
830
831         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
832
833         /*
834          * Clean up EPTP-tagged guest physical and combined mappings
835          *
836          * VMX transitions are not required to invalidate any guest physical
837          * mappings. So, it may be possible for stale guest physical mappings
838          * to be present in the processor TLBs.
839          *
840          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
841          */
842         ept_invalidate_mappings(vmx->eptp);
843
844         msr_bitmap_initialize(vmx->msr_bitmap);
845
846         /*
847          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
848          * The guest FSBASE and GSBASE are saved and restored during
849          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
850          * always restored from the vmcs host state area on vm-exit.
851          *
852          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
853          * how they are saved/restored, so they can be directly accessed by
854          * the guest.
855          *
856          * MSR_EFER is saved and restored in the guest VMCS area on a
857          * VM exit and entry respectively. It is also restored from the
858          * host VMCS area on a VM exit.
859          *
860          * The TSC MSR is exposed read-only. Writes are disallowed as
861          * they would impact the host TSC.  If the guest does a write,
862          * the "use TSC offsetting" execution control is enabled and the
863          * difference between the host TSC and the guest TSC is written
864          * into the TSC offset in the VMCS.
865          */
866         if (guest_msr_rw(vmx, MSR_GSBASE) ||
867             guest_msr_rw(vmx, MSR_FSBASE) ||
868             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
869             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
870             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
871             guest_msr_rw(vmx, MSR_EFER) ||
872             guest_msr_ro(vmx, MSR_TSC))
873                 panic("vmx_vminit: error setting guest msr access");
874
875         vpid_alloc(vpid, VM_MAXCPU);
876
877         if (virtual_interrupt_delivery) {
878                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
879                     APIC_ACCESS_ADDRESS);
880                 /* XXX this should really return an error to the caller */
881                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
882         }
883
884         for (i = 0; i < VM_MAXCPU; i++) {
885                 vmcs = &vmx->vmcs[i];
886                 vmcs->identifier = vmx_revision();
887                 error = vmclear(vmcs);
888                 if (error != 0) {
889                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
890                               error, i);
891                 }
892
893                 vmx_msr_guest_init(vmx, i);
894
895                 error = vmcs_init(vmcs);
896                 KASSERT(error == 0, ("vmcs_init error %d", error));
897
898                 VMPTRLD(vmcs);
899                 error = 0;
900                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
901                 error += vmwrite(VMCS_EPTP, vmx->eptp);
902                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
903                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
904                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
905                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
906                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
907                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
908                 error += vmwrite(VMCS_VPID, vpid[i]);
909
910                 /* exception bitmap */
911                 if (vcpu_trace_exceptions(vm, i))
912                         exc_bitmap = 0xffffffff;
913                 else
914                         exc_bitmap = 1 << IDT_MC;
915                 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
916
917                 if (virtual_interrupt_delivery) {
918                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
919                         error += vmwrite(VMCS_VIRTUAL_APIC,
920                             vtophys(&vmx->apic_page[i]));
921                         error += vmwrite(VMCS_EOI_EXIT0, 0);
922                         error += vmwrite(VMCS_EOI_EXIT1, 0);
923                         error += vmwrite(VMCS_EOI_EXIT2, 0);
924                         error += vmwrite(VMCS_EOI_EXIT3, 0);
925                 }
926                 if (posted_interrupts) {
927                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
928                         error += vmwrite(VMCS_PIR_DESC,
929                             vtophys(&vmx->pir_desc[i]));
930                 }
931                 VMCLEAR(vmcs);
932                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
933
934                 vmx->cap[i].set = 0;
935                 vmx->cap[i].proc_ctls = procbased_ctls;
936                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
937
938                 vmx->state[i].nextrip = ~0;
939                 vmx->state[i].lastcpu = NOCPU;
940                 vmx->state[i].vpid = vpid[i];
941
942                 /*
943                  * Set up the CR0/4 shadows, and init the read shadow
944                  * to the power-on register value from the Intel Sys Arch.
945                  *  CR0 - 0x60000010
946                  *  CR4 - 0
947                  */
948                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
949                 if (error != 0)
950                         panic("vmx_setup_cr0_shadow %d", error);
951
952                 error = vmx_setup_cr4_shadow(vmcs, 0);
953                 if (error != 0)
954                         panic("vmx_setup_cr4_shadow %d", error);
955
956                 vmx->ctx[i].pmap = pmap;
957         }
958
959         return (vmx);
960 }
961
962 static int
963 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
964 {
965         int handled, func;
966         
967         func = vmxctx->guest_rax;
968
969         handled = x86_emulate_cpuid(vm, vcpu,
970                                     (uint32_t*)(&vmxctx->guest_rax),
971                                     (uint32_t*)(&vmxctx->guest_rbx),
972                                     (uint32_t*)(&vmxctx->guest_rcx),
973                                     (uint32_t*)(&vmxctx->guest_rdx));
974         return (handled);
975 }
976
977 static __inline void
978 vmx_run_trace(struct vmx *vmx, int vcpu)
979 {
980 #ifdef KTR
981         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
982 #endif
983 }
984
985 static __inline void
986 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
987                int handled)
988 {
989 #ifdef KTR
990         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
991                  handled ? "handled" : "unhandled",
992                  exit_reason_to_str(exit_reason), rip);
993 #endif
994 }
995
996 static __inline void
997 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
998 {
999 #ifdef KTR
1000         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
1001 #endif
1002 }
1003
1004 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
1005 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
1006
1007 /*
1008  * Invalidate guest mappings identified by its vpid from the TLB.
1009  */
1010 static __inline void
1011 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
1012 {
1013         struct vmxstate *vmxstate;
1014         struct invvpid_desc invvpid_desc;
1015
1016         vmxstate = &vmx->state[vcpu];
1017         if (vmxstate->vpid == 0)
1018                 return;
1019
1020         if (!running) {
1021                 /*
1022                  * Set the 'lastcpu' to an invalid host cpu.
1023                  *
1024                  * This will invalidate TLB entries tagged with the vcpu's
1025                  * vpid the next time it runs via vmx_set_pcpu_defaults().
1026                  */
1027                 vmxstate->lastcpu = NOCPU;
1028                 return;
1029         }
1030
1031         KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
1032             "critical section", __func__, vcpu));
1033
1034         /*
1035          * Invalidate all mappings tagged with 'vpid'
1036          *
1037          * We do this because this vcpu was executing on a different host
1038          * cpu when it last ran. We do not track whether it invalidated
1039          * mappings associated with its 'vpid' during that run. So we must
1040          * assume that the mappings associated with 'vpid' on 'curcpu' are
1041          * stale and invalidate them.
1042          *
1043          * Note that we incur this penalty only when the scheduler chooses to
1044          * move the thread associated with this vcpu between host cpus.
1045          *
1046          * Note also that this will invalidate mappings tagged with 'vpid'
1047          * for "all" EP4TAs.
1048          */
1049         if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
1050                 invvpid_desc._res1 = 0;
1051                 invvpid_desc._res2 = 0;
1052                 invvpid_desc.vpid = vmxstate->vpid;
1053                 invvpid_desc.linear_addr = 0;
1054                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
1055                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
1056         } else {
1057                 /*
1058                  * The invvpid can be skipped if an invept is going to
1059                  * be performed before entering the guest. The invept
1060                  * will invalidate combined mappings tagged with
1061                  * 'vmx->eptp' for all vpids.
1062                  */
1063                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1064         }
1065 }
1066
1067 static void
1068 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
1069 {
1070         struct vmxstate *vmxstate;
1071
1072         vmxstate = &vmx->state[vcpu];
1073         if (vmxstate->lastcpu == curcpu)
1074                 return;
1075
1076         vmxstate->lastcpu = curcpu;
1077
1078         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
1079
1080         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
1081         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
1082         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
1083         vmx_invvpid(vmx, vcpu, pmap, 1);
1084 }
1085
1086 /*
1087  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1088  */
1089 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1090
1091 static void __inline
1092 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1093 {
1094
1095         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1096                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1097                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1098                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1099         }
1100 }
1101
1102 static void __inline
1103 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1104 {
1105
1106         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1107             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1108         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1109         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1110         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1111 }
1112
1113 static void __inline
1114 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1115 {
1116
1117         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1118                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1119                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1120                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1121         }
1122 }
1123
1124 static void __inline
1125 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1126 {
1127
1128         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1129             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1130         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1131         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1132         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1133 }
1134
1135 int
1136 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
1137 {
1138         int error;
1139
1140         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
1141                 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
1142                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1143                 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
1144         }
1145
1146         error = vmwrite(VMCS_TSC_OFFSET, offset);
1147
1148         return (error);
1149 }
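
/*
 * Editor's note (not in the original source): this is the path described by
 * the "use TSC offsetting" comment in vmx_vminit(); once PROCBASED_TSC_OFFSET
 * is set, guest TSC reads observe the host TSC plus the value written into
 * VMCS_TSC_OFFSET above.
 */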
1150
1151 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1152                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1153 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1154                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1155
1156 static void
1157 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1158 {
1159         uint32_t gi, info;
1160
1161         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1162         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1163             "interruptibility-state %#x", gi));
1164
1165         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1166         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1167             "VM-entry interruption information %#x", info));
1168
1169         /*
1170          * Inject the virtual NMI. The vector must be the NMI IDT entry
1171          * or the VMCS entry check will fail.
1172          */
1173         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1174         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1175
1176         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1177
1178         /* Clear the request */
1179         vm_nmi_clear(vmx->vm, vcpu);
1180 }
1181
1182 static void
1183 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
1184     uint64_t guestrip)
1185 {
1186         int vector, need_nmi_exiting, extint_pending;
1187         uint64_t rflags, entryinfo;
1188         uint32_t gi, info;
1189
1190         if (vmx->state[vcpu].nextrip != guestrip) {
1191                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1192                 if (gi & HWINTR_BLOCKING) {
1193                         VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
1194                             "cleared due to rip change: %#lx/%#lx",
1195                             vmx->state[vcpu].nextrip, guestrip);
1196                         gi &= ~HWINTR_BLOCKING;
1197                         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1198                 }
1199         }
1200
1201         if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
1202                 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
1203                     "intinfo is not valid: %#lx", __func__, entryinfo));
1204
1205                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1206                 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
1207                      "pending exception: %#lx/%#x", __func__, entryinfo, info));
1208
1209                 info = entryinfo;
1210                 vector = info & 0xff;
1211                 if (vector == IDT_BP || vector == IDT_OF) {
1212                         /*
1213                          * VT-x requires #BP and #OF to be injected as software
1214                          * exceptions.
1215                          */
1216                         info &= ~VMCS_INTR_T_MASK;
1217                         info |= VMCS_INTR_T_SWEXCEPTION;
1218                 }
1219
1220                 if (info & VMCS_INTR_DEL_ERRCODE)
1221                         vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
1222
1223                 vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1224         }
1225
1226         if (vm_nmi_pending(vmx->vm, vcpu)) {
1227                 /*
1228                  * If there are no conditions blocking NMI injection then
1229                  * inject it directly here otherwise enable "NMI window
1230                  * exiting" to inject it as soon as we can.
1231                  *
1232                  * We also check for STI_BLOCKING because some implementations
1233                  * don't allow NMI injection in this case. If we are running
1234                  * on a processor that doesn't have this restriction it will
1235                  * immediately exit and the NMI will be injected in the
1236                  * "NMI window exiting" handler.
1237                  */
1238                 need_nmi_exiting = 1;
1239                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1240                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1241                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1242                         if ((info & VMCS_INTR_VALID) == 0) {
1243                                 vmx_inject_nmi(vmx, vcpu);
1244                                 need_nmi_exiting = 0;
1245                         } else {
1246                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1247                                     "due to VM-entry intr info %#x", info);
1248                         }
1249                 } else {
1250                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1251                             "Guest Interruptibility-state %#x", gi);
1252                 }
1253
1254                 if (need_nmi_exiting)
1255                         vmx_set_nmi_window_exiting(vmx, vcpu);
1256         }
1257
1258         extint_pending = vm_extint_pending(vmx->vm, vcpu);
1259
1260         if (!extint_pending && virtual_interrupt_delivery) {
1261                 vmx_inject_pir(vlapic);
1262                 return;
1263         }
1264
1265         /*
1266          * If interrupt-window exiting is already in effect then don't bother
1267          * checking for pending interrupts. This is just an optimization and
1268          * not needed for correctness.
1269          */
1270         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1271                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1272                     "pending int_window_exiting");
1273                 return;
1274         }
1275
1276         if (!extint_pending) {
1277                 /* Ask the local apic for a vector to inject */
1278                 if (!vlapic_pending_intr(vlapic, &vector))
1279                         return;
1280
1281                 /*
1282                  * From the Intel SDM, Volume 3, Section "Maskable
1283                  * Hardware Interrupts":
1284                  * - maskable interrupt vectors [16,255] can be delivered
1285                  *   through the local APIC.
1286                 */
1287                 KASSERT(vector >= 16 && vector <= 255,
1288                     ("invalid vector %d from local APIC", vector));
1289         } else {
1290                 /* Ask the legacy pic for a vector to inject */
1291                 vatpic_pending_intr(vmx->vm, &vector);
1292
1293                 /*
1294                  * From the Intel SDM, Volume 3, Section "Maskable
1295                  * Hardware Interrupts":
1296                  * - maskable interrupt vectors [0,255] can be delivered
1297                  *   through the INTR pin.
1298                  */
1299                 KASSERT(vector >= 0 && vector <= 255,
1300                     ("invalid vector %d from INTR", vector));
1301         }
1302
1303         /* Check RFLAGS.IF and the interruptibility state of the guest */
1304         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1305         if ((rflags & PSL_I) == 0) {
1306                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1307                     "rflags %#lx", vector, rflags);
1308                 goto cantinject;
1309         }
1310
1311         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1312         if (gi & HWINTR_BLOCKING) {
1313                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1314                     "Guest Interruptibility-state %#x", vector, gi);
1315                 goto cantinject;
1316         }
1317
1318         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1319         if (info & VMCS_INTR_VALID) {
1320                 /*
1321                  * This is expected and could happen for multiple reasons:
1322                  * - A vectoring VM-entry was aborted due to astpending
1323                  * - A VM-exit happened during event injection.
1324                  * - An exception was injected above.
1325                  * - An NMI was injected above or after "NMI window exiting"
1326                  */
1327                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1328                     "VM-entry intr info %#x", vector, info);
1329                 goto cantinject;
1330         }
1331
1332         /* Inject the interrupt */
1333         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1334         info |= vector;
1335         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1336
1337         if (!extint_pending) {
1338                 /* Update the Local APIC ISR */
1339                 vlapic_intr_accepted(vlapic, vector);
1340         } else {
1341                 vm_extint_clear(vmx->vm, vcpu);
1342                 vatpic_intr_accepted(vmx->vm, vector);
1343
1344                 /*
1345                  * After we accepted the current ExtINT the PIC may
1346                  * have posted another one.  If that is the case, set
1347                  * the Interrupt Window Exiting execution control so
1348                  * we can inject that one too.
1349                  *
1350                  * Also, interrupt window exiting allows us to inject any
1351                  * pending APIC vector that was preempted by the ExtINT
1352                  * as soon as possible. This applies both for the software
1353                  * emulated vlapic and the hardware assisted virtual APIC.
1354                  */
1355                 vmx_set_int_window_exiting(vmx, vcpu);
1356         }
1357
1358         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1359
1360         return;
1361
1362 cantinject:
1363         /*
1364          * Set the Interrupt Window Exiting execution control so we can inject
1365          * the interrupt as soon as the blocking condition goes away.
1366          */
1367         vmx_set_int_window_exiting(vmx, vcpu);
1368 }
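
/*
 * Editor's note (not in the original source): the injection order implemented
 * above is (1) a previously recorded event from vm_entry_intinfo(), (2) a
 * pending NMI, and (3) an ExtINT from the virtual PIC or a vector from the
 * local APIC, with NMI/interrupt window exiting armed whenever the event
 * cannot be injected immediately.
 */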
1369
1370 /*
1371  * If the Virtual NMIs execution control is '1' then the logical processor
1372  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1373  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1374  * virtual-NMI blocking.
1375  *
1376  * This unblocking occurs even if the IRET causes a fault. In this case the
1377  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1378  */
1379 static void
1380 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1381 {
1382         uint32_t gi;
1383
1384         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1385         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1386         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1387         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1388 }
1389
1390 static void
1391 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1392 {
1393         uint32_t gi;
1394
1395         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1396         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1397         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1398         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1399 }
1400
1401 static void
1402 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
1403 {
1404         uint32_t gi;
1405
1406         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1407         KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
1408             ("NMI blocking is not in effect %#x", gi));
1409 }
1410
1411 static int
1412 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1413 {
1414         struct vmxctx *vmxctx;
1415         uint64_t xcrval;
1416         const struct xsave_limits *limits;
1417
1418         vmxctx = &vmx->ctx[vcpu];
1419         limits = vmm_get_xsave_limits();
1420
1421         /*
1422          * Note that the processor raises a GP# fault on its own if
1423          * xsetbv is executed for CPL != 0, so we do not have to
1424          * emulate that fault here.
1425          */
1426
1427         /* Only xcr0 is supported. */
1428         if (vmxctx->guest_rcx != 0) {
1429                 vm_inject_gp(vmx->vm, vcpu);
1430                 return (HANDLED);
1431         }
1432
1433         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
1434         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
1435                 vm_inject_ud(vmx->vm, vcpu);
1436                 return (HANDLED);
1437         }
1438
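             /* XSETBV passes the new XCR value in %edx:%eax. */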
1439         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
1440         if ((xcrval & ~limits->xcr0_allowed) != 0) {
1441                 vm_inject_gp(vmx->vm, vcpu);
1442                 return (HANDLED);
1443         }
1444
1445         if (!(xcrval & XFEATURE_ENABLED_X87)) {
1446                 vm_inject_gp(vmx->vm, vcpu);
1447                 return (HANDLED);
1448         }
1449
1450         /* AVX (YMM_Hi128) requires SSE. */
1451         if (xcrval & XFEATURE_ENABLED_AVX &&
1452             (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
1453                 vm_inject_gp(vmx->vm, vcpu);
1454                 return (HANDLED);
1455         }
1456
1457         /*
1458          * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
1459          * ZMM_Hi256, and Hi16_ZMM.
1460          */
1461         if (xcrval & XFEATURE_AVX512 &&
1462             (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
1463             (XFEATURE_AVX512 | XFEATURE_AVX)) {
1464                 vm_inject_gp(vmx->vm, vcpu);
1465                 return (HANDLED);
1466         }
1467
1468         /*
1469          * Intel MPX requires both bound register state flags to be
1470          * set.
1471          */
1472         if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
1473             ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
1474                 vm_inject_gp(vmx->vm, vcpu);
1475                 return (HANDLED);
1476         }
1477
1478         /*
1479          * This runs "inside" vmrun() with the guest's FPU state, so
1480          * modifying xcr0 directly modifies the guest's xcr0, not the
1481          * host's.
1482          */
1483         load_xcr(0, xcrval);
1484         return (HANDLED);
1485 }
1486
1487 static uint64_t
1488 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
1489 {
1490         const struct vmxctx *vmxctx;
1491
1492         vmxctx = &vmx->ctx[vcpu];
1493
1494         switch (ident) {
1495         case 0:
1496                 return (vmxctx->guest_rax);
1497         case 1:
1498                 return (vmxctx->guest_rcx);
1499         case 2:
1500                 return (vmxctx->guest_rdx);
1501         case 3:
1502                 return (vmxctx->guest_rbx);
1503         case 4:
1504                 return (vmcs_read(VMCS_GUEST_RSP));
1505         case 5:
1506                 return (vmxctx->guest_rbp);
1507         case 6:
1508                 return (vmxctx->guest_rsi);
1509         case 7:
1510                 return (vmxctx->guest_rdi);
1511         case 8:
1512                 return (vmxctx->guest_r8);
1513         case 9:
1514                 return (vmxctx->guest_r9);
1515         case 10:
1516                 return (vmxctx->guest_r10);
1517         case 11:
1518                 return (vmxctx->guest_r11);
1519         case 12:
1520                 return (vmxctx->guest_r12);
1521         case 13:
1522                 return (vmxctx->guest_r13);
1523         case 14:
1524                 return (vmxctx->guest_r14);
1525         case 15:
1526                 return (vmxctx->guest_r15);
1527         default:
1528                 panic("invalid vmx register %d", ident);
1529         }
1530 }
1531
1532 static void
1533 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
1534 {
1535         struct vmxctx *vmxctx;
1536
1537         vmxctx = &vmx->ctx[vcpu];
1538
1539         switch (ident) {
1540         case 0:
1541                 vmxctx->guest_rax = regval;
1542                 break;
1543         case 1:
1544                 vmxctx->guest_rcx = regval;
1545                 break;
1546         case 2:
1547                 vmxctx->guest_rdx = regval;
1548                 break;
1549         case 3:
1550                 vmxctx->guest_rbx = regval;
1551                 break;
1552         case 4:
1553                 vmcs_write(VMCS_GUEST_RSP, regval);
1554                 break;
1555         case 5:
1556                 vmxctx->guest_rbp = regval;
1557                 break;
1558         case 6:
1559                 vmxctx->guest_rsi = regval;
1560                 break;
1561         case 7:
1562                 vmxctx->guest_rdi = regval;
1563                 break;
1564         case 8:
1565                 vmxctx->guest_r8 = regval;
1566                 break;
1567         case 9:
1568                 vmxctx->guest_r9 = regval;
1569                 break;
1570         case 10:
1571                 vmxctx->guest_r10 = regval;
1572                 break;
1573         case 11:
1574                 vmxctx->guest_r11 = regval;
1575                 break;
1576         case 12:
1577                 vmxctx->guest_r12 = regval;
1578                 break;
1579         case 13:
1580                 vmxctx->guest_r13 = regval;
1581                 break;
1582         case 14:
1583                 vmxctx->guest_r14 = regval;
1584                 break;
1585         case 15:
1586                 vmxctx->guest_r15 = regval;
1587                 break;
1588         default:
1589                 panic("invalid vmx register %d", ident);
1590         }
1591 }
1592
1593 static int
1594 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1595 {
1596         uint64_t crval, regval;
1597
1598         /* We only handle mov to %cr0 at this time */
1599         if ((exitqual & 0xf0) != 0x00)
1600                 return (UNHANDLED);
1601
1602         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1603
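             /*
              * The CR0 read shadow holds the value the guest wrote; the real
              * guest CR0 gets mandatory bits (cr0_ones_mask) forced on and
              * disallowed bits (cr0_zeros_mask) cleared below.
              */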
1604         vmcs_write(VMCS_CR0_SHADOW, regval);
1605
1606         crval = regval | cr0_ones_mask;
1607         crval &= ~cr0_zeros_mask;
1608         vmcs_write(VMCS_GUEST_CR0, crval);
1609
1610         if (regval & CR0_PG) {
1611                 uint64_t efer, entry_ctls;
1612
1613                 /*
1614                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1615                  * the "IA-32e mode guest" bit in VM-entry control must be
1616                  * equal.
1617                  */
1618                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1619                 if (efer & EFER_LME) {
1620                         efer |= EFER_LMA;
1621                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1622                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1623                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1624                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1625                 }
1626         }
1627
1628         return (HANDLED);
1629 }
1630
1631 static int
1632 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1633 {
1634         uint64_t crval, regval;
1635
1636         /* We only handle mov to %cr4 at this time */
1637         if ((exitqual & 0xf0) != 0x00)
1638                 return (UNHANDLED);
1639
1640         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
1641
1642         vmcs_write(VMCS_CR4_SHADOW, regval);
1643
1644         crval = regval | cr4_ones_mask;
1645         crval &= ~cr4_zeros_mask;
1646         vmcs_write(VMCS_GUEST_CR4, crval);
1647
1648         return (HANDLED);
1649 }
1650
1651 static int
1652 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1653 {
1654         struct vlapic *vlapic;
1655         uint64_t cr8;
1656         int regnum;
1657
1658         /* We only handle mov %cr8 to/from a register at this time. */
1659         if ((exitqual & 0xe0) != 0x00) {
1660                 return (UNHANDLED);
1661         }
1662
1663         vlapic = vm_lapic(vmx->vm, vcpu);
1664         regnum = (exitqual >> 8) & 0xf;
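             /* Bit 4 of the exit qualification is set for "mov from %cr8". */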
1665         if (exitqual & 0x10) {
1666                 cr8 = vlapic_get_cr8(vlapic);
1667                 vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
1668         } else {
1669                 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
1670                 vlapic_set_cr8(vlapic, cr8);
1671         }
1672
1673         return (HANDLED);
1674 }
1675
1676 /*
1677  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
1678  */
1679 static int
1680 vmx_cpl(void)
1681 {
1682         uint32_t ssar;
1683
1684         ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
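             /* The DPL occupies bits 6:5 of the access-rights field. */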
1685         return ((ssar >> 5) & 0x3);
1686 }
1687
1688 static enum vm_cpu_mode
1689 vmx_cpu_mode(void)
1690 {
1691         uint32_t csar;
1692
1693         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
1694                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
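                     /* Access-rights bit 13 is CS.L (64-bit code segment). */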
1695                 if (csar & 0x2000)
1696                         return (CPU_MODE_64BIT);        /* CS.L = 1 */
1697                 else
1698                         return (CPU_MODE_COMPATIBILITY);
1699         } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
1700                 return (CPU_MODE_PROTECTED);
1701         } else {
1702                 return (CPU_MODE_REAL);
1703         }
1704 }
1705
1706 static enum vm_paging_mode
1707 vmx_paging_mode(void)
1708 {
1709
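             /*
              * Derive the paging mode from CR0.PG, CR4.PAE and EFER.LME:
              * paging off => flat, PG without PAE => 32-bit, PAE with LME
              * => 64-bit, otherwise PAE.
              */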
1710         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1711                 return (PAGING_MODE_FLAT);
1712         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1713                 return (PAGING_MODE_32);
1714         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1715                 return (PAGING_MODE_64);
1716         else
1717                 return (PAGING_MODE_PAE);
1718 }
1719
1720 static uint64_t
1721 inout_str_index(struct vmx *vmx, int vcpuid, int in)
1722 {
1723         uint64_t val;
1724         int error;
1725         enum vm_reg_name reg;
1726
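             /* The index register is %rdi for INS and %rsi for OUTS. */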
1727         reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
1728         error = vmx_getreg(vmx, vcpuid, reg, &val);
1729         KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
1730         return (val);
1731 }
1732
1733 static uint64_t
1734 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
1735 {
1736         uint64_t val;
1737         int error;
1738
1739         if (rep) {
1740                 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
1741                 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
1742         } else {
1743                 val = 1;
1744         }
1745         return (val);
1746 }
1747
1748 static int
1749 inout_str_addrsize(uint32_t inst_info)
1750 {
1751         uint32_t size;
1752
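             /* Bits 9:7 of the instruction information encode the address size. */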
1753         size = (inst_info >> 7) & 0x7;
1754         switch (size) {
1755         case 0:
1756                 return (2);     /* 16 bit */
1757         case 1:
1758                 return (4);     /* 32 bit */
1759         case 2:
1760                 return (8);     /* 64 bit */
1761         default:
1762                 panic("%s: invalid size encoding %d", __func__, size);
1763         }
1764 }
1765
1766 static void
1767 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
1768     struct vm_inout_str *vis)
1769 {
1770         int error, s;
1771
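             /*
              * INS always stores through %es.  For OUTS the source segment
              * comes from bits 17:15 of the instruction information, which
              * reflects any segment override prefix.
              */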
1772         if (in) {
1773                 vis->seg_name = VM_REG_GUEST_ES;
1774         } else {
1775                 s = (inst_info >> 15) & 0x7;
1776                 vis->seg_name = vm_segment_name(s);
1777         }
1778
1779         error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
1780         KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
1781 }
1782
1783 static void
1784 vmx_paging_info(struct vm_guest_paging *paging)
1785 {
1786         paging->cr3 = vmcs_guest_cr3();
1787         paging->cpl = vmx_cpl();
1788         paging->cpu_mode = vmx_cpu_mode();
1789         paging->paging_mode = vmx_paging_mode();
1790 }
1791
1792 static void
1793 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
1794 {
1795         struct vm_guest_paging *paging;
1796         uint32_t csar;
1797
1798         paging = &vmexit->u.inst_emul.paging;
1799
1800         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1801         vmexit->inst_length = 0;
1802         vmexit->u.inst_emul.gpa = gpa;
1803         vmexit->u.inst_emul.gla = gla;
1804         vmx_paging_info(paging);
1805         switch (paging->cpu_mode) {
1806         case CPU_MODE_REAL:
1807                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
1808                 vmexit->u.inst_emul.cs_d = 0;
1809                 break;
1810         case CPU_MODE_PROTECTED:
1811         case CPU_MODE_COMPATIBILITY:
1812                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
1813                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
1814                 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
1815                 break;
1816         default:
1817                 vmexit->u.inst_emul.cs_base = 0;
1818                 vmexit->u.inst_emul.cs_d = 0;
1819                 break;
1820         }
1821         vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
1822 }
1823
1824 static int
1825 ept_fault_type(uint64_t ept_qual)
1826 {
1827         int fault_type;
1828
1829         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1830                 fault_type = VM_PROT_WRITE;
1831         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1832                 fault_type = VM_PROT_EXECUTE;
1833         else
1834                 fault_type = VM_PROT_READ;
1835
1836         return (fault_type);
1837 }
1838
1839 static boolean_t
1840 ept_emulation_fault(uint64_t ept_qual)
1841 {
1842         int read, write;
1843
1844         /* EPT fault on an instruction fetch doesn't make sense here */
1845         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1846                 return (FALSE);
1847
1848         /* EPT fault must be a read fault or a write fault */
1849         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1850         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1851         if ((read | write) == 0)
1852                 return (FALSE);
1853
1854         /*
1855          * The EPT violation must have been caused by accessing a
1856          * guest-physical address that is a translation of a guest-linear
1857          * address.
1858          */
1859         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1860             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1861                 return (FALSE);
1862         }
1863
1864         return (TRUE);
1865 }
1866
1867 static __inline int
1868 apic_access_virtualization(struct vmx *vmx, int vcpuid)
1869 {
1870         uint32_t proc_ctls2;
1871
1872         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1873         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
1874 }
1875
1876 static __inline int
1877 x2apic_virtualization(struct vmx *vmx, int vcpuid)
1878 {
1879         uint32_t proc_ctls2;
1880
1881         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
1882         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
1883 }
1884
1885 static int
1886 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
1887     uint64_t qual)
1888 {
1889         int error, handled, offset;
1890         uint32_t *apic_regs, vector;
1891         bool retu;
1892
1893         handled = HANDLED;
1894         offset = APIC_WRITE_OFFSET(qual);
1895
1896         if (!apic_access_virtualization(vmx, vcpuid)) {
1897                 /*
1898                  * In general there should not be any APIC write VM-exits
1899                  * unless APIC-access virtualization is enabled.
1900                  *
1901                  * However, self-IPI virtualization can legitimately trigger
1902                  * an APIC-write VM-exit, so treat it specially.
1903                  */
1904                 if (x2apic_virtualization(vmx, vcpuid) &&
1905                     offset == APIC_OFFSET_SELF_IPI) {
1906                         apic_regs = (uint32_t *)(vlapic->apic_page);
1907                         vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
1908                         vlapic_self_ipi_handler(vlapic, vector);
1909                         return (HANDLED);
1910                 } else
1911                         return (UNHANDLED);
1912         }
1913
1914         switch (offset) {
1915         case APIC_OFFSET_ID:
1916                 vlapic_id_write_handler(vlapic);
1917                 break;
1918         case APIC_OFFSET_LDR:
1919                 vlapic_ldr_write_handler(vlapic);
1920                 break;
1921         case APIC_OFFSET_DFR:
1922                 vlapic_dfr_write_handler(vlapic);
1923                 break;
1924         case APIC_OFFSET_SVR:
1925                 vlapic_svr_write_handler(vlapic);
1926                 break;
1927         case APIC_OFFSET_ESR:
1928                 vlapic_esr_write_handler(vlapic);
1929                 break;
1930         case APIC_OFFSET_ICR_LOW:
1931                 retu = false;
1932                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1933                 if (error != 0 || retu)
1934                         handled = UNHANDLED;
1935                 break;
1936         case APIC_OFFSET_CMCI_LVT:
1937         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1938                 vlapic_lvt_write_handler(vlapic, offset);
1939                 break;
1940         case APIC_OFFSET_TIMER_ICR:
1941                 vlapic_icrtmr_write_handler(vlapic);
1942                 break;
1943         case APIC_OFFSET_TIMER_DCR:
1944                 vlapic_dcr_write_handler(vlapic);
1945                 break;
1946         default:
1947                 handled = UNHANDLED;
1948                 break;
1949         }
1950         return (handled);
1951 }
1952
1953 static bool
1954 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
1955 {
1956
1957         if (apic_access_virtualization(vmx, vcpuid) &&
1958             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1959                 return (true);
1960         else
1961                 return (false);
1962 }
1963
1964 static int
1965 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1966 {
1967         uint64_t qual;
1968         int access_type, offset, allowed;
1969
1970         if (!apic_access_virtualization(vmx, vcpuid))
1971                 return (UNHANDLED);
1972
1973         qual = vmexit->u.vmx.exit_qualification;
1974         access_type = APIC_ACCESS_TYPE(qual);
1975         offset = APIC_ACCESS_OFFSET(qual);
1976
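             /*
              * Access type 0 is a data read and 1 is a data write during
              * instruction execution; other access types are not emulated
              * here.
              */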
1977         allowed = 0;
1978         if (access_type == 0) {
1979                 /*
1980                  * Read data access to the following registers is expected.
1981                  */
1982                 switch (offset) {
1983                 case APIC_OFFSET_APR:
1984                 case APIC_OFFSET_PPR:
1985                 case APIC_OFFSET_RRR:
1986                 case APIC_OFFSET_CMCI_LVT:
1987                 case APIC_OFFSET_TIMER_CCR:
1988                         allowed = 1;
1989                         break;
1990                 default:
1991                         break;
1992                 }
1993         } else if (access_type == 1) {
1994                 /*
1995                  * Write data access to the following registers is expected.
1996                  */
1997                 switch (offset) {
1998                 case APIC_OFFSET_VER:
1999                 case APIC_OFFSET_APR:
2000                 case APIC_OFFSET_PPR:
2001                 case APIC_OFFSET_RRR:
2002                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
2003                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
2004                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
2005                 case APIC_OFFSET_CMCI_LVT:
2006                 case APIC_OFFSET_TIMER_CCR:
2007                         allowed = 1;
2008                         break;
2009                 default:
2010                         break;
2011                 }
2012         }
2013
2014         if (allowed) {
2015                 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
2016                     VIE_INVALID_GLA);
2017         }
2018
2019         /*
2020          * Regardless of whether the APIC access is allowed, this handler
2021          * always returns UNHANDLED:
2022          * - if the access is allowed then it is handled by emulating the
2023          *   instruction that caused the VM-exit (outside the critical section)
2024          * - if the access is not allowed then it will be converted to an
2025          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
2026          */
2027         return (UNHANDLED);
2028 }
2029
2030 static enum task_switch_reason
2031 vmx_task_switch_reason(uint64_t qual)
2032 {
2033         int reason;
2034
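             /* Bits 31:30 of the exit qualification give the task switch source. */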
2035         reason = (qual >> 30) & 0x3;
2036         switch (reason) {
2037         case 0:
2038                 return (TSR_CALL);
2039         case 1:
2040                 return (TSR_IRET);
2041         case 2:
2042                 return (TSR_JMP);
2043         case 3:
2044                 return (TSR_IDT_GATE);
2045         default:
2046                 panic("%s: invalid reason %d", __func__, reason);
2047         }
2048 }
2049
2050 static int
2051 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
2052 {
2053         int error;
2054
2055         if (lapic_msr(num))
2056                 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
2057         else
2058                 error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
2059
2060         return (error);
2061 }
2062
2063 static int
2064 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
2065 {
2066         struct vmxctx *vmxctx;
2067         uint64_t result;
2068         uint32_t eax, edx;
2069         int error;
2070
2071         if (lapic_msr(num))
2072                 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
2073         else
2074                 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
2075
2076         if (error == 0) {
2077                 eax = result;
2078                 vmxctx = &vmx->ctx[vcpuid];
2079                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
2080                 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
2081
2082                 edx = result >> 32;
2083                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
2084                 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
2085         }
2086
2087         return (error);
2088 }
2089
2090 static int
2091 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
2092 {
2093         int error, errcode, errcode_valid, handled, in;
2094         struct vmxctx *vmxctx;
2095         struct vlapic *vlapic;
2096         struct vm_inout_str *vis;
2097         struct vm_task_switch *ts;
2098         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
2099         uint32_t intr_type, intr_vec, reason;
2100         uint64_t exitintinfo, qual, gpa;
2101         bool retu;
2102
2103         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
2104         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
2105
2106         handled = UNHANDLED;
2107         vmxctx = &vmx->ctx[vcpu];
2108
2109         qual = vmexit->u.vmx.exit_qualification;
2110         reason = vmexit->u.vmx.exit_reason;
2111         vmexit->exitcode = VM_EXITCODE_BOGUS;
2112
2113         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
2114
2115         /*
2116          * VM-entry failures during or after loading guest state.
2117          *
2118          * These VM-exits are uncommon but must be handled specially
2119          * as most VM-exit fields are not populated as usual.
2120          */
2121         if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
2122                 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
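                     /* Vector 18 is the machine check exception (#MC). */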
2123                 __asm __volatile("int $18");
2124                 return (1);
2125         }
2126
2127         /*
2128          * VM exits that can be triggered during event delivery need to
2129          * be handled specially by re-injecting the event if the IDT
2130          * vectoring information field's valid bit is set.
2131          *
2132          * See "Information for VM Exits During Event Delivery" in Intel SDM
2133          * for details.
2134          */
2135         idtvec_info = vmcs_idt_vectoring_info();
2136         if (idtvec_info & VMCS_IDT_VEC_VALID) {
2137                 idtvec_info &= ~(1 << 12); /* clear undefined bit */
2138                 exitintinfo = idtvec_info;
2139                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2140                         idtvec_err = vmcs_idt_vectoring_err();
2141                         exitintinfo |= (uint64_t)idtvec_err << 32;
2142                 }
2143                 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
2144                 KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
2145                     __func__, error));
2146
2147                 /*
2148                  * If 'virtual NMIs' are being used and the VM-exit
2149                  * happened while injecting an NMI during the previous
2150                  * VM-entry, then clear "blocking by NMI" in the
2151                  * Guest Interruptibility-State so the NMI can be
2152                  * reinjected on the subsequent VM-entry.
2153                  *
2154                  * However, if the NMI was being delivered through a task
2155                  * gate, then the new task must start execution with NMIs
2156                  * blocked so don't clear NMI blocking in this case.
2157                  */
2158                 intr_type = idtvec_info & VMCS_INTR_T_MASK;
2159                 if (intr_type == VMCS_INTR_T_NMI) {
2160                         if (reason != EXIT_REASON_TASK_SWITCH)
2161                                 vmx_clear_nmi_blocking(vmx, vcpu);
2162                         else
2163                                 vmx_assert_nmi_blocking(vmx, vcpu);
2164                 }
2165
2166                 /*
2167                  * Update VM-entry instruction length if the event being
2168                  * delivered was a software interrupt or software exception.
2169                  */
2170                 if (intr_type == VMCS_INTR_T_SWINTR ||
2171                     intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
2172                     intr_type == VMCS_INTR_T_SWEXCEPTION) {
2173                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2174                 }
2175         }
2176
2177         switch (reason) {
2178         case EXIT_REASON_TASK_SWITCH:
2179                 ts = &vmexit->u.task_switch;
2180                 ts->tsssel = qual & 0xffff;
2181                 ts->reason = vmx_task_switch_reason(qual);
2182                 ts->ext = 0;
2183                 ts->errcode_valid = 0;
2184                 vmx_paging_info(&ts->paging);
2185                 /*
2186                  * If the task switch was due to a CALL, JMP, IRET, software
2187                  * interrupt (INT n) or software exception (INT3, INTO),
2188                  * then the saved %rip references the instruction that caused
2189                  * the task switch. The instruction length field in the VMCS
2190                  * is valid in this case.
2191                  *
2192                  * In all other cases (e.g., NMI, hardware exception) the
2193                  * saved %rip is one that would have been saved in the old TSS
2194                  * had the task switch completed normally so the instruction
2195                  * length field is not needed in this case and is explicitly
2196                  * set to 0.
2197                  */
2198                 if (ts->reason == TSR_IDT_GATE) {
2199                         KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
2200                             ("invalid idtvec_info %#x for IDT task switch",
2201                             idtvec_info));
2202                         intr_type = idtvec_info & VMCS_INTR_T_MASK;
2203                         if (intr_type != VMCS_INTR_T_SWINTR &&
2204                             intr_type != VMCS_INTR_T_SWEXCEPTION &&
2205                             intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
2206                                 /* Task switch triggered by external event */
2207                                 ts->ext = 1;
2208                                 vmexit->inst_length = 0;
2209                                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
2210                                         ts->errcode_valid = 1;
2211                                         ts->errcode = vmcs_idt_vectoring_err();
2212                                 }
2213                         }
2214                 }
2215                 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
2216                 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
2217                     "%s errcode 0x%016lx", ts->reason, ts->tsssel,
2218                     ts->ext ? "external" : "internal",
2219                     ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
2220                 break;
2221         case EXIT_REASON_CR_ACCESS:
2222                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
2223                 switch (qual & 0xf) {
2224                 case 0:
2225                         handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
2226                         break;
2227                 case 4:
2228                         handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
2229                         break;
2230                 case 8:
2231                         handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
2232                         break;
2233                 }
2234                 break;
2235         case EXIT_REASON_RDMSR:
2236                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
2237                 retu = false;
2238                 ecx = vmxctx->guest_rcx;
2239                 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
2240                 error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
2241                 if (error) {
2242                         vmexit->exitcode = VM_EXITCODE_RDMSR;
2243                         vmexit->u.msr.code = ecx;
2244                 } else if (!retu) {
2245                         handled = HANDLED;
2246                 } else {
2247                         /* Return to userspace with a valid exitcode */
2248                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2249                             ("emulate_rdmsr retu with bogus exitcode"));
2250                 }
2251                 break;
2252         case EXIT_REASON_WRMSR:
2253                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
2254                 retu = false;
2255                 eax = vmxctx->guest_rax;
2256                 ecx = vmxctx->guest_rcx;
2257                 edx = vmxctx->guest_rdx;
2258                 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
2259                     ecx, (uint64_t)edx << 32 | eax);
2260                 error = emulate_wrmsr(vmx, vcpu, ecx,
2261                     (uint64_t)edx << 32 | eax, &retu);
2262                 if (error) {
2263                         vmexit->exitcode = VM_EXITCODE_WRMSR;
2264                         vmexit->u.msr.code = ecx;
2265                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
2266                 } else if (!retu) {
2267                         handled = HANDLED;
2268                 } else {
2269                         /* Return to userspace with a valid exitcode */
2270                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
2271                             ("emulate_wrmsr retu with bogus exitcode"));
2272                 }
2273                 break;
2274         case EXIT_REASON_HLT:
2275                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
2276                 vmexit->exitcode = VM_EXITCODE_HLT;
2277                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2278                 break;
2279         case EXIT_REASON_MTF:
2280                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
2281                 vmexit->exitcode = VM_EXITCODE_MTRAP;
2282                 vmexit->inst_length = 0;
2283                 break;
2284         case EXIT_REASON_PAUSE:
2285                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
2286                 vmexit->exitcode = VM_EXITCODE_PAUSE;
2287                 break;
2288         case EXIT_REASON_INTR_WINDOW:
2289                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
2290                 vmx_clear_int_window_exiting(vmx, vcpu);
2291                 return (1);
2292         case EXIT_REASON_EXT_INTR:
2293                 /*
2294                  * External interrupts serve only to cause VM exits and allow
2295                  * the host interrupt handler to run.
2296                  *
2297                  * If this external interrupt triggers a virtual interrupt
2298                  * to a VM, then that state will be recorded by the
2299                  * host interrupt handler in the VM's softc. We will inject
2300                  * this virtual interrupt during the subsequent VM enter.
2301                  */
2302                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2303
2304                 /*
2305                  * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
2306                  * This appears to be a bug in VMware Fusion?
2307                  */
2308                 if (!(intr_info & VMCS_INTR_VALID))
2309                         return (1);
2310                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
2311                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
2312                     ("VM exit interruption info invalid: %#x", intr_info));
2313                 vmx_trigger_hostintr(intr_info & 0xff);
2314
2315                 /*
2316                  * This is special. We want to treat this as a 'handled'
2317                  * VM-exit but not increment the instruction pointer.
2318                  */
2319                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
2320                 return (1);
2321         case EXIT_REASON_NMI_WINDOW:
2322                 /* Exit to allow the pending virtual NMI to be injected */
2323                 if (vm_nmi_pending(vmx->vm, vcpu))
2324                         vmx_inject_nmi(vmx, vcpu);
2325                 vmx_clear_nmi_window_exiting(vmx, vcpu);
2326                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
2327                 return (1);
2328         case EXIT_REASON_INOUT:
2329                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
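                     /*
                      * I/O exit qualification: bits 2:0 = access size - 1,
                      * bit 3 = IN, bit 4 = string op, bit 5 = REP prefix,
                      * bits 31:16 = port number.
                      */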
2330                 vmexit->exitcode = VM_EXITCODE_INOUT;
2331                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
2332                 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
2333                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
2334                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
2335                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
2336                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
2337                 if (vmexit->u.inout.string) {
2338                         inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
2339                         vmexit->exitcode = VM_EXITCODE_INOUT_STR;
2340                         vis = &vmexit->u.inout_str;
2341                         vmx_paging_info(&vis->paging);
2342                         vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
2343                         vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
2344                         vis->index = inout_str_index(vmx, vcpu, in);
2345                         vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
2346                         vis->addrsize = inout_str_addrsize(inst_info);
2347                         inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
2348                 }
2349                 break;
2350         case EXIT_REASON_CPUID:
2351                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
2352                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
2353                 break;
2354         case EXIT_REASON_EXCEPTION:
2355                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
2356                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2357                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2358                     ("VM exit interruption info invalid: %#x", intr_info));
2359
2360                 intr_vec = intr_info & 0xff;
2361                 intr_type = intr_info & VMCS_INTR_T_MASK;
2362
2363                 /*
2364                  * If Virtual NMIs control is 1 and the VM-exit is due to a
2365                  * fault encountered during the execution of IRET then we must
2366                  * restore the state of "virtual-NMI blocking" before resuming
2367                  * the guest.
2368                  *
2369                  * See "Resuming Guest Software after Handling an Exception".
2370                  * See "Information for VM Exits Due to Vectored Events".
2371                  */
2372                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2373                     (intr_vec != IDT_DF) &&
2374                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
2375                         vmx_restore_nmi_blocking(vmx, vcpu);
2376
2377                 /*
2378                  * The NMI has already been handled in vmx_exit_handle_nmi().
2379                  */
2380                 if (intr_type == VMCS_INTR_T_NMI)
2381                         return (1);
2382
2383                 /*
2384                  * Call the machine check handler by hand. Also don't reflect
2385                  * the machine check back into the guest.
2386                  */
2387                 if (intr_vec == IDT_MC) {
2388                         VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
2389                         __asm __volatile("int $18");
2390                         return (1);
2391                 }
2392
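                     /*
                      * For a page fault the exit qualification holds the
                      * faulting linear address; stash it in the guest %cr2
                      * before the exception is reflected.
                      */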
2393                 if (intr_vec == IDT_PF) {
2394                         error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
2395                         KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
2396                             __func__, error));
2397                 }
2398
2399                 /*
2400                  * Software exceptions exhibit trap-like behavior. This in
2401                  * turn requires populating the VM-entry instruction length
2402                  * so that the %rip in the trap frame is past the INT3/INTO
2403                  * instruction.
2404                  */
2405                 if (intr_type == VMCS_INTR_T_SWEXCEPTION)
2406                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
2407
2408                 /* Reflect all other exceptions back into the guest */
2409                 errcode_valid = errcode = 0;
2410                 if (intr_info & VMCS_INTR_DEL_ERRCODE) {
2411                         errcode_valid = 1;
2412                         errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
2413                 }
2414                 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
2415                     "the guest", intr_vec, errcode);
2416                 error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
2417                     errcode_valid, errcode, 0);
2418                 KASSERT(error == 0, ("%s: vm_inject_exception error %d",
2419                     __func__, error));
2420                 return (1);
2421
2422         case EXIT_REASON_EPT_FAULT:
2423                 /*
2424                  * If 'gpa' lies within the address space allocated to
2425                  * memory then this must be a nested page fault; otherwise
2426                  * it must be an instruction that accesses MMIO space.
2427                  */
2428                 gpa = vmcs_gpa();
2429                 if (vm_mem_allocated(vmx->vm, gpa) ||
2430                     apic_access_fault(vmx, vcpu, gpa)) {
2431                         vmexit->exitcode = VM_EXITCODE_PAGING;
2432                         vmexit->inst_length = 0;
2433                         vmexit->u.paging.gpa = gpa;
2434                         vmexit->u.paging.fault_type = ept_fault_type(qual);
2435                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
2436                 } else if (ept_emulation_fault(qual)) {
2437                         vmexit_inst_emul(vmexit, gpa, vmcs_gla());
2438                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
2439                 }
2440                 /*
2441                  * If Virtual NMIs control is 1 and the VM-exit is due to an
2442                  * EPT fault during the execution of IRET then we must restore
2443                  * the state of "virtual-NMI blocking" before resuming.
2444                  *
2445                  * See description of "NMI unblocking due to IRET" in
2446                  * "Exit Qualification for EPT Violations".
2447                  */
2448                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
2449                     (qual & EXIT_QUAL_NMIUDTI) != 0)
2450                         vmx_restore_nmi_blocking(vmx, vcpu);
2451                 break;
2452         case EXIT_REASON_VIRTUALIZED_EOI:
2453                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
2454                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
2455                 vmexit->inst_length = 0;        /* trap-like */
2456                 break;
2457         case EXIT_REASON_APIC_ACCESS:
2458                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
2459                 break;
2460         case EXIT_REASON_APIC_WRITE:
2461                 /*
2462                  * APIC-write VM exit is trap-like so the %rip is already
2463                  * pointing to the next instruction.
2464                  */
2465                 vmexit->inst_length = 0;
2466                 vlapic = vm_lapic(vmx->vm, vcpu);
2467                 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
2468                 break;
2469         case EXIT_REASON_XSETBV:
2470                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
2471                 break;
2472         case EXIT_REASON_MONITOR:
2473                 vmexit->exitcode = VM_EXITCODE_MONITOR;
2474                 break;
2475         case EXIT_REASON_MWAIT:
2476                 vmexit->exitcode = VM_EXITCODE_MWAIT;
2477                 break;
2478         default:
2479                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
2480                 break;
2481         }
2482
2483         if (handled) {
2484                 /*
2485                  * It is possible that control is returned to userland
2486                  * even though we were able to handle the VM exit in the
2487                  * kernel.
2488                  *
2489                  * In such a case we want to make sure that the userland
2490                  * restarts guest execution at the instruction *after*
2491                  * the one we just processed. Therefore we update the
2492                  * guest rip in the VMCS and in 'vmexit'.
2493                  */
2494                 vmexit->rip += vmexit->inst_length;
2495                 vmexit->inst_length = 0;
2496                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
2497         } else {
2498                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
2499                         /*
2500                          * If this VM exit was not claimed by anybody then
2501                          * treat it as a generic VMX exit.
2502                          */
2503                         vmexit->exitcode = VM_EXITCODE_VMX;
2504                         vmexit->u.vmx.status = VM_SUCCESS;
2505                         vmexit->u.vmx.inst_type = 0;
2506                         vmexit->u.vmx.inst_error = 0;
2507                 } else {
2508                         /*
2509                          * The exitcode and collateral have been populated.
2510                          * The VM exit will be processed further in userland.
2511                          */
2512                 }
2513         }
2514         return (handled);
2515 }
2516
2517 static __inline void
2518 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
2519 {
2520
2521         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
2522             ("vmx_exit_inst_error: invalid inst_fail_status %d",
2523             vmxctx->inst_fail_status));
2524
2525         vmexit->inst_length = 0;
2526         vmexit->exitcode = VM_EXITCODE_VMX;
2527         vmexit->u.vmx.status = vmxctx->inst_fail_status;
2528         vmexit->u.vmx.inst_error = vmcs_instruction_error();
2529         vmexit->u.vmx.exit_reason = ~0;
2530         vmexit->u.vmx.exit_qualification = ~0;
2531
2532         switch (rc) {
2533         case VMX_VMRESUME_ERROR:
2534         case VMX_VMLAUNCH_ERROR:
2535         case VMX_INVEPT_ERROR:
2536                 vmexit->u.vmx.inst_type = rc;
2537                 break;
2538         default:
2539                 panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
2540         }
2541 }
2542
2543 /*
2544  * If the NMI-exiting VM execution control is set to '1' then an NMI in
2545  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
2546  * sufficient to simply vector to the NMI handler via a software interrupt.
2547  * However, this must be done before maskable interrupts are enabled;
2548  * otherwise the "iret" issued by an interrupt handler will incorrectly
2549  * clear NMI blocking.
2550  */
2551 static __inline void
2552 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
2553 {
2554         uint32_t intr_info;
2555
2556         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
2557
2558         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
2559                 return;
2560
2561         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
2562         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
2563             ("VM exit interruption info invalid: %#x", intr_info));
2564
2565         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
2566                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
2567                     "to NMI has invalid vector: %#x", intr_info));
2568                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
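                     /* Vector 2 is the NMI vector. */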
2569                 __asm __volatile("int $2");
2570         }
2571 }
2572
2573 static int
2574 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
2575     struct vm_eventinfo *evinfo)
2576 {
2577         int rc, handled, launched;
2578         struct vmx *vmx;
2579         struct vm *vm;
2580         struct vmxctx *vmxctx;
2581         struct vmcs *vmcs;
2582         struct vm_exit *vmexit;
2583         struct vlapic *vlapic;
2584         uint32_t exit_reason;
2585
2586         vmx = arg;
2587         vm = vmx->vm;
2588         vmcs = &vmx->vmcs[vcpu];
2589         vmxctx = &vmx->ctx[vcpu];
2590         vlapic = vm_lapic(vm, vcpu);
2591         vmexit = vm_exitinfo(vm, vcpu);
2592         launched = 0;
2593
2594         KASSERT(vmxctx->pmap == pmap,
2595             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
2596
2597         vmx_msr_guest_enter(vmx, vcpu);
2598
2599         VMPTRLD(vmcs);
2600
2601         /*
2602          * XXX
2603          * We do this every time because we may set up the virtual machine
2604          * from a different process than the one that actually runs it.
2605          *
2606          * If the life of a virtual machine was spent entirely in the context
2607          * of a single process we could do this once in vmx_vminit().
2608          */
2609         vmcs_write(VMCS_HOST_CR3, rcr3());
2610
2611         vmcs_write(VMCS_GUEST_RIP, rip);
2612         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
2613         do {
2614                 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
2615                     "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
2616
2617                 handled = UNHANDLED;
2618                 /*
2619                  * Interrupts are disabled from this point on until the
2620                  * guest starts executing. This is done for the following
2621                  * reasons:
2622                  *
2623                  * If an AST is asserted on this thread after the check below,
2624                  * then the IPI_AST notification will not be lost, because it
2625                  * will cause a VM exit due to external interrupt as soon as
2626                  * the guest state is loaded.
2627                  *
2628                  * A posted interrupt after 'vmx_inject_interrupts()' will
2629                  * not be "lost" because it will be held pending in the host
2630                  * APIC because interrupts are disabled. The pending interrupt
2631                  * will be recognized as soon as the guest state is loaded.
2632                  *
2633                  * The same reasoning applies to the IPI generated by
2634                  * pmap_invalidate_ept().
2635                  */
2636                 disable_intr();
2637                 vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
2638
2639                 /*
2640                  * Check for vcpu suspension after injecting events because
2641                  * vmx_inject_interrupts() can suspend the vcpu due to a
2642                  * triple fault.
2643                  */
2644                 if (vcpu_suspended(evinfo)) {
2645                         enable_intr();
2646                         vm_exit_suspended(vmx->vm, vcpu, rip);
2647                         break;
2648                 }
2649
2650                 if (vcpu_rendezvous_pending(evinfo)) {
2651                         enable_intr();
2652                         vm_exit_rendezvous(vmx->vm, vcpu, rip);
2653                         break;
2654                 }
2655
2656                 if (vcpu_reqidle(evinfo)) {
2657                         enable_intr();
2658                         vm_exit_reqidle(vmx->vm, vcpu, rip);
2659                         break;
2660                 }
2661
2662                 if (vcpu_should_yield(vm, vcpu)) {
2663                         enable_intr();
2664                         vm_exit_astpending(vmx->vm, vcpu, rip);
2665                         vmx_astpending_trace(vmx, vcpu, rip);
2666                         handled = HANDLED;
2667                         break;
2668                 }
2669
2670                 vmx_run_trace(vmx, vcpu);
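                     /*
                      * 'launched' makes vmx_enter_guest() use VMRESUME instead
                      * of VMLAUNCH once this VMCS has been launched.
                      */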
2671                 rc = vmx_enter_guest(vmxctx, vmx, launched);
2672
2673                 /* Collect some information for VM exit processing */
2674                 vmexit->rip = rip = vmcs_guest_rip();
2675                 vmexit->inst_length = vmexit_instruction_length();
2676                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
2677                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
2678
2679                 /* Update 'nextrip' */
2680                 vmx->state[vcpu].nextrip = rip;
2681
2682                 if (rc == VMX_GUEST_VMEXIT) {
2683                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
2684                         enable_intr();
2685                         handled = vmx_exit_process(vmx, vcpu, vmexit);
2686                 } else {
2687                         enable_intr();
2688                         vmx_exit_inst_error(vmxctx, rc, vmexit);
2689                 }
2690                 launched = 1;
2691                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
2692                 rip = vmexit->rip;
2693         } while (handled);
2694
2695         /*
2696          * If a VM exit has been handled then the exitcode must be BOGUS.
2697          * If a VM exit is not handled then the exitcode must not be BOGUS.
2698          */
2699         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
2700             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
2701                 panic("Mismatch between handled (%d) and exitcode (%d)",
2702                       handled, vmexit->exitcode);
2703         }
2704
2705         if (!handled)
2706                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2707
2708         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2709             vmexit->exitcode);
2710
2711         VMCLEAR(vmcs);
2712         vmx_msr_guest_exit(vmx, vcpu);
2713
2714         return (0);
2715 }
2716
2717 static void
2718 vmx_vmcleanup(void *arg)
2719 {
2720         int i;
2721         struct vmx *vmx = arg;
2722
2723         if (apic_access_virtualization(vmx, 0))
2724                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2725
2726         for (i = 0; i < VM_MAXCPU; i++)
2727                 vpid_free(vmx->state[i].vpid);
2728
2729         free(vmx, M_VMX);
2730
2731         return;
2732 }
2733
2734 static register_t *
2735 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2736 {
2737
2738         switch (reg) {
2739         case VM_REG_GUEST_RAX:
2740                 return (&vmxctx->guest_rax);
2741         case VM_REG_GUEST_RBX:
2742                 return (&vmxctx->guest_rbx);
2743         case VM_REG_GUEST_RCX:
2744                 return (&vmxctx->guest_rcx);
2745         case VM_REG_GUEST_RDX:
2746                 return (&vmxctx->guest_rdx);
2747         case VM_REG_GUEST_RSI:
2748                 return (&vmxctx->guest_rsi);
2749         case VM_REG_GUEST_RDI:
2750                 return (&vmxctx->guest_rdi);
2751         case VM_REG_GUEST_RBP:
2752                 return (&vmxctx->guest_rbp);
2753         case VM_REG_GUEST_R8:
2754                 return (&vmxctx->guest_r8);
2755         case VM_REG_GUEST_R9:
2756                 return (&vmxctx->guest_r9);
2757         case VM_REG_GUEST_R10:
2758                 return (&vmxctx->guest_r10);
2759         case VM_REG_GUEST_R11:
2760                 return (&vmxctx->guest_r11);
2761         case VM_REG_GUEST_R12:
2762                 return (&vmxctx->guest_r12);
2763         case VM_REG_GUEST_R13:
2764                 return (&vmxctx->guest_r13);
2765         case VM_REG_GUEST_R14:
2766                 return (&vmxctx->guest_r14);
2767         case VM_REG_GUEST_R15:
2768                 return (&vmxctx->guest_r15);
2769         case VM_REG_GUEST_CR2:
2770                 return (&vmxctx->guest_cr2);
2771         default:
2772                 break;
2773         }
2774         return (NULL);
2775 }
2776
2777 static int
2778 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2779 {
2780         register_t *regp;
2781
2782         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2783                 *retval = *regp;
2784                 return (0);
2785         } else
2786                 return (EINVAL);
2787 }
2788
2789 static int
2790 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2791 {
2792         register_t *regp;
2793
2794         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2795                 *regp = val;
2796                 return (0);
2797         } else
2798                 return (EINVAL);
2799 }
2800
2801 static int
2802 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
2803 {
2804         uint64_t gi;
2805         int error;
2806
2807         error = vmcs_getreg(&vmx->vmcs[vcpu], running, 
2808             VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
2809         *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
2810         return (error);
2811 }
2812
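/*
 * Only clearing the interrupt shadow is supported; a non-zero 'val' is
 * rejected with EINVAL, as noted below.
 */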
2813 static int
2814 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
2815 {
2816         struct vmcs *vmcs;
2817         uint64_t gi;
2818         int error, ident;
2819
2820         /*
2821          * Forcing the vcpu into an interrupt shadow is not supported.
2822          */
2823         if (val) {
2824                 error = EINVAL;
2825                 goto done;
2826         }
2827
2828         vmcs = &vmx->vmcs[vcpu];
2829         ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
2830         error = vmcs_getreg(vmcs, running, ident, &gi);
2831         if (error == 0) {
2832                 gi &= ~HWINTR_BLOCKING;
2833                 error = vmcs_setreg(vmcs, running, ident, gi);
2834         }
2835 done:
2836         VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
2837             error ? "failed" : "succeeded");
2838         return (error);
2839 }
2840
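/*
 * Map a control register to its VMCS shadow field.  Only %cr0 and %cr4
 * have shadows; -1 is returned for everything else.
 */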
2841 static int
2842 vmx_shadow_reg(int reg)
2843 {
2844         int shreg;
2845
2846         shreg = -1;
2847
2848         switch (reg) {
2849         case VM_REG_GUEST_CR0:
2850                 shreg = VMCS_CR0_SHADOW;
2851                 break;
2852         case VM_REG_GUEST_CR4:
2853                 shreg = VMCS_CR4_SHADOW;
2854                 break;
2855         default:
2856                 break;
2857         }
2858
2859         return (shreg);
2860 }
2861
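/*
 * Fetch a guest register, looking first in the saved guest context and
 * falling back to the VMCS for registers that are not kept there.  The
 * interrupt shadow pseudo-register is handled separately.
 */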
2862 static int
2863 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2864 {
2865         int running, hostcpu;
2866         struct vmx *vmx = arg;
2867
2868         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2869         if (running && hostcpu != curcpu)
2870                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2871
2872         if (reg == VM_REG_GUEST_INTR_SHADOW)
2873                 return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
2874
2875         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2876                 return (0);
2877
2878         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2879 }
2880
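/*
 * Update a guest register.  Besides writing the new value this also keeps
 * the "IA-32e mode guest" VM-entry control in sync with EFER.LMA, mirrors
 * %cr0/%cr4 writes into their shadow fields and invalidates the guest TLB
 * when %cr3 is changed.
 */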
2881 static int
2882 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2883 {
2884         int error, hostcpu, running, shadow;
2885         uint64_t ctls;
2886         pmap_t pmap;
2887         struct vmx *vmx = arg;
2888
2889         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2890         if (running && hostcpu != curcpu)
2891                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2892
2893         if (reg == VM_REG_GUEST_INTR_SHADOW)
2894                 return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
2895
2896         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2897                 return (0);
2898
2899         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2900
2901         if (error == 0) {
2902                 /*
2903                  * If the "load EFER" VM-entry control is 1 then the
2904                  * value of EFER.LMA must be identical to the "IA-32e
2905                  * mode guest" bit in the VM-entry controls.
2906                  */
2907                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2908                     (reg == VM_REG_GUEST_EFER)) {
2909                         vmcs_getreg(&vmx->vmcs[vcpu], running,
2910                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2911                         if (val & EFER_LMA)
2912                                 ctls |= VM_ENTRY_GUEST_LMA;
2913                         else
2914                                 ctls &= ~VM_ENTRY_GUEST_LMA;
2915                         vmcs_setreg(&vmx->vmcs[vcpu], running,
2916                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2917                 }
2918
2919                 shadow = vmx_shadow_reg(reg);
2920                 if (shadow > 0) {
2921                         /*
2922                          * Store the unmodified value in the shadow.
2923                          */
2924                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2925                                     VMCS_IDENT(shadow), val);
2926                 }
2927
2928                 if (reg == VM_REG_GUEST_CR3) {
2929                         /*
2930                          * Invalidate the guest vcpu's TLB mappings to emulate
2931                          * the behavior of updating %cr3.
2932                          *
2933                          * XXX the processor retains global mappings when %cr3
2934                          * is updated but vmx_invvpid() does not.
2935                          */
2936                         pmap = vmx->ctx[vcpu].pmap;
2937                         vmx_invvpid(vmx, vcpu, pmap, running);
2938                 }
2939         }
2940
2941         return (error);
2942 }
2943
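/*
 * Segment descriptor accessors.  These are thin wrappers around
 * vmcs_getdesc()/vmcs_setdesc() with a sanity check that the vcpu is not
 * running on another host cpu.
 */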
2944 static int
2945 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2946 {
2947         int hostcpu, running;
2948         struct vmx *vmx = arg;
2949
2950         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2951         if (running && hostcpu != curcpu)
2952                 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2953
2954         return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
2955 }
2956
2957 static int
2958 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2959 {
2960         int hostcpu, running;
2961         struct vmx *vmx = arg;
2962
2963         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2964         if (running && hostcpu != curcpu)
2965                 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
2966
2967         return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
2968 }
2969
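/*
 * Return whether an optional capability is currently enabled for the vcpu
 * (via '*retval').  ENOENT is returned if the capability is not supported
 * by the hardware.
 */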
2970 static int
2971 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2972 {
2973         struct vmx *vmx = arg;
2974         int vcap;
2975         int ret;
2976
2977         ret = ENOENT;
2978
2979         vcap = vmx->cap[vcpu].set;
2980
2981         switch (type) {
2982         case VM_CAP_HALT_EXIT:
2983                 if (cap_halt_exit)
2984                         ret = 0;
2985                 break;
2986         case VM_CAP_PAUSE_EXIT:
2987                 if (cap_pause_exit)
2988                         ret = 0;
2989                 break;
2990         case VM_CAP_MTRAP_EXIT:
2991                 if (cap_monitor_trap)
2992                         ret = 0;
2993                 break;
2994         case VM_CAP_UNRESTRICTED_GUEST:
2995                 if (cap_unrestricted_guest)
2996                         ret = 0;
2997                 break;
2998         case VM_CAP_ENABLE_INVPCID:
2999                 if (cap_invpcid)
3000                         ret = 0;
3001                 break;
3002         default:
3003                 break;
3004         }
3005
3006         if (ret == 0)
3007                 *retval = (vcap & (1 << type)) ? 1 : 0;
3008
3009         return (ret);
3010 }
3011
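/*
 * Enable or disable an optional capability by flipping the corresponding
 * bit in the primary or secondary processor-based VM-execution controls.
 * The VMCS is updated with a VMPTRLD/vmwrite/VMCLEAR sequence and the new
 * value is cached in 'vmx->cap[vcpu]'.
 */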
3012 static int
3013 vmx_setcap(void *arg, int vcpu, int type, int val)
3014 {
3015         struct vmx *vmx = arg;
3016         struct vmcs *vmcs = &vmx->vmcs[vcpu];
3017         uint32_t baseval;
3018         uint32_t *pptr;
3019         int error;
3020         int flag;
3021         int reg;
3022         int retval;
3023
3024         retval = ENOENT;
3025         pptr = NULL;
3026
3027         switch (type) {
3028         case VM_CAP_HALT_EXIT:
3029                 if (cap_halt_exit) {
3030                         retval = 0;
3031                         pptr = &vmx->cap[vcpu].proc_ctls;
3032                         baseval = *pptr;
3033                         flag = PROCBASED_HLT_EXITING;
3034                         reg = VMCS_PRI_PROC_BASED_CTLS;
3035                 }
3036                 break;
3037         case VM_CAP_MTRAP_EXIT:
3038                 if (cap_monitor_trap) {
3039                         retval = 0;
3040                         pptr = &vmx->cap[vcpu].proc_ctls;
3041                         baseval = *pptr;
3042                         flag = PROCBASED_MTF;
3043                         reg = VMCS_PRI_PROC_BASED_CTLS;
3044                 }
3045                 break;
3046         case VM_CAP_PAUSE_EXIT:
3047                 if (cap_pause_exit) {
3048                         retval = 0;
3049                         pptr = &vmx->cap[vcpu].proc_ctls;
3050                         baseval = *pptr;
3051                         flag = PROCBASED_PAUSE_EXITING;
3052                         reg = VMCS_PRI_PROC_BASED_CTLS;
3053                 }
3054                 break;
3055         case VM_CAP_UNRESTRICTED_GUEST:
3056                 if (cap_unrestricted_guest) {
3057                         retval = 0;
3058                         pptr = &vmx->cap[vcpu].proc_ctls2;
3059                         baseval = *pptr;
3060                         flag = PROCBASED2_UNRESTRICTED_GUEST;
3061                         reg = VMCS_SEC_PROC_BASED_CTLS;
3062                 }
3063                 break;
3064         case VM_CAP_ENABLE_INVPCID:
3065                 if (cap_invpcid) {
3066                         retval = 0;
3067                         pptr = &vmx->cap[vcpu].proc_ctls2;
3068                         baseval = *pptr;
3069                         flag = PROCBASED2_ENABLE_INVPCID;
3070                         reg = VMCS_SEC_PROC_BASED_CTLS;
3071                 }
3072                 break;
3073         default:
3074                 break;
3075         }
3076
3077         if (retval == 0) {
3078                 if (val) {
3079                         baseval |= flag;
3080                 } else {
3081                         baseval &= ~flag;
3082                 }
3083                 VMPTRLD(vmcs);
3084                 error = vmwrite(reg, baseval);
3085                 VMCLEAR(vmcs);
3086
3087                 if (error) {
3088                         retval = error;
3089                 } else {
3090                         /*
3091                          * Update the cached copy of the control register,
3092                          * if any, and record the new capability setting.
3093                          */
3094                         if (pptr != NULL) {
3095                                 *pptr = baseval;
3096                         }
3097
3098                         if (val) {
3099                                 vmx->cap[vcpu].set |= (1 << type);
3100                         } else {
3101                                 vmx->cap[vcpu].set &= ~(1 << type);
3102                         }
3103                 }
3104         }
3105
3106         return (retval);
3107 }
3108
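/*
 * VT-x specific extension of the generic vlapic.  The 'vlapic' member must
 * remain first so that a 'struct vlapic *' can be cast to the containing
 * 'struct vlapic_vtx *', as done in the ops handlers below.
 */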
3109 struct vlapic_vtx {
3110         struct vlapic   vlapic;
3111         struct pir_desc *pir_desc;
3112         struct vmx      *vmx;
3113 };
3114
3115 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
3116 do {                                                                    \
3117         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
3118             level ? "level" : "edge", vector);                          \
3119         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
3120         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
3121         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
3122         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
3123         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
3124 } while (0)
3125
3126 /*
3127  * vlapic->ops handlers that utilize the APICv hardware assist described in
3128  * Chapter 29 of the Intel SDM.
3129  */
3130 static int
3131 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
3132 {
3133         struct vlapic_vtx *vlapic_vtx;
3134         struct pir_desc *pir_desc;
3135         uint64_t mask;
3136         int idx, notify;
3137
3138         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3139         pir_desc = vlapic_vtx->pir_desc;
3140
3141         /*
3142          * Keep track of interrupt requests in the PIR descriptor. This is
3143          * because the virtual APIC page pointed to by the VMCS cannot be
3144          * modified if the vcpu is running.
3145          */
3146         idx = vector / 64;
3147         mask = 1UL << (vector % 64);
3148         atomic_set_long(&pir_desc->pir[idx], mask);
3149         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
3150
3151         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
3152             level, "vmx_set_intr_ready");
3153         return (notify);
3154 }
3155
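/*
 * Return 1 if a pending posted interrupt would be recognized by the vcpu
 * given its current processor priority, 0 otherwise.
 */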
3156 static int
3157 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
3158 {
3159         struct vlapic_vtx *vlapic_vtx;
3160         struct pir_desc *pir_desc;
3161         struct LAPIC *lapic;
3162         uint64_t pending, pirval;
3163         uint32_t ppr, vpr;
3164         int i;
3165
3166         /*
3167          * This function is only expected to be called from the 'HLT' exit
3168          * handler which does not care about the vector that is pending.
3169          */
3170         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
3171
3172         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3173         pir_desc = vlapic_vtx->pir_desc;
3174
3175         pending = atomic_load_acq_long(&pir_desc->pending);
3176         if (!pending)
3177                 return (0);     /* common case */
3178
3179         /*
3180          * If there is an interrupt pending then it will be recognized only
3181          * if its priority is greater than the processor priority.
3182          *
3183          * Special case: if the processor priority is zero then any pending
3184          * interrupt will be recognized.
3185          */
3186         lapic = vlapic->apic_page;
3187         ppr = lapic->ppr & 0xf0;
3188         if (ppr == 0)
3189                 return (1);
3190
3191         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
3192             lapic->ppr);
3193
3194         for (i = 3; i >= 0; i--) {
3195                 pirval = pir_desc->pir[i];
3196                 if (pirval != 0) {
3197                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
3198                         return (vpr > ppr);
3199                 }
3200         }
3201         return (0);
3202 }
3203
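/*
 * With virtual interrupt delivery enabled, interrupt acceptance is handled
 * by the processor, so this handler is never expected to be invoked; it
 * panics to catch unexpected calls.
 */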
3204 static void
3205 vmx_intr_accepted(struct vlapic *vlapic, int vector)
3206 {
3207
3208         panic("vmx_intr_accepted: not expected to be called");
3209 }
3210
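/*
 * Record the trigger mode of 'vector' in the EOI-exit bitmap in the VMCS:
 * the corresponding bit is set for a level-triggered vector and cleared
 * for an edge-triggered one.
 */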
3211 static void
3212 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
3213 {
3214         struct vlapic_vtx *vlapic_vtx;
3215         struct vmx *vmx;
3216         struct vmcs *vmcs;
3217         uint64_t mask, val;
3218
3219         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
3220         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
3221             ("vmx_set_tmr: vcpu cannot be running"));
3222
3223         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3224         vmx = vlapic_vtx->vmx;
3225         vmcs = &vmx->vmcs[vlapic->vcpuid];
3226         mask = 1UL << (vector % 64);
3227
3228         VMPTRLD(vmcs);
3229         val = vmcs_read(VMCS_EOI_EXIT(vector));
3230         if (level)
3231                 val |= mask;
3232         else
3233                 val &= ~mask;
3234         vmcs_write(VMCS_EOI_EXIT(vector), val);
3235         VMCLEAR(vmcs);
3236 }
3237
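/*
 * Switch the vcpu from "virtualize APIC accesses" to "virtualize x2APIC
 * mode" in the secondary processor-based controls.  The shared APIC access
 * page mapping and MSR bitmap are adjusted only once, from vcpu 0.
 */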
3238 static void
3239 vmx_enable_x2apic_mode(struct vlapic *vlapic)
3240 {
3241         struct vmx *vmx;
3242         struct vmcs *vmcs;
3243         uint32_t proc_ctls2;
3244         int vcpuid, error;
3245
3246         vcpuid = vlapic->vcpuid;
3247         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
3248         vmcs = &vmx->vmcs[vcpuid];
3249
3250         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
3251         KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
3252             ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
3253
3254         proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
3255         proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
3256         vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
3257
3258         VMPTRLD(vmcs);
3259         vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
3260         VMCLEAR(vmcs);
3261
3262         if (vlapic->vcpuid == 0) {
3263                 /*
3264                  * The nested page table mappings are shared by all vcpus
3265                  * so unmap the APIC access page just once.
3266                  */
3267                 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
3268                 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
3269                     __func__, error));
3270
3271                 /*
3272                  * The MSR bitmap is shared by all vcpus so modify it only
3273                  * once in the context of vcpu 0.
3274                  */
3275                 error = vmx_allow_x2apic_msrs(vmx);
3276                 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
3277                     __func__, error));
3278         }
3279 }
3280
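/*
 * Notify a running vcpu of a posted interrupt by sending an IPI with the
 * posted interrupt notification vector ('pirvec') to the host cpu it is
 * running on.
 */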
3281 static void
3282 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
3283 {
3284
3285         ipi_cpu(hostcpu, pirvec);
3286 }
3287
3288 /*
3289  * Transfer the pending interrupts in the PIR descriptor to the IRR
3290  * in the virtual APIC page.
3291  */
3292 static void
3293 vmx_inject_pir(struct vlapic *vlapic)
3294 {
3295         struct vlapic_vtx *vlapic_vtx;
3296         struct pir_desc *pir_desc;
3297         struct LAPIC *lapic;
3298         uint64_t val, pirval;
3299         int rvi, pirbase = -1;
3300         uint16_t intr_status_old, intr_status_new;
3301
3302         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3303         pir_desc = vlapic_vtx->pir_desc;
3304         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
3305                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
3306                     "no posted interrupt pending");
3307                 return;
3308         }
3309
3310         pirval = 0;
3311         pirbase = -1;
3312         lapic = vlapic->apic_page;
3313
3314         val = atomic_readandclear_long(&pir_desc->pir[0]);
3315         if (val != 0) {
3316                 lapic->irr0 |= val;
3317                 lapic->irr1 |= val >> 32;
3318                 pirbase = 0;
3319                 pirval = val;
3320         }
3321
3322         val = atomic_readandclear_long(&pir_desc->pir[1]);
3323         if (val != 0) {
3324                 lapic->irr2 |= val;
3325                 lapic->irr3 |= val >> 32;
3326                 pirbase = 64;
3327                 pirval = val;
3328         }
3329
3330         val = atomic_readandclear_long(&pir_desc->pir[2]);
3331         if (val != 0) {
3332                 lapic->irr4 |= val;
3333                 lapic->irr5 |= val >> 32;
3334                 pirbase = 128;
3335                 pirval = val;
3336         }
3337
3338         val = atomic_readandclear_long(&pir_desc->pir[3]);
3339         if (val != 0) {
3340                 lapic->irr6 |= val;
3341                 lapic->irr7 |= val >> 32;
3342                 pirbase = 192;
3343                 pirval = val;
3344         }
3345
3346         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
3347
3348         /*
3349          * Update RVI so the processor can evaluate pending virtual
3350          * interrupts on VM-entry.
3351          *
3352          * It is possible for pirval to be 0 here, even though the
3353          * pending bit has been set. The scenario is:
3354          * CPU-Y is sending a posted interrupt to CPU-X, which
3355          * is running a guest and processing posted interrupts in h/w.
3356          * CPU-X will eventually exit and the state seen in s/w is
3357          * the pending bit set, but no PIR bits set.
3358          *
3359          *      CPU-X                      CPU-Y
3360          *   (vm running)                (host running)
3361          *   rx posted interrupt
3362          *   CLEAR pending bit
3363          *                               SET PIR bit
3364          *   READ/CLEAR PIR bits
3365          *                               SET pending bit
3366          *   (vm exit)
3367          *   pending bit set, PIR 0
3368          */
3369         if (pirval != 0) {
3370                 rvi = pirbase + flsl(pirval) - 1;
3371                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
3372                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
3373                 if (intr_status_new > intr_status_old) {
3374                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
3375                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
3376                             "guest_intr_status changed from 0x%04x to 0x%04x",
3377                             intr_status_old, intr_status_new);
3378                 }
3379         }
3380 }
3381
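/*
 * Allocate and initialize the vlapic for a vcpu, wiring it to the vcpu's
 * APIC page and posted interrupt descriptor and installing the APICv ops
 * handlers when virtual interrupt delivery and posted interrupts are
 * available.
 */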
3382 static struct vlapic *
3383 vmx_vlapic_init(void *arg, int vcpuid)
3384 {
3385         struct vmx *vmx;
3386         struct vlapic *vlapic;
3387         struct vlapic_vtx *vlapic_vtx;
3388         
3389         vmx = arg;
3390
3391         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
3392         vlapic->vm = vmx->vm;
3393         vlapic->vcpuid = vcpuid;
3394         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
3395
3396         vlapic_vtx = (struct vlapic_vtx *)vlapic;
3397         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
3398         vlapic_vtx->vmx = vmx;
3399
3400         if (virtual_interrupt_delivery) {
3401                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
3402                 vlapic->ops.pending_intr = vmx_pending_intr;
3403                 vlapic->ops.intr_accepted = vmx_intr_accepted;
3404                 vlapic->ops.set_tmr = vmx_set_tmr;
3405                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
3406         }
3407
3408         if (posted_interrupts)
3409                 vlapic->ops.post_intr = vmx_post_intr;
3410
3411         vlapic_init(vlapic);
3412
3413         return (vlapic);
3414 }
3415
3416 static void
3417 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
3418 {
3419
3420         vlapic_cleanup(vlapic);
3421         free(vlapic, M_VLAPIC);
3422 }
3423
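/*
 * Table of VT-x backend entry points exported to the machine-independent
 * vmm code.  The initializers are positional and must match the field
 * order of 'struct vmm_ops'.
 */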
3424 struct vmm_ops vmm_ops_intel = {
3425         vmx_init,
3426         vmx_cleanup,
3427         vmx_restore,
3428         vmx_vminit,
3429         vmx_run,
3430         vmx_vmcleanup,
3431         vmx_getreg,
3432         vmx_setreg,
3433         vmx_getdesc,
3434         vmx_setdesc,
3435         vmx_getcap,
3436         vmx_setcap,
3437         ept_vmspace_alloc,
3438         ept_vmspace_free,
3439         vmx_vlapic_init,
3440         vmx_vlapic_cleanup,
3441 };