1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/smp.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include "vmm_host.h"
54 #include "vmm_ipi.h"
55 #include "vmm_msr.h"
56 #include "vmm_ktr.h"
57 #include "vmm_stat.h"
58 #include "vlapic.h"
59 #include "vlapic_priv.h"
60
61 #include "vmx_msr.h"
62 #include "ept.h"
63 #include "vmx_cpufunc.h"
64 #include "vmx.h"
65 #include "x86.h"
66 #include "vmx_controls.h"
67
68 #define PINBASED_CTLS_ONE_SETTING                                       \
69         (PINBASED_EXTINT_EXITING        |                               \
70          PINBASED_NMI_EXITING           |                               \
71          PINBASED_VIRTUAL_NMI)
72 #define PINBASED_CTLS_ZERO_SETTING      0
73
74 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
75         (PROCBASED_INT_WINDOW_EXITING   |                               \
76          PROCBASED_NMI_WINDOW_EXITING)
77
78 #define PROCBASED_CTLS_ONE_SETTING                                      \
79         (PROCBASED_SECONDARY_CONTROLS   |                               \
80          PROCBASED_IO_EXITING           |                               \
81          PROCBASED_MSR_BITMAPS          |                               \
82          PROCBASED_CTLS_WINDOW_SETTING)
83 #define PROCBASED_CTLS_ZERO_SETTING     \
84         (PROCBASED_CR3_LOAD_EXITING |   \
85         PROCBASED_CR3_STORE_EXITING |   \
86         PROCBASED_IO_BITMAPS)
87
88 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
89 #define PROCBASED_CTLS2_ZERO_SETTING    0
90
91 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
92         (VM_EXIT_HOST_LMA                       |                       \
93         VM_EXIT_SAVE_EFER                       |                       \
94         VM_EXIT_LOAD_EFER)
95
96 #define VM_EXIT_CTLS_ONE_SETTING                                        \
97         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
98         VM_EXIT_ACKNOWLEDGE_INTERRUPT           |                       \
99         VM_EXIT_SAVE_PAT                        |                       \
100         VM_EXIT_LOAD_PAT)
101 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
102
103 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
104
105 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
106         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
107         VM_ENTRY_LOAD_PAT)
108 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
109         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
110         VM_ENTRY_INTO_SMM                       |                       \
111         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
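
/*
 * Each *_ONE_SETTING/*_ZERO_SETTING pair above names the VMX control
 * bits that are required to be 1 or 0 respectively.  vmx_init() passes
 * each pair to vmx_set_ctlreg(), which validates the request against
 * the corresponding VMX capability MSR and computes the final control
 * value that vmx_vminit() later writes into each vcpu's VMCS.
 */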
112
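/*
 * guest_msr_rw() clears the read and write intercepts for 'msr' in the
 * per-VM MSR bitmap so that guest RDMSR/WRMSR of that MSR no longer
 * cause a VM exit.
 */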
113 #define guest_msr_rw(vmx, msr) \
114         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
115
116 #define HANDLED         1
117 #define UNHANDLED       0
118
119 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
120 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
121
122 SYSCTL_DECL(_hw_vmm);
123 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
124
125 int vmxon_enabled[MAXCPU];
126 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
127
128 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
129 static uint32_t exit_ctls, entry_ctls;
130
131 static uint64_t cr0_ones_mask, cr0_zeros_mask;
132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
133              &cr0_ones_mask, 0, NULL);
134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
135              &cr0_zeros_mask, 0, NULL);
136
137 static uint64_t cr4_ones_mask, cr4_zeros_mask;
138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
139              &cr4_ones_mask, 0, NULL);
140 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
141              &cr4_zeros_mask, 0, NULL);
142
143 static int vmx_no_patmsr;
144
145 static int vmx_initialized;
146 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
147            &vmx_initialized, 0, "Intel VMX initialized");
148
149 /*
150  * Optional capabilities
151  */
152 static int cap_halt_exit;
153 static int cap_pause_exit;
154 static int cap_unrestricted_guest;
155 static int cap_monitor_trap;
156 static int cap_invpcid;
157
158 static int virtual_interrupt_delivery;
159 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
160     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
161
162 static int posted_interrupts;
163 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
164     &posted_interrupts, 0, "APICv posted interrupt support");
165
166 static int pirvec;
167 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
168     &pirvec, 0, "APICv posted interrupt vector");
169
170 static struct unrhdr *vpid_unr;
171 static u_int vpid_alloc_failed;
172 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
173             &vpid_alloc_failed, 0, NULL);
174
175 /*
176  * Use the last page below 4GB as the APIC access address. This address is
177  * occupied by the boot firmware so it is guaranteed that it will not conflict
178  * with a page in system memory.
179  */
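/*
 * When virtual interrupt delivery is in use, vmx_vminit() backs the
 * guest's local APIC page (DEFAULT_APIC_BASE) with this address in the
 * EPT and programs it as the VMCS APIC-access address, so guest APIC
 * accesses are recognized by the CPU and cause APIC-access VM exits
 * (see vmx_handle_apic_access()).
 */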
180 #define APIC_ACCESS_ADDRESS     0xFFFFF000
181
182 static void vmx_inject_pir(struct vlapic *vlapic);
183
184 #ifdef KTR
185 static const char *
186 exit_reason_to_str(int reason)
187 {
188         static char reasonbuf[32];
189
190         switch (reason) {
191         case EXIT_REASON_EXCEPTION:
192                 return "exception";
193         case EXIT_REASON_EXT_INTR:
194                 return "extint";
195         case EXIT_REASON_TRIPLE_FAULT:
196                 return "triplefault";
197         case EXIT_REASON_INIT:
198                 return "init";
199         case EXIT_REASON_SIPI:
200                 return "sipi";
201         case EXIT_REASON_IO_SMI:
202                 return "iosmi";
203         case EXIT_REASON_SMI:
204                 return "smi";
205         case EXIT_REASON_INTR_WINDOW:
206                 return "intrwindow";
207         case EXIT_REASON_NMI_WINDOW:
208                 return "nmiwindow";
209         case EXIT_REASON_TASK_SWITCH:
210                 return "taskswitch";
211         case EXIT_REASON_CPUID:
212                 return "cpuid";
213         case EXIT_REASON_GETSEC:
214                 return "getsec";
215         case EXIT_REASON_HLT:
216                 return "hlt";
217         case EXIT_REASON_INVD:
218                 return "invd";
219         case EXIT_REASON_INVLPG:
220                 return "invlpg";
221         case EXIT_REASON_RDPMC:
222                 return "rdpmc";
223         case EXIT_REASON_RDTSC:
224                 return "rdtsc";
225         case EXIT_REASON_RSM:
226                 return "rsm";
227         case EXIT_REASON_VMCALL:
228                 return "vmcall";
229         case EXIT_REASON_VMCLEAR:
230                 return "vmclear";
231         case EXIT_REASON_VMLAUNCH:
232                 return "vmlaunch";
233         case EXIT_REASON_VMPTRLD:
234                 return "vmptrld";
235         case EXIT_REASON_VMPTRST:
236                 return "vmptrst";
237         case EXIT_REASON_VMREAD:
238                 return "vmread";
239         case EXIT_REASON_VMRESUME:
240                 return "vmresume";
241         case EXIT_REASON_VMWRITE:
242                 return "vmwrite";
243         case EXIT_REASON_VMXOFF:
244                 return "vmxoff";
245         case EXIT_REASON_VMXON:
246                 return "vmxon";
247         case EXIT_REASON_CR_ACCESS:
248                 return "craccess";
249         case EXIT_REASON_DR_ACCESS:
250                 return "draccess";
251         case EXIT_REASON_INOUT:
252                 return "inout";
253         case EXIT_REASON_RDMSR:
254                 return "rdmsr";
255         case EXIT_REASON_WRMSR:
256                 return "wrmsr";
257         case EXIT_REASON_INVAL_VMCS:
258                 return "invalvmcs";
259         case EXIT_REASON_INVAL_MSR:
260                 return "invalmsr";
261         case EXIT_REASON_MWAIT:
262                 return "mwait";
263         case EXIT_REASON_MTF:
264                 return "mtf";
265         case EXIT_REASON_MONITOR:
266                 return "monitor";
267         case EXIT_REASON_PAUSE:
268                 return "pause";
269         case EXIT_REASON_MCE:
270                 return "mce";
271         case EXIT_REASON_TPR:
272                 return "tpr";
273         case EXIT_REASON_APIC_ACCESS:
274                 return "apic-access";
275         case EXIT_REASON_GDTR_IDTR:
276                 return "gdtridtr";
277         case EXIT_REASON_LDTR_TR:
278                 return "ldtrtr";
279         case EXIT_REASON_EPT_FAULT:
280                 return "eptfault";
281         case EXIT_REASON_EPT_MISCONFIG:
282                 return "eptmisconfig";
283         case EXIT_REASON_INVEPT:
284                 return "invept";
285         case EXIT_REASON_RDTSCP:
286                 return "rdtscp";
287         case EXIT_REASON_VMX_PREEMPT:
288                 return "vmxpreempt";
289         case EXIT_REASON_INVVPID:
290                 return "invvpid";
291         case EXIT_REASON_WBINVD:
292                 return "wbinvd";
293         case EXIT_REASON_XSETBV:
294                 return "xsetbv";
295         case EXIT_REASON_APIC_WRITE:
296                 return "apic-write";
297         default:
298                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
299                 return (reasonbuf);
300         }
301 }
302 #endif  /* KTR */
303
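/*
 * Force a CR0/CR4 value to respect the VMX fixed bits derived in
 * vmx_init() from the MSR_VMX_CRx_FIXED0/FIXED1 MSRs: bits in the
 * "ones" mask are forced on and bits in the "zeros" mask are forced
 * off.  The exact fixed bits are CPU-dependent; CR4_VMXE is a typical
 * member of the "ones" set.
 */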
304 u_long
305 vmx_fix_cr0(u_long cr0)
306 {
307
308         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
309 }
310
311 u_long
312 vmx_fix_cr4(u_long cr4)
313 {
314
315         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
316 }
317
318 static void
319 vpid_free(int vpid)
320 {
321         if (vpid < 0 || vpid > 0xffff)
322                 panic("vpid_free: invalid vpid %d", vpid);
323
324         /*
325          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
326          * the unit number allocator.
327          */
328
329         if (vpid > VM_MAXCPU)
330                 free_unr(vpid_unr, vpid);
331 }
332
333 static void
334 vpid_alloc(uint16_t *vpid, int num)
335 {
336         int i, x;
337
338         if (num <= 0 || num > VM_MAXCPU)
339                 panic("invalid number of vpids requested: %d", num);
340
341         /*
342          * If the "enable vpid" execution control is not enabled then the
343          * VPID is required to be 0 for all vcpus.
344          */
345         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
346                 for (i = 0; i < num; i++)
347                         vpid[i] = 0;
348                 return;
349         }
350
351         /*
352          * Allocate a unique VPID for each vcpu from the unit number allocator.
353          */
354         for (i = 0; i < num; i++) {
355                 x = alloc_unr(vpid_unr);
356                 if (x == -1)
357                         break;
358                 else
359                         vpid[i] = x;
360         }
361
362         if (i < num) {
363                 atomic_add_int(&vpid_alloc_failed, 1);
364
365                 /*
366                  * If the unit number allocator does not have enough unique
367                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
368                  *
369                  * These VPIDs are not unique across VMs but this does not
370                  * affect correctness because the combined mappings are also
371                  * tagged with the EP4TA which is unique for each VM.
372                  *
373                  * It is still sub-optimal because the invvpid will invalidate
374                  * combined mappings for a particular VPID across all EP4TAs.
375                  */
376                 while (i-- > 0)
377                         vpid_free(vpid[i]);
378
379                 for (i = 0; i < num; i++)
380                         vpid[i] = i + 1;
381         }
382 }
383
384 static void
385 vpid_init(void)
386 {
387         /*
388          * VPID 0 is required when the "enable VPID" execution control is
389          * disabled.
390          *
391          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
392          * unit number allocator does not have sufficient unique VPIDs to
393          * satisfy the allocation.
394          *
395          * The remaining VPIDs are managed by the unit number allocator.
396          */
397         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
398 }
399
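/*
 * Build the initial guest MSR save area for a vcpu.  The area is
 * handed to vmcs_set_msr_save() in vmx_vminit() so that the MSRs
 * listed here (currently only MSR_KGSBASE) are saved on VM exit and
 * restored on VM entry (see the KGSBASE discussion in vmx_vminit()).
 */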
400 static void
401 msr_save_area_init(struct msr_entry *g_area, int *g_count)
402 {
403         int cnt;
404
405         static struct msr_entry guest_msrs[] = {
406                 { MSR_KGSBASE, 0, 0 },
407         };
408
409         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
410         if (cnt > GUEST_MSR_MAX_ENTRIES)
411                 panic("guest msr save area overrun");
412         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
413         *g_count = cnt;
414 }
415
416 static void
417 vmx_disable(void *arg __unused)
418 {
419         struct invvpid_desc invvpid_desc = { 0 };
420         struct invept_desc invept_desc = { 0 };
421
422         if (vmxon_enabled[curcpu]) {
423                 /*
424                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
425                  *
426                  * VMXON or VMXOFF are not required to invalidate any TLB caching
427                  * structures, so invalidate all contexts explicitly here.  This
428                  * prevents retention of cached information between VMX episodes.
429                  */
430                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
431                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
432                 vmxoff();
433         }
434         load_cr4(rcr4() & ~CR4_VMXE);
435 }
436
437 static int
438 vmx_cleanup(void)
439 {
440         
441         if (pirvec != 0)
442                 vmm_ipi_free(pirvec);
443
444         if (vpid_unr != NULL) {
445                 delete_unrhdr(vpid_unr);
446                 vpid_unr = NULL;
447         }
448
449         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
450
451         return (0);
452 }
453
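/*
 * Per-cpu callback, run via smp_rendezvous() from vmx_init(), that
 * places the cpu in VMX operation: CR4.VMXE is set first, the VMXON
 * region is stamped with the VMCS revision identifier, and only then
 * is VMXON executed.
 */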
454 static void
455 vmx_enable(void *arg __unused)
456 {
457         int error;
458
459         load_cr4(rcr4() | CR4_VMXE);
460
461         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
462         error = vmxon(vmxon_region[curcpu]);
463         if (error == 0)
464                 vmxon_enabled[curcpu] = 1;
465 }
466
467 static void
468 vmx_restore(void)
469 {
470
471         if (vmxon_enabled[curcpu])
472                 vmxon(vmxon_region[curcpu]);
473 }
474
475 static int
476 vmx_init(int ipinum)
477 {
478         int error, use_tpr_shadow;
479         uint64_t fixed0, fixed1, feature_control;
480         uint32_t tmp, procbased2_vid_bits;
481
482         /* CPUID.1:ECX[bit 5] must be 1 for the processor to support VMX */
483         if (!(cpu_feature2 & CPUID2_VMX)) {
484                 printf("vmx_init: processor does not support VMX operation\n");
485                 return (ENXIO);
486         }
487
488         /*
489          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
490          * are set (bits 0 and 2 respectively).
491          */
492         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
493         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
494             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
495                 printf("vmx_init: VMX operation disabled by BIOS\n");
496                 return (ENXIO);
497         }
498
499         /* Check support for primary processor-based VM-execution controls */
500         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
501                                MSR_VMX_TRUE_PROCBASED_CTLS,
502                                PROCBASED_CTLS_ONE_SETTING,
503                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
504         if (error) {
505                 printf("vmx_init: processor does not support desired primary "
506                        "processor-based controls\n");
507                 return (error);
508         }
509
510         /* Clear the processor-based ctl bits that are set on demand */
511         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
512
513         /* Check support for secondary processor-based VM-execution controls */
514         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
515                                MSR_VMX_PROCBASED_CTLS2,
516                                PROCBASED_CTLS2_ONE_SETTING,
517                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
518         if (error) {
519                 printf("vmx_init: processor does not support desired secondary "
520                        "processor-based controls\n");
521                 return (error);
522         }
523
524         /* Check support for VPID */
525         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
526                                PROCBASED2_ENABLE_VPID, 0, &tmp);
527         if (error == 0)
528                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
529
530         /* Check support for pin-based VM-execution controls */
531         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
532                                MSR_VMX_TRUE_PINBASED_CTLS,
533                                PINBASED_CTLS_ONE_SETTING,
534                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
535         if (error) {
536                 printf("vmx_init: processor does not support desired "
537                        "pin-based controls\n");
538                 return (error);
539         }
540
541         /* Check support for VM-exit controls */
542         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
543                                VM_EXIT_CTLS_ONE_SETTING,
544                                VM_EXIT_CTLS_ZERO_SETTING,
545                                &exit_ctls);
546         if (error) {
547                 /* Try again without the PAT MSR bits */
548                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
549                                        MSR_VMX_TRUE_EXIT_CTLS,
550                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
551                                        VM_EXIT_CTLS_ZERO_SETTING,
552                                        &exit_ctls);
553                 if (error) {
554                         printf("vmx_init: processor does not support desired "
555                                "exit controls\n");
556                         return (error);
557                 } else {
558                         if (bootverbose)
559                                 printf("vmm: PAT MSR access not supported\n");
560                         guest_msr_valid(MSR_PAT);
561                         vmx_no_patmsr = 1;
562                 }
563         }
564
565         /* Check support for VM-entry controls */
566         if (!vmx_no_patmsr) {
567                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
568                                        MSR_VMX_TRUE_ENTRY_CTLS,
569                                        VM_ENTRY_CTLS_ONE_SETTING,
570                                        VM_ENTRY_CTLS_ZERO_SETTING,
571                                        &entry_ctls);
572         } else {
573                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
574                                        MSR_VMX_TRUE_ENTRY_CTLS,
575                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
576                                        VM_ENTRY_CTLS_ZERO_SETTING,
577                                        &entry_ctls);
578         }
579
580         if (error) {
581                 printf("vmx_init: processor does not support desired "
582                        "entry controls\n");
583                 return (error);
584         }
585
586         /*
587          * Check support for optional features by testing them
588          * as individual bits
589          */
590         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
591                                         MSR_VMX_TRUE_PROCBASED_CTLS,
592                                         PROCBASED_HLT_EXITING, 0,
593                                         &tmp) == 0);
594
595         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
596                                         MSR_VMX_PROCBASED_CTLS,
597                                         PROCBASED_MTF, 0,
598                                         &tmp) == 0);
599
600         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
601                                          MSR_VMX_TRUE_PROCBASED_CTLS,
602                                          PROCBASED_PAUSE_EXITING, 0,
603                                          &tmp) == 0);
604
605         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
606                                         MSR_VMX_PROCBASED_CTLS2,
607                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
608                                         &tmp) == 0);
609
610         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
611             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
612             &tmp) == 0);
613
614         /*
615          * Check support for virtual interrupt delivery.
616          */
617         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
618             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
619             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
620             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
621
622         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
623             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
624             &tmp) == 0);
625
626         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
627             procbased2_vid_bits, 0, &tmp);
628         if (error == 0 && use_tpr_shadow) {
629                 virtual_interrupt_delivery = 1;
630                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
631                     &virtual_interrupt_delivery);
632         }
633
634         if (virtual_interrupt_delivery) {
635                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
636                 procbased_ctls2 |= procbased2_vid_bits;
637                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
638
639                 /*
640                  * Check for Posted Interrupts only if Virtual Interrupt
641                  * Delivery is enabled.
642                  */
643                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
644                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
645                     &tmp);
646                 if (error == 0) {
647                         pirvec = vmm_ipi_alloc();
648                         if (pirvec == 0) {
649                                 if (bootverbose) {
650                                         printf("vmx_init: unable to allocate "
651                                             "posted interrupt vector\n");
652                                 }
653                         } else {
654                                 posted_interrupts = 1;
655                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
656                                     &posted_interrupts);
657                         }
658                 }
659         }
660
661         if (posted_interrupts)
662                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
663
664         /* Initialize EPT */
665         error = ept_init(ipinum);
666         if (error) {
667                 printf("vmx_init: ept initialization failed (%d)\n", error);
668                 return (error);
669         }
670
671         /*
672          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
673          */
674         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
675         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
676         cr0_ones_mask = fixed0 & fixed1;
677         cr0_zeros_mask = ~fixed0 & ~fixed1;
678
679         /*
680          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
681          * if unrestricted guest execution is allowed.
682          */
683         if (cap_unrestricted_guest)
684                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
685
686         /*
687          * Do not allow the guest to set CR0_NW or CR0_CD.
688          */
689         cr0_zeros_mask |= (CR0_NW | CR0_CD);
690
691         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
692         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
693         cr4_ones_mask = fixed0 & fixed1;
694         cr4_zeros_mask = ~fixed0 & ~fixed1;
695
696         vpid_init();
697
698         /* enable VMX operation */
699         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
700
701         vmx_initialized = 1;
702
703         return (0);
704 }
705
706 static void
707 vmx_trigger_hostintr(int vector)
708 {
709         uintptr_t func;
710         struct gate_descriptor *gd;
711
712         gd = &idt[vector];
713
714         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
715             "invalid vector %d", vector));
716         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
717             vector));
718         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
719             "has invalid type %d", vector, gd->gd_type));
720         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
721             "has invalid dpl %d", vector, gd->gd_dpl));
722         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
723             "for vector %d has invalid selector %d", vector, gd->gd_selector));
724         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
725             "IST %d", vector, gd->gd_ist));
726
727         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
728         vmx_call_isr(func);
729 }
730
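/*
 * Program the CR0/CR4 guest/host mask and read shadow for a vcpu.
 * Bits set in the mask are owned by the hypervisor: guest reads of
 * those bits return the shadow value written here and guest writes to
 * them cause a CR-access VM exit (handled by vmx_emulate_cr_access()).
 */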
731 static int
732 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
733 {
734         int error, mask_ident, shadow_ident;
735         uint64_t mask_value;
736
737         if (which != 0 && which != 4)
738                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
739
740         if (which == 0) {
741                 mask_ident = VMCS_CR0_MASK;
742                 mask_value = cr0_ones_mask | cr0_zeros_mask;
743                 shadow_ident = VMCS_CR0_SHADOW;
744         } else {
745                 mask_ident = VMCS_CR4_MASK;
746                 mask_value = cr4_ones_mask | cr4_zeros_mask;
747                 shadow_ident = VMCS_CR4_SHADOW;
748         }
749
750         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
751         if (error)
752                 return (error);
753
754         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
755         if (error)
756                 return (error);
757
758         return (0);
759 }
760 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
761 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
762
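/*
 * Per-VM initialization: allocate the page-aligned 'struct vmx' that
 * holds the per-vcpu VMCSs, the MSR bitmap and APIC pages, derive the
 * EPTP from the pmap, set up MSR bitmap permissions and VPIDs, and
 * program the execution controls and other fields of each vcpu VMCS.
 */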
763 static void *
764 vmx_vminit(struct vm *vm, pmap_t pmap)
765 {
766         uint16_t vpid[VM_MAXCPU];
767         int i, error, guest_msr_count;
768         struct vmx *vmx;
769         struct vmcs *vmcs;
770
771         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
772         if ((uintptr_t)vmx & PAGE_MASK) {
773                 panic("malloc of struct vmx not aligned on %d byte boundary",
774                       PAGE_SIZE);
775         }
776         vmx->vm = vm;
777
778         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
779
780         /*
781          * Clean up EPTP-tagged guest physical and combined mappings
782          *
783          * VMX transitions are not required to invalidate any guest physical
784          * mappings. So, it may be possible for stale guest physical mappings
785          * to be present in the processor TLBs.
786          *
787          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
788          */
789         ept_invalidate_mappings(vmx->eptp);
790
791         msr_bitmap_initialize(vmx->msr_bitmap);
792
793         /*
794          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
795          * The guest FSBASE and GSBASE are saved and restored during
796          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
797          * always restored from the vmcs host state area on vm-exit.
798          *
799          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
800          * how they are saved/restored so can be directly accessed by the
801          * guest.
802          *
803          * Guest KGSBASE is saved and restored in the guest MSR save area.
804          * Host KGSBASE is restored before returning to userland from the pcb.
805          * There will be a window of time when we are executing in the host
806          * kernel context with a value of KGSBASE from the guest. This is ok
807          * because the value of KGSBASE is inconsequential in kernel context.
808          *
809          * MSR_EFER is saved and restored in the guest VMCS area on a
810          * VM exit and entry respectively. It is also restored from the
811          * host VMCS area on a VM exit.
812          */
813         if (guest_msr_rw(vmx, MSR_GSBASE) ||
814             guest_msr_rw(vmx, MSR_FSBASE) ||
815             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
816             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
817             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
818             guest_msr_rw(vmx, MSR_KGSBASE) ||
819             guest_msr_rw(vmx, MSR_EFER))
820                 panic("vmx_vminit: error setting guest msr access");
821
822         /*
823          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
824          * and entry respectively. It is also restored from the host VMCS
825          * area on a VM exit. However, if running on a system with no
826          * MSR_PAT save/restore support, leave access disabled so accesses
827          * will be trapped.
828          */
829         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
830                 panic("vmx_vminit: error setting guest pat msr access");
831
832         vpid_alloc(vpid, VM_MAXCPU);
833
834         if (virtual_interrupt_delivery) {
835                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
836                     APIC_ACCESS_ADDRESS);
837                 /* XXX this should really return an error to the caller */
838                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
839         }
840
841         for (i = 0; i < VM_MAXCPU; i++) {
842                 vmcs = &vmx->vmcs[i];
843                 vmcs->identifier = vmx_revision();
844                 error = vmclear(vmcs);
845                 if (error != 0) {
846                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
847                               error, i);
848                 }
849
850                 error = vmcs_init(vmcs);
851                 KASSERT(error == 0, ("vmcs_init error %d", error));
852
853                 VMPTRLD(vmcs);
854                 error = 0;
855                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
856                 error += vmwrite(VMCS_EPTP, vmx->eptp);
857                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
858                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
859                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
860                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
861                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
862                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
863                 error += vmwrite(VMCS_VPID, vpid[i]);
864                 if (virtual_interrupt_delivery) {
865                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
866                         error += vmwrite(VMCS_VIRTUAL_APIC,
867                             vtophys(&vmx->apic_page[i]));
868                         error += vmwrite(VMCS_EOI_EXIT0, 0);
869                         error += vmwrite(VMCS_EOI_EXIT1, 0);
870                         error += vmwrite(VMCS_EOI_EXIT2, 0);
871                         error += vmwrite(VMCS_EOI_EXIT3, 0);
872                 }
873                 if (posted_interrupts) {
874                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
875                         error += vmwrite(VMCS_PIR_DESC,
876                             vtophys(&vmx->pir_desc[i]));
877                 }
878                 VMCLEAR(vmcs);
879                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
880
881                 vmx->cap[i].set = 0;
882                 vmx->cap[i].proc_ctls = procbased_ctls;
883                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
884
885                 vmx->state[i].lastcpu = -1;
886                 vmx->state[i].vpid = vpid[i];
887
888                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
889
890                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
891                     guest_msr_count);
892                 if (error != 0)
893                         panic("vmcs_set_msr_save error %d", error);
894
895                 /*
896                  * Set up the CR0/4 shadows, and init the read shadow
897                  * to the power-on register value from the Intel Sys Arch.
898                  *  CR0 - 0x60000010
899                  *  CR4 - 0
900                  */
901                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
902                 if (error != 0)
903                         panic("vmx_setup_cr0_shadow %d", error);
904
905                 error = vmx_setup_cr4_shadow(vmcs, 0);
906                 if (error != 0)
907                         panic("vmx_setup_cr4_shadow %d", error);
908
909                 vmx->ctx[i].pmap = pmap;
910         }
911
912         return (vmx);
913 }
914
915 static int
916 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
917 {
918         int handled, func;
919         
920         func = vmxctx->guest_rax;
921
922         handled = x86_emulate_cpuid(vm, vcpu,
923                                     (uint32_t*)(&vmxctx->guest_rax),
924                                     (uint32_t*)(&vmxctx->guest_rbx),
925                                     (uint32_t*)(&vmxctx->guest_rcx),
926                                     (uint32_t*)(&vmxctx->guest_rdx));
927         return (handled);
928 }
929
930 static __inline void
931 vmx_run_trace(struct vmx *vmx, int vcpu)
932 {
933 #ifdef KTR
934         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
935 #endif
936 }
937
938 static __inline void
939 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
940                int handled)
941 {
942 #ifdef KTR
943         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
944                  handled ? "handled" : "unhandled",
945                  exit_reason_to_str(exit_reason), rip);
946 #endif
947 }
948
949 static __inline void
950 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
951 {
952 #ifdef KTR
953         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
954 #endif
955 }
956
957 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
958
959 static void
960 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
961 {
962         struct vmxstate *vmxstate;
963         struct invvpid_desc invvpid_desc;
964
965         vmxstate = &vmx->state[vcpu];
966         if (vmxstate->lastcpu == curcpu)
967                 return;
968
969         vmxstate->lastcpu = curcpu;
970
971         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
972
973         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
974         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
975         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
976
977         /*
978          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
979          *
980          * We do this because this vcpu was executing on a different host
981          * cpu when it last ran. We do not track whether it invalidated
982          * mappings associated with its 'vpid' during that run. So we must
983          * assume that the mappings associated with 'vpid' on 'curcpu' are
984          * stale and invalidate them.
985          *
986          * Note that we incur this penalty only when the scheduler chooses to
987          * move the thread associated with this vcpu between host cpus.
988          *
989          * Note also that this will invalidate mappings tagged with 'vpid'
990          * for "all" EP4TAs.
991          */
992         if (vmxstate->vpid != 0) {
993                 if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
994                         invvpid_desc._res1 = 0;
995                         invvpid_desc._res2 = 0;
996                         invvpid_desc.vpid = vmxstate->vpid;
997                         invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
998                 } else {
999                         /*
1000                          * The invvpid can be skipped if an invept is going to
1001                          * be performed before entering the guest. The invept
1002                          * will invalidate combined mappings tagged with
1003                          * 'vmx->eptp' for all vpids.
1004                          */
1005                         vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
1006                 }
1007         }
1008 }
1009
1010 /*
1011  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
1012  */
1013 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
1014
1015 static void __inline
1016 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
1017 {
1018
1019         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
1020                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
1021                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1022                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1023         }
1024 }
1025
1026 static void __inline
1027 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
1028 {
1029
1030         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
1031             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
1032         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
1033         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1034         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1035 }
1036
1037 static void __inline
1038 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1039 {
1040
1041         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
1042                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1043                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1044                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1045         }
1046 }
1047
1048 static void __inline
1049 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1050 {
1051
1052         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
1053             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
1054         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1055         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1056         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1057 }
1058
1059 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
1060                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
1061 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
1062                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
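
/*
 * Guest interruptibility-state bits that block event injection: an NMI
 * cannot be injected while NMI or MOV-SS blocking is in effect, and a
 * hardware interrupt cannot be injected while STI or MOV-SS blocking
 * is in effect.  vmx_inject_nmi() and vmx_inject_interrupts() check
 * these masks before injecting.
 */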
1063
1064 static void
1065 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1066 {
1067         uint32_t gi, info;
1068
1069         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1070         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
1071             "interruptibility-state %#x", gi));
1072
1073         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1074         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
1075             "VM-entry interruption information %#x", info));
1076
1077         /*
1078          * Inject the virtual NMI. The vector must be the NMI IDT entry
1079          * or the VMCS entry check will fail.
1080          */
1081         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
1082         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1083
1084         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1085
1086         /* Clear the request */
1087         vm_nmi_clear(vmx->vm, vcpu);
1088 }
1089
1090 static void
1091 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1092 {
1093         int vector, need_nmi_exiting;
1094         uint64_t rflags;
1095         uint32_t gi, info;
1096
1097         if (vm_nmi_pending(vmx->vm, vcpu)) {
1098                 /*
1099                  * If there are no conditions blocking NMI injection then
1100                  * inject it directly here. Otherwise enable "NMI window
1101                  * exiting" to inject it as soon as we can.
1102                  *
1103                  * We also check for STI_BLOCKING because some implementations
1104                  * don't allow NMI injection in this case. If we are running
1105                  * on a processor that doesn't have this restriction it will
1106                  * immediately exit and the NMI will be injected in the
1107                  * "NMI window exiting" handler.
1108                  */
1109                 need_nmi_exiting = 1;
1110                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1111                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
1112                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1113                         if ((info & VMCS_INTR_VALID) == 0) {
1114                                 vmx_inject_nmi(vmx, vcpu);
1115                                 need_nmi_exiting = 0;
1116                         } else {
1117                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
1118                                     "due to VM-entry intr info %#x", info);
1119                         }
1120                 } else {
1121                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
1122                             "Guest Interruptibility-state %#x", gi);
1123                 }
1124
1125                 if (need_nmi_exiting)
1126                         vmx_set_nmi_window_exiting(vmx, vcpu);
1127         }
1128
1129         if (virtual_interrupt_delivery) {
1130                 vmx_inject_pir(vlapic);
1131                 return;
1132         }
1133
1134         /*
1135          * If interrupt-window exiting is already in effect then don't bother
1136          * checking for pending interrupts. This is just an optimization and
1137          * not needed for correctness.
1138          */
1139         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
1140                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
1141                     "pending int_window_exiting");
1142                 return;
1143         }
1144
1145         /* Ask the local apic for a vector to inject */
1146         if (!vlapic_pending_intr(vlapic, &vector))
1147                 return;
1148
1149         KASSERT(vector >= 32 && vector <= 255, ("invalid vector %d", vector));
1150
1151         /* Check RFLAGS.IF and the interruptibility state of the guest */
1152         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1153         if ((rflags & PSL_I) == 0) {
1154                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1155                     "rflags %#lx", vector, rflags);
1156                 goto cantinject;
1157         }
1158
1159         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1160         if (gi & HWINTR_BLOCKING) {
1161                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1162                     "Guest Interruptibility-state %#x", vector, gi);
1163                 goto cantinject;
1164         }
1165
1166         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1167         if (info & VMCS_INTR_VALID) {
1168                 /*
1169                  * This is expected and could happen for multiple reasons:
1170                  * - A vectoring VM-entry was aborted due to astpending
1171                  * - A VM-exit happened during event injection.
1172                  * - An NMI was injected above or after "NMI window exiting"
1173                  */
1174                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
1175                     "VM-entry intr info %#x", vector, info);
1176                 goto cantinject;
1177         }
1178
1179         /* Inject the interrupt */
1180         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
1181         info |= vector;
1182         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1183
1184         /* Update the Local APIC ISR */
1185         vlapic_intr_accepted(vlapic, vector);
1186
1187         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1188
1189         return;
1190
1191 cantinject:
1192         /*
1193          * Set the Interrupt Window Exiting execution control so we can inject
1194          * the interrupt as soon as the blocking condition goes away.
1195          */
1196         vmx_set_int_window_exiting(vmx, vcpu);
1197 }
1198
1199 /*
1200  * If the Virtual NMIs execution control is '1' then the logical processor
1201  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
1202  * the VMCS. An IRET instruction in VMX non-root operation will remove any
1203  * virtual-NMI blocking.
1204  *
1205  * This unblocking occurs even if the IRET causes a fault. In this case the
1206  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
1207  */
1208 static void
1209 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
1210 {
1211         uint32_t gi;
1212
1213         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
1214         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1215         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1216         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1217 }
1218
1219 static void
1220 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
1221 {
1222         uint32_t gi;
1223
1224         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
1225         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1226         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
1227         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
1228 }
1229
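/*
 * Emulate a guest "mov to %cr0/%cr4".  In the VM-exit qualification,
 * bits 3:0 encode the control register number, bits 5:4 the access
 * type (0 is "mov to CR") and bits 11:8 the source general-purpose
 * register, which is what the masks and shifts below decode.
 */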
1230 static int
1231 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1232 {
1233         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1234         uint64_t crval, regval, ones_mask, zeros_mask;
1235         const struct vmxctx *vmxctx;
1236
1237         /* We only handle mov to %cr0 or %cr4 at this time */
1238         if ((exitqual & 0xf0) != 0x00)
1239                 return (UNHANDLED);
1240
1241         cr = exitqual & 0xf;
1242         if (cr != 0 && cr != 4)
1243                 return (UNHANDLED);
1244
1245         vmxctx = &vmx->ctx[vcpu];
1246
1247         /*
1248          * We must use vmcs_write() directly here because vmcs_setreg() will
1249          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1250          */
1251         switch ((exitqual >> 8) & 0xf) {
1252         case 0:
1253                 regval = vmxctx->guest_rax;
1254                 break;
1255         case 1:
1256                 regval = vmxctx->guest_rcx;
1257                 break;
1258         case 2:
1259                 regval = vmxctx->guest_rdx;
1260                 break;
1261         case 3:
1262                 regval = vmxctx->guest_rbx;
1263                 break;
1264         case 4:
1265                 regval = vmcs_read(VMCS_GUEST_RSP);
1266                 break;
1267         case 5:
1268                 regval = vmxctx->guest_rbp;
1269                 break;
1270         case 6:
1271                 regval = vmxctx->guest_rsi;
1272                 break;
1273         case 7:
1274                 regval = vmxctx->guest_rdi;
1275                 break;
1276         case 8:
1277                 regval = vmxctx->guest_r8;
1278                 break;
1279         case 9:
1280                 regval = vmxctx->guest_r9;
1281                 break;
1282         case 10:
1283                 regval = vmxctx->guest_r10;
1284                 break;
1285         case 11:
1286                 regval = vmxctx->guest_r11;
1287                 break;
1288         case 12:
1289                 regval = vmxctx->guest_r12;
1290                 break;
1291         case 13:
1292                 regval = vmxctx->guest_r13;
1293                 break;
1294         case 14:
1295                 regval = vmxctx->guest_r14;
1296                 break;
1297         case 15:
1298                 regval = vmxctx->guest_r15;
1299                 break;
1300         }
1301
1302         if (cr == 0) {
1303                 ones_mask = cr0_ones_mask;
1304                 zeros_mask = cr0_zeros_mask;
1305                 vmcs_guest_cr = VMCS_GUEST_CR0;
1306                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1307         } else {
1308                 ones_mask = cr4_ones_mask;
1309                 zeros_mask = cr4_zeros_mask;
1310                 vmcs_guest_cr = VMCS_GUEST_CR4;
1311                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1312         }
1313         vmcs_write(vmcs_shadow_cr, regval);
1314
1315         crval = regval | ones_mask;
1316         crval &= ~zeros_mask;
1317         vmcs_write(vmcs_guest_cr, crval);
1318
1319         if (cr == 0 && regval & CR0_PG) {
1320                 uint64_t efer, entry_ctls;
1321
1322                 /*
1323                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1324                  * the "IA-32e mode guest" bit in VM-entry control must be
1325                  * equal.
1326                  */
1327                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1328                 if (efer & EFER_LME) {
1329                         efer |= EFER_LMA;
1330                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1331                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1332                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1333                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1334                 }
1335         }
1336
1337         return (HANDLED);
1338 }
1339
1340 static enum vie_cpu_mode
1341 vmx_cpu_mode(void)
1342 {
1343
1344         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
1345                 return (CPU_MODE_64BIT);
1346         else
1347                 return (CPU_MODE_COMPATIBILITY);
1348 }
1349
1350 static enum vie_paging_mode
1351 vmx_paging_mode(void)
1352 {
1353
1354         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
1355                 return (PAGING_MODE_FLAT);
1356         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
1357                 return (PAGING_MODE_32);
1358         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
1359                 return (PAGING_MODE_64);
1360         else
1361                 return (PAGING_MODE_PAE);
1362 }
1363
1364 static int
1365 ept_fault_type(uint64_t ept_qual)
1366 {
1367         int fault_type;
1368
1369         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1370                 fault_type = VM_PROT_WRITE;
1371         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1372                 fault_type = VM_PROT_EXECUTE;
1373         else
1374                 fault_type = VM_PROT_READ;
1375
1376         return (fault_type);
1377 }
1378
1379 static boolean_t
1380 ept_emulation_fault(uint64_t ept_qual)
1381 {
1382         int read, write;
1383
1384         /* EPT fault on an instruction fetch doesn't make sense here */
1385         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1386                 return (FALSE);
1387
1388         /* EPT fault must be a read fault or a write fault */
1389         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1390         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1391         if ((read | write) == 0)
1392                 return (FALSE);
1393
1394         /*
1395          * The EPT violation must have been caused by accessing a
1396          * guest-physical address that is a translation of a guest-linear
1397          * address.
1398          */
1399         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1400             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1401                 return (FALSE);
1402         }
1403
1404         return (TRUE);
1405 }
1406
1407 static int
1408 vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1409 {
1410         int error, handled, offset;
1411         bool retu;
1412
1413         if (!virtual_interrupt_delivery)
1414                 return (UNHANDLED);
1415
1416         handled = 1;
1417         offset = APIC_WRITE_OFFSET(qual);
1418         switch (offset) {
1419         case APIC_OFFSET_ID:
1420                 vlapic_id_write_handler(vlapic);
1421                 break;
1422         case APIC_OFFSET_LDR:
1423                 vlapic_ldr_write_handler(vlapic);
1424                 break;
1425         case APIC_OFFSET_DFR:
1426                 vlapic_dfr_write_handler(vlapic);
1427                 break;
1428         case APIC_OFFSET_SVR:
1429                 vlapic_svr_write_handler(vlapic);
1430                 break;
1431         case APIC_OFFSET_ESR:
1432                 vlapic_esr_write_handler(vlapic);
1433                 break;
1434         case APIC_OFFSET_ICR_LOW:
1435                 retu = false;
1436                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1437                 if (error != 0 || retu)
1438                         handled = 0;
1439                 break;
1440         case APIC_OFFSET_CMCI_LVT:
1441         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1442                 vlapic_lvt_write_handler(vlapic, offset);
1443                 break;
1444         case APIC_OFFSET_TIMER_ICR:
1445                 vlapic_icrtmr_write_handler(vlapic);
1446                 break;
1447         case APIC_OFFSET_TIMER_DCR:
1448                 vlapic_dcr_write_handler(vlapic);
1449                 break;
1450         default:
1451                 handled = 0;
1452                 break;
1453         }
1454         return (handled);
1455 }
1456
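     /*
      * Return true if the faulting guest-physical address lies within the
      * local APIC page at DEFAULT_APIC_BASE and virtual interrupt delivery
      * is enabled.
      */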
1457 static bool
1458 apic_access_fault(uint64_t gpa)
1459 {
1460
1461         if (virtual_interrupt_delivery &&
1462             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1463                 return (true);
1464         else
1465                 return (false);
1466 }
1467
1468 static int
1469 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1470 {
1471         uint64_t qual;
1472         int access_type, offset, allowed;
1473
1474         if (!virtual_interrupt_delivery)
1475                 return (UNHANDLED);
1476
1477         qual = vmexit->u.vmx.exit_qualification;
1478         access_type = APIC_ACCESS_TYPE(qual);
1479         offset = APIC_ACCESS_OFFSET(qual);
1480
1481         allowed = 0;
1482         if (access_type == 0) {
1483                 /*
1484                  * Read data access to the following registers is expected.
1485                  */
1486                 switch (offset) {
1487                 case APIC_OFFSET_APR:
1488                 case APIC_OFFSET_PPR:
1489                 case APIC_OFFSET_RRR:
1490                 case APIC_OFFSET_CMCI_LVT:
1491                 case APIC_OFFSET_TIMER_CCR:
1492                         allowed = 1;
1493                         break;
1494                 default:
1495                         break;
1496                 }
1497         } else if (access_type == 1) {
1498                 /*
1499                  * Write data access to the following registers is expected.
1500                  */
1501                 switch (offset) {
1502                 case APIC_OFFSET_VER:
1503                 case APIC_OFFSET_APR:
1504                 case APIC_OFFSET_PPR:
1505                 case APIC_OFFSET_RRR:
1506                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1507                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1508                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1509                 case APIC_OFFSET_CMCI_LVT:
1510                 case APIC_OFFSET_TIMER_CCR:
1511                         allowed = 1;
1512                         break;
1513                 default:
1514                         break;
1515                 }
1516         }
1517
1518         if (allowed) {
1519                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1520                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1521                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1522                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1523                 vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1524                 vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1525         }
1526
1527         /*
1528          * Regardless of whether the APIC-access is allowed this handler
1529          * always returns UNHANDLED:
1530          * - if the access is allowed then it is handled by emulating the
1531          *   instruction that caused the VM-exit (outside the critical section)
1532          * - if the access is not allowed then it will be converted to an
1533          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1534          */
1535         return (UNHANDLED);
1536 }
1537
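     /*
      * Process a VM exit. Returns 1 if the exit was handled in the kernel
      * and guest execution can be resumed, or 0 if it must be completed in
      * userland with the exitcode recorded in 'vmexit'.
      */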
1538 static int
1539 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1540 {
1541         int error, handled;
1542         struct vmxctx *vmxctx;
1543         struct vlapic *vlapic;
1544         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, reason;
1545         uint64_t qual, gpa;
1546         bool retu;
1547
1548         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
1549         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
1550
1551         handled = 0;
1552         vmxctx = &vmx->ctx[vcpu];
1553
1554         qual = vmexit->u.vmx.exit_qualification;
1555         reason = vmexit->u.vmx.exit_reason;
1556         vmexit->exitcode = VM_EXITCODE_BOGUS;
1557
1558         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1559
1560         /*
1561          * VM exits that could be triggered during event injection on the
1562          * previous VM entry need to be handled specially by re-injecting
1563          * the event.
1564          *
1565          * See "Information for VM Exits During Event Delivery" in Intel SDM
1566          * for details.
1567          */
1568         switch (reason) {
1569         case EXIT_REASON_EPT_FAULT:
1570         case EXIT_REASON_EPT_MISCONFIG:
1571         case EXIT_REASON_APIC_ACCESS:
1572         case EXIT_REASON_TASK_SWITCH:
1573         case EXIT_REASON_EXCEPTION:
1574                 idtvec_info = vmcs_idt_vectoring_info();
1575                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1576                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1577                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1578                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1579                                 idtvec_err = vmcs_idt_vectoring_err();
1580                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1581                                     idtvec_err);
1582                         }
1583                         /*
1584                          * If 'virtual NMIs' are being used and the VM-exit
1585                          * happened while injecting an NMI during the previous
1586                          * VM-entry, then clear "blocking by NMI" in the Guest
1587                          * Interruptibility-state.
1588                          */
1589                         if ((idtvec_info & VMCS_INTR_T_MASK) ==
1590                             VMCS_INTR_T_NMI) {
1591                                 vmx_clear_nmi_blocking(vmx, vcpu);
1592                         }
1593                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1594                 }
1595         default:
1596                 idtvec_info = 0;
1597                 break;
1598         }
1599
1600         switch (reason) {
1601         case EXIT_REASON_CR_ACCESS:
1602                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1603                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1604                 break;
1605         case EXIT_REASON_RDMSR:
1606                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1607                 retu = false;
1608                 ecx = vmxctx->guest_rcx;
1609                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1610                 if (error) {
1611                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1612                         vmexit->u.msr.code = ecx;
1613                 } else if (!retu) {
1614                         handled = 1;
1615                 } else {
1616                         /* Return to userspace with a valid exitcode */
1617                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1618                             ("emulate_rdmsr retu with bogus exitcode"));
1619                 }
1620                 break;
1621         case EXIT_REASON_WRMSR:
1622                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1623                 retu = false;
1624                 eax = vmxctx->guest_rax;
1625                 ecx = vmxctx->guest_rcx;
1626                 edx = vmxctx->guest_rdx;
1627                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1628                     (uint64_t)edx << 32 | eax, &retu);
1629                 if (error) {
1630                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1631                         vmexit->u.msr.code = ecx;
1632                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1633                 } else if (!retu) {
1634                         handled = 1;
1635                 } else {
1636                         /* Return to userspace with a valid exitcode */
1637                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1638                             ("emulate_wrmsr retu with bogus exitcode"));
1639                 }
1640                 break;
1641         case EXIT_REASON_HLT:
1642                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1643                 vmexit->exitcode = VM_EXITCODE_HLT;
1644                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1645                 break;
1646         case EXIT_REASON_MTF:
1647                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1648                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1649                 break;
1650         case EXIT_REASON_PAUSE:
1651                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1652                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1653                 break;
1654         case EXIT_REASON_INTR_WINDOW:
1655                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1656                 vmx_clear_int_window_exiting(vmx, vcpu);
1657                 return (1);
1658         case EXIT_REASON_EXT_INTR:
1659                 /*
1660                  * External interrupts serve only to cause VM exits and allow
1661                  * the host interrupt handler to run.
1662                  *
1663                  * If this external interrupt triggers a virtual interrupt
1664                  * to a VM, then that state will be recorded by the
1665                  * host interrupt handler in the VM's softc. We will inject
1666                  * this virtual interrupt during the subsequent VM enter.
1667                  */
1668                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1669                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
1670                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
1671                     ("VM exit interruption info invalid: %#x", intr_info));
1672                 vmx_trigger_hostintr(intr_info & 0xff);
1673
1674                 /*
1675                  * This is special. We want to treat this as a 'handled'
1676                  * VM-exit without incrementing the instruction pointer.
1677                  */
1678                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1679                 return (1);
1680         case EXIT_REASON_NMI_WINDOW:
1681                 /* Exit to allow the pending virtual NMI to be injected */
1682                 if (vm_nmi_pending(vmx->vm, vcpu))
1683                         vmx_inject_nmi(vmx, vcpu);
1684                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1685                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1686                 return (1);
1687         case EXIT_REASON_INOUT:
1688                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1689                 vmexit->exitcode = VM_EXITCODE_INOUT;
1690                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1691                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1692                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1693                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1694                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1695                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1696                 break;
1697         case EXIT_REASON_CPUID:
1698                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1699                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1700                 break;
1701         case EXIT_REASON_EXCEPTION:
1702                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
1703                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1704                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1705                     ("VM exit interruption info invalid: %#x", intr_info));
1706
1707                 /*
1708                  * If Virtual NMIs control is 1 and the VM-exit is due to a
1709                  * fault encountered during the execution of IRET then we must
1710                  * restore the state of "virtual-NMI blocking" before resuming
1711                  * the guest.
1712                  *
1713                  * See "Resuming Guest Software after Handling an Exception".
1714                  */
1715                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1716                     (intr_info & 0xff) != IDT_DF &&
1717                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
1718                         vmx_restore_nmi_blocking(vmx, vcpu);
1719
1720                 /*
1721                  * The NMI has already been handled in vmx_exit_handle_nmi().
1722                  */
1723                 if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
1724                         return (1);
1725                 break;
1726         case EXIT_REASON_EPT_FAULT:
1727                 /*
1728                  * If 'gpa' lies within the address space allocated to
1729                  * memory then this must be a nested page fault otherwise
1730                  * this must be an instruction that accesses MMIO space.
1731                  */
1732                 gpa = vmcs_gpa();
1733                 if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1734                         vmexit->exitcode = VM_EXITCODE_PAGING;
1735                         vmexit->u.paging.gpa = gpa;
1736                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1737                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1738                 } else if (ept_emulation_fault(qual)) {
1739                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1740                         vmexit->u.inst_emul.gpa = gpa;
1741                         vmexit->u.inst_emul.gla = vmcs_gla();
1742                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1743                         vmexit->u.inst_emul.cpu_mode = vmx_cpu_mode();
1744                         vmexit->u.inst_emul.paging_mode = vmx_paging_mode();
1745                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
1746                 }
1747                 /*
1748                  * If Virtual NMIs control is 1 and the VM-exit is due to an
1749                  * EPT fault during the execution of IRET then we must restore
1750                  * the state of "virtual-NMI blocking" before resuming.
1751                  *
1752                  * See description of "NMI unblocking due to IRET" in
1753                  * "Exit Qualification for EPT Violations".
1754                  */
1755                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
1756                     (qual & EXIT_QUAL_NMIUDTI) != 0)
1757                         vmx_restore_nmi_blocking(vmx, vcpu);
1758                 break;
1759         case EXIT_REASON_VIRTUALIZED_EOI:
1760                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
1761                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
1762                 vmexit->inst_length = 0;        /* trap-like */
1763                 break;
1764         case EXIT_REASON_APIC_ACCESS:
1765                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1766                 break;
1767         case EXIT_REASON_APIC_WRITE:
1768                 /*
1769                  * APIC-write VM exit is trap-like so the %rip is already
1770                  * pointing to the next instruction.
1771                  */
1772                 vmexit->inst_length = 0;
1773                 vlapic = vm_lapic(vmx->vm, vcpu);
1774                 handled = vmx_handle_apic_write(vlapic, qual);
1775                 break;
1776         default:
1777                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1778                 break;
1779         }
1780
1781         if (handled) {
1782                 /*
1783                  * It is possible that control is returned to userland
1784                  * even though we were able to handle the VM exit in the
1785                  * kernel.
1786                  *
1787                  * In such a case we want to make sure that the userland
1788                  * restarts guest execution at the instruction *after*
1789                  * the one we just processed. Therefore we update the
1790                  * guest rip in the VMCS and in 'vmexit'.
1791                  */
1792                 vmexit->rip += vmexit->inst_length;
1793                 vmexit->inst_length = 0;
1794                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1795         } else {
1796                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1797                         /*
1798                          * If this VM exit was not claimed by anybody then
1799                          * treat it as a generic VMX exit.
1800                          */
1801                         vmexit->exitcode = VM_EXITCODE_VMX;
1802                         vmexit->u.vmx.status = VM_SUCCESS;
1803                         vmexit->u.vmx.inst_type = 0;
1804                         vmexit->u.vmx.inst_error = 0;
1805                 } else {
1806                         /*
1807                          * The exitcode and collateral have been populated.
1808                          * The VM exit will be processed further in userland.
1809                          */
1810                 }
1811         }
1812         return (handled);
1813 }
1814
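     /*
      * An AST or reschedule request is pending on this thread: record a
      * bogus exit at the current guest %rip so the run loop is exited
      * without advancing the guest.
      */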
1815 static __inline int
1816 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1817 {
1818
1819         vmexit->rip = vmcs_guest_rip();
1820         vmexit->inst_length = 0;
1821         vmexit->exitcode = VM_EXITCODE_BOGUS;
1822         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1823         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1824
1825         return (HANDLED);
1826 }
1827
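     /*
      * A rendezvous is pending: record a rendezvous exit at the current
      * guest %rip so the vcpu leaves the run loop and can take part in it.
      */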
1828 static __inline int
1829 vmx_exit_rendezvous(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1830 {
1831
1832         vmexit->rip = vmcs_guest_rip();
1833         vmexit->inst_length = 0;
1834         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1835         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RENDEZVOUS, 1);
1836
1837         return (UNHANDLED);
1838 }
1839
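     /*
      * A VMX instruction issued on the vcpu's behalf (vmlaunch, vmresume or
      * invept) failed; report the failure to userland as a VM_EXITCODE_VMX
      * exit carrying the instruction error number.
      */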
1840 static __inline int
1841 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1842 {
1843
1844         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1845             ("vmx_exit_inst_error: invalid inst_fail_status %d",
1846             vmxctx->inst_fail_status));
1847
1848         vmexit->inst_length = 0;
1849         vmexit->exitcode = VM_EXITCODE_VMX;
1850         vmexit->u.vmx.status = vmxctx->inst_fail_status;
1851         vmexit->u.vmx.inst_error = vmcs_instruction_error();
1852         vmexit->u.vmx.exit_reason = ~0;
1853         vmexit->u.vmx.exit_qualification = ~0;
1854
1855         switch (rc) {
1856         case VMX_VMRESUME_ERROR:
1857         case VMX_VMLAUNCH_ERROR:
1858         case VMX_INVEPT_ERROR:
1859                 vmexit->u.vmx.inst_type = rc;
1860                 break;
1861         default:
1862                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
1863         }
1864
1865         return (UNHANDLED);
1866 }
1867
1868 /*
1869  * If the NMI-exiting VM execution control is set to '1' then an NMI in
1870  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
1871  * sufficient to simply vector to the NMI handler via a software interrupt.
1872  * However, this must be done before maskable interrupts are enabled;
1873  * otherwise the "iret" issued by an interrupt handler will incorrectly
1874  * clear NMI blocking.
1875  */
1876 static __inline void
1877 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1878 {
1879         uint32_t intr_info;
1880
1881         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
1882
1883         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
1884                 return;
1885
1886         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
1887         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
1888             ("VM exit interruption info invalid: %#x", intr_info));
1889
1890         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
1891                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
1892                     "to NMI has invalid vector: %#x", intr_info));
1893                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
1894                 __asm __volatile("int $2");
1895         }
1896 }
1897
1898 static int
1899 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap,
1900     void *rendezvous_cookie)
1901 {
1902         int rc, handled, launched;
1903         struct vmx *vmx;
1904         struct vm *vm;
1905         struct vmxctx *vmxctx;
1906         struct vmcs *vmcs;
1907         struct vm_exit *vmexit;
1908         struct vlapic *vlapic;
1909         uint64_t rip;
1910         uint32_t exit_reason;
1911
1912         vmx = arg;
1913         vm = vmx->vm;
1914         vmcs = &vmx->vmcs[vcpu];
1915         vmxctx = &vmx->ctx[vcpu];
1916         vlapic = vm_lapic(vm, vcpu);
1917         vmexit = vm_exitinfo(vm, vcpu);
1918         launched = 0;
1919
1920         KASSERT(vmxctx->pmap == pmap,
1921             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1922
1923         VMPTRLD(vmcs);
1924
1925         /*
1926          * XXX
1927          * We do this every time because we may set up the virtual machine
1928          * from a different process than the one that actually runs it.
1929          *
1930          * If the life of a virtual machine was spent entirely in the context
1931          * of a single process we could do this once in vmx_vminit().
1932          */
1933         vmcs_write(VMCS_HOST_CR3, rcr3());
1934
1935         vmcs_write(VMCS_GUEST_RIP, startrip);
1936         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
1937         do {
1938                 /*
1939                  * Interrupts are disabled from this point on until the
1940                  * guest starts executing. This is done for the following
1941                  * reasons:
1942                  *
1943                  * If an AST is asserted on this thread after the check below,
1944                  * then the IPI_AST notification will not be lost, because it
1945                  * will cause a VM exit due to external interrupt as soon as
1946                  * the guest state is loaded.
1947                  *
1948                  * A posted interrupt after 'vmx_inject_interrupts()' will
1949                  * not be "lost" because it will be held pending in the host
1950                  * APIC because interrupts are disabled. The pending interrupt
1951                  * will be recognized as soon as the guest state is loaded.
1952                  *
1953                  * The same reasoning applies to the IPI generated by
1954                  * pmap_invalidate_ept().
1955                  */
1956                 disable_intr();
1957                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1958                         enable_intr();
1959                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
1960                         break;
1961                 }
1962
1963                 if (vcpu_rendezvous_pending(rendezvous_cookie)) {
1964                         enable_intr();
1965                         handled = vmx_exit_rendezvous(vmx, vcpu, vmexit);
1966                         break;
1967                 }
1968
1969                 vmx_inject_interrupts(vmx, vcpu, vlapic);
1970                 vmx_run_trace(vmx, vcpu);
1971                 rc = vmx_enter_guest(vmxctx, vmx, launched);
1972
1973                 /* Collect some information for VM exit processing */
1974                 vmexit->rip = rip = vmcs_guest_rip();
1975                 vmexit->inst_length = vmexit_instruction_length();
1976                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1977                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1978
1979                 if (rc == VMX_GUEST_VMEXIT) {
1980                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
1981                         enable_intr();
1982                         handled = vmx_exit_process(vmx, vcpu, vmexit);
1983                 } else {
1984                         enable_intr();
1985                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
1986                 }
1987                 launched = 1;
1988                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1989         } while (handled);
1990
1991         /*
1992          * If a VM exit has been handled then the exitcode must be BOGUS.
1993          * If a VM exit is not handled then the exitcode must not be BOGUS.
1994          */
1995         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1996             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1997                 panic("Mismatch between handled (%d) and exitcode (%d)",
1998                       handled, vmexit->exitcode);
1999         }
2000
2001         if (!handled)
2002                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
2003
2004         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
2005             vmexit->exitcode);
2006
2007         VMCLEAR(vmcs);
2008         return (0);
2009 }
2010
2011 static void
2012 vmx_vmcleanup(void *arg)
2013 {
2014         int i, error;
2015         struct vmx *vmx = arg;
2016
2017         if (virtual_interrupt_delivery)
2018                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
2019
2020         for (i = 0; i < VM_MAXCPU; i++)
2021                 vpid_free(vmx->state[i].vpid);
2022
2023         /*
2024          * XXXSMP we also need to clear the VMCS active on the other vcpus.
2025          */
2026         error = vmclear(&vmx->vmcs[0]);
2027         if (error != 0)
2028                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
2029
2030         free(vmx, M_VMX);
2031
2032         return;
2033 }
2034
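     /*
      * Return a pointer to the saved copy of guest register 'reg' in the
      * software register context, or NULL if the register is not kept in
      * 'vmxctx' (in which case it is accessed through the VMCS).
      */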
2035 static register_t *
2036 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
2037 {
2038
2039         switch (reg) {
2040         case VM_REG_GUEST_RAX:
2041                 return (&vmxctx->guest_rax);
2042         case VM_REG_GUEST_RBX:
2043                 return (&vmxctx->guest_rbx);
2044         case VM_REG_GUEST_RCX:
2045                 return (&vmxctx->guest_rcx);
2046         case VM_REG_GUEST_RDX:
2047                 return (&vmxctx->guest_rdx);
2048         case VM_REG_GUEST_RSI:
2049                 return (&vmxctx->guest_rsi);
2050         case VM_REG_GUEST_RDI:
2051                 return (&vmxctx->guest_rdi);
2052         case VM_REG_GUEST_RBP:
2053                 return (&vmxctx->guest_rbp);
2054         case VM_REG_GUEST_R8:
2055                 return (&vmxctx->guest_r8);
2056         case VM_REG_GUEST_R9:
2057                 return (&vmxctx->guest_r9);
2058         case VM_REG_GUEST_R10:
2059                 return (&vmxctx->guest_r10);
2060         case VM_REG_GUEST_R11:
2061                 return (&vmxctx->guest_r11);
2062         case VM_REG_GUEST_R12:
2063                 return (&vmxctx->guest_r12);
2064         case VM_REG_GUEST_R13:
2065                 return (&vmxctx->guest_r13);
2066         case VM_REG_GUEST_R14:
2067                 return (&vmxctx->guest_r14);
2068         case VM_REG_GUEST_R15:
2069                 return (&vmxctx->guest_r15);
2070         default:
2071                 break;
2072         }
2073         return (NULL);
2074 }
2075
2076 static int
2077 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
2078 {
2079         register_t *regp;
2080
2081         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2082                 *retval = *regp;
2083                 return (0);
2084         } else
2085                 return (EINVAL);
2086 }
2087
2088 static int
2089 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
2090 {
2091         register_t *regp;
2092
2093         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
2094                 *regp = val;
2095                 return (0);
2096         } else
2097                 return (EINVAL);
2098 }
2099
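     /*
      * Return the VMCS shadow field for 'reg' (%cr0 and %cr4 only), or -1
      * if the register has no shadow.
      */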
2100 static int
2101 vmx_shadow_reg(int reg)
2102 {
2103         int shreg;
2104
2105         shreg = -1;
2106
2107         switch (reg) {
2108         case VM_REG_GUEST_CR0:
2109                 shreg = VMCS_CR0_SHADOW;
2110                 break;
2111         case VM_REG_GUEST_CR4:
2112                 shreg = VMCS_CR4_SHADOW;
2113                 break;
2114         default:
2115                 break;
2116         }
2117
2118         return (shreg);
2119 }
2120
2121 static int
2122 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
2123 {
2124         int running, hostcpu;
2125         struct vmx *vmx = arg;
2126
2127         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2128         if (running && hostcpu != curcpu)
2129                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
2130
2131         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
2132                 return (0);
2133
2134         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
2135 }
2136
2137 static int
2138 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
2139 {
2140         int error, hostcpu, running, shadow;
2141         uint64_t ctls;
2142         struct vmx *vmx = arg;
2143
2144         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
2145         if (running && hostcpu != curcpu)
2146                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
2147
2148         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
2149                 return (0);
2150
2151         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
2152
2153         if (error == 0) {
2154                 /*
2155                  * If the "load EFER" VM-entry control is 1 then the
2156                  * value of EFER.LMA must be identical to "IA-32e mode guest"
2157                  * bit in the VM-entry control.
2158                  */
2159                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
2160                     (reg == VM_REG_GUEST_EFER)) {
2161                         vmcs_getreg(&vmx->vmcs[vcpu], running,
2162                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
2163                         if (val & EFER_LMA)
2164                                 ctls |= VM_ENTRY_GUEST_LMA;
2165                         else
2166                                 ctls &= ~VM_ENTRY_GUEST_LMA;
2167                         vmcs_setreg(&vmx->vmcs[vcpu], running,
2168                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
2169                 }
2170
2171                 shadow = vmx_shadow_reg(reg);
2172                 if (shadow > 0) {
2173                         /*
2174                          * Store the unmodified value in the shadow.
2175                          */
2176                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
2177                                     VMCS_IDENT(shadow), val);
2178                 }
2179         }
2180
2181         return (error);
2182 }
2183
2184 static int
2185 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2186 {
2187         struct vmx *vmx = arg;
2188
2189         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
2190 }
2191
2192 static int
2193 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2194 {
2195         struct vmx *vmx = arg;
2196
2197         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
2198 }
2199
2200 static int
2201 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
2202            int code_valid)
2203 {
2204         int error;
2205         uint64_t info;
2206         struct vmx *vmx = arg;
2207         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2208
2209         static uint32_t type_map[VM_EVENT_MAX] = {
2210                 0x1,            /* VM_EVENT_NONE */
2211                 0x0,            /* VM_HW_INTR */
2212                 0x2,            /* VM_NMI */
2213                 0x3,            /* VM_HW_EXCEPTION */
2214                 0x4,            /* VM_SW_INTR */
2215                 0x5,            /* VM_PRIV_SW_EXCEPTION */
2216                 0x6,            /* VM_SW_EXCEPTION */
2217         };
2218
2219         /*
2220          * If there is already an exception pending to be delivered to the
2221          * vcpu then just return.
2222          */
2223         error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
2224         if (error)
2225                 return (error);
2226
2227         if (info & VMCS_INTR_VALID)
2228                 return (EAGAIN);
2229
2230         info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
2231         info |= VMCS_INTR_VALID;
2232         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
2233         if (error != 0)
2234                 return (error);
2235
2236         if (code_valid) {
2237                 error = vmcs_setreg(vmcs, 0,
2238                                     VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
2239                                     code);
2240         }
2241         return (error);
2242 }
2243
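     /*
      * Report whether optional capability 'type' is supported by the
      * hardware and, if so, whether it is currently enabled for the vcpu.
      * Returns ENOENT for unsupported capabilities.
      */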
2244 static int
2245 vmx_getcap(void *arg, int vcpu, int type, int *retval)
2246 {
2247         struct vmx *vmx = arg;
2248         int vcap;
2249         int ret;
2250
2251         ret = ENOENT;
2252
2253         vcap = vmx->cap[vcpu].set;
2254
2255         switch (type) {
2256         case VM_CAP_HALT_EXIT:
2257                 if (cap_halt_exit)
2258                         ret = 0;
2259                 break;
2260         case VM_CAP_PAUSE_EXIT:
2261                 if (cap_pause_exit)
2262                         ret = 0;
2263                 break;
2264         case VM_CAP_MTRAP_EXIT:
2265                 if (cap_monitor_trap)
2266                         ret = 0;
2267                 break;
2268         case VM_CAP_UNRESTRICTED_GUEST:
2269                 if (cap_unrestricted_guest)
2270                         ret = 0;
2271                 break;
2272         case VM_CAP_ENABLE_INVPCID:
2273                 if (cap_invpcid)
2274                         ret = 0;
2275                 break;
2276         default:
2277                 break;
2278         }
2279
2280         if (ret == 0)
2281                 *retval = (vcap & (1 << type)) ? 1 : 0;
2282
2283         return (ret);
2284 }
2285
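     /*
      * Enable or disable optional capability 'type' for the vcpu by
      * toggling the corresponding flag in the primary or secondary
      * processor-based VM-execution controls.
      */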
2286 static int
2287 vmx_setcap(void *arg, int vcpu, int type, int val)
2288 {
2289         struct vmx *vmx = arg;
2290         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2291         uint32_t baseval;
2292         uint32_t *pptr;
2293         int error;
2294         int flag;
2295         int reg;
2296         int retval;
2297
2298         retval = ENOENT;
2299         pptr = NULL;
2300
2301         switch (type) {
2302         case VM_CAP_HALT_EXIT:
2303                 if (cap_halt_exit) {
2304                         retval = 0;
2305                         pptr = &vmx->cap[vcpu].proc_ctls;
2306                         baseval = *pptr;
2307                         flag = PROCBASED_HLT_EXITING;
2308                         reg = VMCS_PRI_PROC_BASED_CTLS;
2309                 }
2310                 break;
2311         case VM_CAP_MTRAP_EXIT:
2312                 if (cap_monitor_trap) {
2313                         retval = 0;
2314                         pptr = &vmx->cap[vcpu].proc_ctls;
2315                         baseval = *pptr;
2316                         flag = PROCBASED_MTF;
2317                         reg = VMCS_PRI_PROC_BASED_CTLS;
2318                 }
2319                 break;
2320         case VM_CAP_PAUSE_EXIT:
2321                 if (cap_pause_exit) {
2322                         retval = 0;
2323                         pptr = &vmx->cap[vcpu].proc_ctls;
2324                         baseval = *pptr;
2325                         flag = PROCBASED_PAUSE_EXITING;
2326                         reg = VMCS_PRI_PROC_BASED_CTLS;
2327                 }
2328                 break;
2329         case VM_CAP_UNRESTRICTED_GUEST:
2330                 if (cap_unrestricted_guest) {
2331                         retval = 0;
2332                         pptr = &vmx->cap[vcpu].proc_ctls2;
2333                         baseval = *pptr;
2334                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2335                         reg = VMCS_SEC_PROC_BASED_CTLS;
2336                 }
2337                 break;
2338         case VM_CAP_ENABLE_INVPCID:
2339                 if (cap_invpcid) {
2340                         retval = 0;
2341                         pptr = &vmx->cap[vcpu].proc_ctls2;
2342                         baseval = *pptr;
2343                         flag = PROCBASED2_ENABLE_INVPCID;
2344                         reg = VMCS_SEC_PROC_BASED_CTLS;
2345                 }
2346                 break;
2347         default:
2348                 break;
2349         }
2350
2351         if (retval == 0) {
2352                 if (val) {
2353                         baseval |= flag;
2354                 } else {
2355                         baseval &= ~flag;
2356                 }
2357                 VMPTRLD(vmcs);
2358                 error = vmwrite(reg, baseval);
2359                 VMCLEAR(vmcs);
2360
2361                 if (error) {
2362                         retval = error;
2363                 } else {
2364                         /*
2365                          * Update optional stored flags, and record
2366                          * setting
2367                          */
2368                         if (pptr != NULL) {
2369                                 *pptr = baseval;
2370                         }
2371
2372                         if (val) {
2373                                 vmx->cap[vcpu].set |= (1 << type);
2374                         } else {
2375                                 vmx->cap[vcpu].set &= ~(1 << type);
2376                         }
2377                 }
2378         }
2379
2380         return (retval);
2381 }
2382
2383 struct vlapic_vtx {
2384         struct vlapic   vlapic;
2385         struct pir_desc *pir_desc;
2386         struct vmx      *vmx;
2387 };
2388
2389 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2390 do {                                                                    \
2391         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2392             level ? "level" : "edge", vector);                          \
2393         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2394         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2395         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2396         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2397         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2398 } while (0)
2399
2400 /*
2401  * vlapic->ops handlers that utilize the APICv hardware assist described in
2402  * Chapter 29 of the Intel SDM.
2403  */
2404 static int
2405 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2406 {
2407         struct vlapic_vtx *vlapic_vtx;
2408         struct pir_desc *pir_desc;
2409         uint64_t mask;
2410         int idx, notify;
2411
2412         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2413         pir_desc = vlapic_vtx->pir_desc;
2414
2415         /*
2416          * Keep track of interrupt requests in the PIR descriptor. This is
2417          * because the virtual APIC page pointed to by the VMCS cannot be
2418          * modified if the vcpu is running.
2419          */
2420         idx = vector / 64;
2421         mask = 1UL << (vector % 64);
2422         atomic_set_long(&pir_desc->pir[idx], mask);
2423         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2424
2425         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2426             level, "vmx_set_intr_ready");
2427         return (notify);
2428 }
2429
2430 static int
2431 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2432 {
2433         struct vlapic_vtx *vlapic_vtx;
2434         struct pir_desc *pir_desc;
2435         struct LAPIC *lapic;
2436         uint64_t pending, pirval;
2437         uint32_t ppr, vpr;
2438         int i;
2439
2440         /*
2441          * This function is only expected to be called from the 'HLT' exit
2442          * handler which does not care about the vector that is pending.
2443          */
2444         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2445
2446         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2447         pir_desc = vlapic_vtx->pir_desc;
2448
2449         pending = atomic_load_acq_long(&pir_desc->pending);
2450         if (!pending)
2451                 return (0);     /* common case */
2452
2453         /*
2454          * If there is an interrupt pending then it will be recognized only
2455          * if its priority is greater than the processor priority.
2456          *
2457          * Special case: if the processor priority is zero then any pending
2458          * interrupt will be recognized.
2459          */
2460         lapic = vlapic->apic_page;
2461         ppr = lapic->ppr & 0xf0;
2462         if (ppr == 0)
2463                 return (1);
2464
2465         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2466             lapic->ppr);
2467
2468         for (i = 3; i >= 0; i--) {
2469                 pirval = pir_desc->pir[i];
2470                 if (pirval != 0) {
2471                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2472                         return (vpr > ppr);
2473                 }
2474         }
2475         return (0);
2476 }
2477
2478 static void
2479 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2480 {
2481
2482         panic("vmx_intr_accepted: not expected to be called");
2483 }
2484
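     /*
      * Update the EOI-exit bitmap entry for 'vector' to match its trigger
      * mode: set the bit for a level-triggered interrupt and clear it for
      * an edge-triggered one.
      */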
2485 static void
2486 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
2487 {
2488         struct vlapic_vtx *vlapic_vtx;
2489         struct vmx *vmx;
2490         struct vmcs *vmcs;
2491         uint64_t mask, val;
2492
2493         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
2494         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
2495             ("vmx_set_tmr: vcpu cannot be running"));
2496
2497         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2498         vmx = vlapic_vtx->vmx;
2499         vmcs = &vmx->vmcs[vlapic->vcpuid];
2500         mask = 1UL << (vector % 64);
2501
2502         VMPTRLD(vmcs);
2503         val = vmcs_read(VMCS_EOI_EXIT(vector));
2504         if (level)
2505                 val |= mask;
2506         else
2507                 val &= ~mask;
2508         vmcs_write(VMCS_EOI_EXIT(vector), val);
2509         VMCLEAR(vmcs);
2510 }
2511
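     /*
      * Send the posted-interrupt notification vector to the host cpu that
      * the vcpu is running on.
      */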
2512 static void
2513 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
2514 {
2515
2516         ipi_cpu(hostcpu, pirvec);
2517 }
2518
2519 /*
2520  * Transfer the pending interrupts in the PIR descriptor to the IRR
2521  * in the virtual APIC page.
2522  */
2523 static void
2524 vmx_inject_pir(struct vlapic *vlapic)
2525 {
2526         struct vlapic_vtx *vlapic_vtx;
2527         struct pir_desc *pir_desc;
2528         struct LAPIC *lapic;
2529         uint64_t val, pirval;
2530         int rvi, pirbase;
2531         uint16_t intr_status_old, intr_status_new;
2532
2533         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2534         pir_desc = vlapic_vtx->pir_desc;
2535         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2536                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2537                     "no posted interrupt pending");
2538                 return;
2539         }
2540
2541         pirval = 0;
2542         lapic = vlapic->apic_page;
2543
2544         val = atomic_readandclear_long(&pir_desc->pir[0]);
2545         if (val != 0) {
2546                 lapic->irr0 |= val;
2547                 lapic->irr1 |= val >> 32;
2548                 pirbase = 0;
2549                 pirval = val;
2550         }
2551
2552         val = atomic_readandclear_long(&pir_desc->pir[1]);
2553         if (val != 0) {
2554                 lapic->irr2 |= val;
2555                 lapic->irr3 |= val >> 32;
2556                 pirbase = 64;
2557                 pirval = val;
2558         }
2559
2560         val = atomic_readandclear_long(&pir_desc->pir[2]);
2561         if (val != 0) {
2562                 lapic->irr4 |= val;
2563                 lapic->irr5 |= val >> 32;
2564                 pirbase = 128;
2565                 pirval = val;
2566         }
2567
2568         val = atomic_readandclear_long(&pir_desc->pir[3]);
2569         if (val != 0) {
2570                 lapic->irr6 |= val;
2571                 lapic->irr7 |= val >> 32;
2572                 pirbase = 192;
2573                 pirval = val;
2574         }
2575         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2576
2577         /*
2578          * Update RVI so the processor can evaluate pending virtual
2579          * interrupts on VM-entry.
2580          */
2581         if (pirval != 0) {
2582                 rvi = pirbase + flsl(pirval) - 1;
2583                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2584                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2585                 if (intr_status_new > intr_status_old) {
2586                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2587                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2588                             "guest_intr_status changed from 0x%04x to 0x%04x",
2589                             intr_status_old, intr_status_new);
2590                 }
2591         }
2592 }
2593
2594 static struct vlapic *
2595 vmx_vlapic_init(void *arg, int vcpuid)
2596 {
2597         struct vmx *vmx;
2598         struct vlapic *vlapic;
2599         struct vlapic_vtx *vlapic_vtx;
2600
2601         vmx = arg;
2602
2603         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2604         vlapic->vm = vmx->vm;
2605         vlapic->vcpuid = vcpuid;
2606         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2607
2608         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2609         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
2610         vlapic_vtx->vmx = vmx;
2611
2612         if (virtual_interrupt_delivery) {
2613                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2614                 vlapic->ops.pending_intr = vmx_pending_intr;
2615                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2616                 vlapic->ops.set_tmr = vmx_set_tmr;
2617         }
2618
2619         if (posted_interrupts)
2620                 vlapic->ops.post_intr = vmx_post_intr;
2621
2622         vlapic_init(vlapic);
2623
2624         return (vlapic);
2625 }
2626
2627 static void
2628 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2629 {
2630
2631         vlapic_cleanup(vlapic);
2632         free(vlapic, M_VLAPIC);
2633 }
2634
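     /*
      * VT-x/EPT implementation of the vmm_ops interface used by the
      * generic vmm code.
      */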
2635 struct vmm_ops vmm_ops_intel = {
2636         vmx_init,
2637         vmx_cleanup,
2638         vmx_restore,
2639         vmx_vminit,
2640         vmx_run,
2641         vmx_vmcleanup,
2642         vmx_getreg,
2643         vmx_setreg,
2644         vmx_getdesc,
2645         vmx_setdesc,
2646         vmx_inject,
2647         vmx_getcap,
2648         vmx_setcap,
2649         ept_vmspace_alloc,
2650         ept_vmspace_free,
2651         vmx_vlapic_init,
2652         vmx_vlapic_cleanup,
2653 };