1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/segments.h>
48 #include <machine/specialreg.h>
49 #include <machine/vmparam.h>
50
51 #include <machine/vmm.h>
52 #include "vmm_host.h"
53 #include "vmm_msr.h"
54 #include "vmm_ktr.h"
55 #include "vmm_stat.h"
56 #include "vlapic.h"
57 #include "vlapic_priv.h"
58
59 #include "vmx_msr.h"
60 #include "ept.h"
61 #include "vmx_cpufunc.h"
62 #include "vmx.h"
63 #include "x86.h"
64 #include "vmx_controls.h"
65
66 #define PINBASED_CTLS_ONE_SETTING                                       \
67         (PINBASED_EXTINT_EXITING        |                               \
68          PINBASED_NMI_EXITING           |                               \
69          PINBASED_VIRTUAL_NMI)
70 #define PINBASED_CTLS_ZERO_SETTING      0
71
72 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
73         (PROCBASED_INT_WINDOW_EXITING   |                               \
74          PROCBASED_NMI_WINDOW_EXITING)
75
76 #define PROCBASED_CTLS_ONE_SETTING                                      \
77         (PROCBASED_SECONDARY_CONTROLS   |                               \
78          PROCBASED_IO_EXITING           |                               \
79          PROCBASED_MSR_BITMAPS          |                               \
80          PROCBASED_CTLS_WINDOW_SETTING)
81 #define PROCBASED_CTLS_ZERO_SETTING     \
82         (PROCBASED_CR3_LOAD_EXITING |   \
83         PROCBASED_CR3_STORE_EXITING |   \
84         PROCBASED_IO_BITMAPS)
85
86 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
87 #define PROCBASED_CTLS2_ZERO_SETTING    0
88
89 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
90         (VM_EXIT_HOST_LMA                       |                       \
91         VM_EXIT_SAVE_EFER                       |                       \
92         VM_EXIT_LOAD_EFER)
93
94 #define VM_EXIT_CTLS_ONE_SETTING                                        \
95         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
96         VM_EXIT_SAVE_PAT                        |                       \
97         VM_EXIT_LOAD_PAT)
98 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
99
100 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
101
102 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
103         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
104         VM_ENTRY_LOAD_PAT)
105 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
106         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
107         VM_ENTRY_INTO_SMM                       |                       \
108         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
109
110 #define guest_msr_rw(vmx, msr) \
111         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
112
113 #define HANDLED         1
114 #define UNHANDLED       0
115
116 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
117 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
118
119 SYSCTL_DECL(_hw_vmm);
120 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
121
122 int vmxon_enabled[MAXCPU];
123 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
124
125 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
126 static uint32_t exit_ctls, entry_ctls;
127
128 static uint64_t cr0_ones_mask, cr0_zeros_mask;
129 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
130              &cr0_ones_mask, 0, NULL);
131 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
132              &cr0_zeros_mask, 0, NULL);
133
134 static uint64_t cr4_ones_mask, cr4_zeros_mask;
135 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
136              &cr4_ones_mask, 0, NULL);
137 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
138              &cr4_zeros_mask, 0, NULL);
139
140 static int vmx_no_patmsr;
141
142 static int vmx_initialized;
143 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
144            &vmx_initialized, 0, "Intel VMX initialized");
145
146 /*
147  * Virtual NMI blocking conditions.
148  *
149  * Some processor implementations also require NMI to be blocked if
150  * the STI_BLOCKING bit is set. It is possible to detect this at runtime
151  * based on the (exit_reason,exit_qual) tuple being set to 
152  * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
153  *
154  * We take the easy way out and also include STI_BLOCKING as one of the
155  * gating items for vNMI injection.
156  */
157 static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
158                                     VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
159                                     VMCS_INTERRUPTIBILITY_STI_BLOCKING;
160
161 /*
162  * Optional capabilities
163  */
164 static int cap_halt_exit;
165 static int cap_pause_exit;
166 static int cap_unrestricted_guest;
167 static int cap_monitor_trap;
168 static int cap_invpcid;
169
170 static int virtual_interrupt_delivery;
171 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
172     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
173
174 static struct unrhdr *vpid_unr;
175 static u_int vpid_alloc_failed;
176 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
177             &vpid_alloc_failed, 0, NULL);
178
179 /*
180  * Use the last page below 4GB as the APIC access address. This address is
181  * occupied by the boot firmware so it is guaranteed that it will not conflict
182  * with a page in system memory.
183  */
184 #define APIC_ACCESS_ADDRESS     0xFFFFF000
185
186 static void vmx_inject_pir(struct vlapic *vlapic);
187
188 #ifdef KTR
189 static const char *
190 exit_reason_to_str(int reason)
191 {
192         static char reasonbuf[32];
193
194         switch (reason) {
195         case EXIT_REASON_EXCEPTION:
196                 return "exception";
197         case EXIT_REASON_EXT_INTR:
198                 return "extint";
199         case EXIT_REASON_TRIPLE_FAULT:
200                 return "triplefault";
201         case EXIT_REASON_INIT:
202                 return "init";
203         case EXIT_REASON_SIPI:
204                 return "sipi";
205         case EXIT_REASON_IO_SMI:
206                 return "iosmi";
207         case EXIT_REASON_SMI:
208                 return "smi";
209         case EXIT_REASON_INTR_WINDOW:
210                 return "intrwindow";
211         case EXIT_REASON_NMI_WINDOW:
212                 return "nmiwindow";
213         case EXIT_REASON_TASK_SWITCH:
214                 return "taskswitch";
215         case EXIT_REASON_CPUID:
216                 return "cpuid";
217         case EXIT_REASON_GETSEC:
218                 return "getsec";
219         case EXIT_REASON_HLT:
220                 return "hlt";
221         case EXIT_REASON_INVD:
222                 return "invd";
223         case EXIT_REASON_INVLPG:
224                 return "invlpg";
225         case EXIT_REASON_RDPMC:
226                 return "rdpmc";
227         case EXIT_REASON_RDTSC:
228                 return "rdtsc";
229         case EXIT_REASON_RSM:
230                 return "rsm";
231         case EXIT_REASON_VMCALL:
232                 return "vmcall";
233         case EXIT_REASON_VMCLEAR:
234                 return "vmclear";
235         case EXIT_REASON_VMLAUNCH:
236                 return "vmlaunch";
237         case EXIT_REASON_VMPTRLD:
238                 return "vmptrld";
239         case EXIT_REASON_VMPTRST:
240                 return "vmptrst";
241         case EXIT_REASON_VMREAD:
242                 return "vmread";
243         case EXIT_REASON_VMRESUME:
244                 return "vmresume";
245         case EXIT_REASON_VMWRITE:
246                 return "vmwrite";
247         case EXIT_REASON_VMXOFF:
248                 return "vmxoff";
249         case EXIT_REASON_VMXON:
250                 return "vmxon";
251         case EXIT_REASON_CR_ACCESS:
252                 return "craccess";
253         case EXIT_REASON_DR_ACCESS:
254                 return "draccess";
255         case EXIT_REASON_INOUT:
256                 return "inout";
257         case EXIT_REASON_RDMSR:
258                 return "rdmsr";
259         case EXIT_REASON_WRMSR:
260                 return "wrmsr";
261         case EXIT_REASON_INVAL_VMCS:
262                 return "invalvmcs";
263         case EXIT_REASON_INVAL_MSR:
264                 return "invalmsr";
265         case EXIT_REASON_MWAIT:
266                 return "mwait";
267         case EXIT_REASON_MTF:
268                 return "mtf";
269         case EXIT_REASON_MONITOR:
270                 return "monitor";
271         case EXIT_REASON_PAUSE:
272                 return "pause";
273         case EXIT_REASON_MCE:
274                 return "mce";
275         case EXIT_REASON_TPR:
276                 return "tpr";
277         case EXIT_REASON_APIC_ACCESS:
278                 return "apic-access";
279         case EXIT_REASON_GDTR_IDTR:
280                 return "gdtridtr";
281         case EXIT_REASON_LDTR_TR:
282                 return "ldtrtr";
283         case EXIT_REASON_EPT_FAULT:
284                 return "eptfault";
285         case EXIT_REASON_EPT_MISCONFIG:
286                 return "eptmisconfig";
287         case EXIT_REASON_INVEPT:
288                 return "invept";
289         case EXIT_REASON_RDTSCP:
290                 return "rdtscp";
291         case EXIT_REASON_VMX_PREEMPT:
292                 return "vmxpreempt";
293         case EXIT_REASON_INVVPID:
294                 return "invvpid";
295         case EXIT_REASON_WBINVD:
296                 return "wbinvd";
297         case EXIT_REASON_XSETBV:
298                 return "xsetbv";
299         case EXIT_REASON_APIC_WRITE:
300                 return "apic-write";
301         default:
302                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
303                 return (reasonbuf);
304         }
305 }
306 #endif  /* KTR */
307
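/*
 * Coerce a %cr0 value into one that is legal in VMX operation: set the bits
 * required to be 1 (cr0_ones_mask) and clear the bits required to be 0
 * (cr0_zeros_mask), as derived from MSR_VMX_CR0_FIXED0/FIXED1 in vmx_init().
 * vmx_fix_cr4() does the same for %cr4.
 */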
308 u_long
309 vmx_fix_cr0(u_long cr0)
310 {
311
312         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
313 }
314
315 u_long
316 vmx_fix_cr4(u_long cr4)
317 {
318
319         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
320 }
321
322 static void
323 vpid_free(int vpid)
324 {
325         if (vpid < 0 || vpid > 0xffff)
326                 panic("vpid_free: invalid vpid %d", vpid);
327
328         /*
329          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
330          * the unit number allocator.
331          */
332
333         if (vpid > VM_MAXCPU)
334                 free_unr(vpid_unr, vpid);
335 }
336
337 static void
338 vpid_alloc(uint16_t *vpid, int num)
339 {
340         int i, x;
341
342         if (num <= 0 || num > VM_MAXCPU)
343                 panic("invalid number of vpids requested: %d", num);
344
345         /*
346          * If the "enable vpid" execution control is not enabled then the
347          * VPID is required to be 0 for all vcpus.
348          */
349         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
350                 for (i = 0; i < num; i++)
351                         vpid[i] = 0;
352                 return;
353         }
354
355         /*
356          * Allocate a unique VPID for each vcpu from the unit number allocator.
357          */
358         for (i = 0; i < num; i++) {
359                 x = alloc_unr(vpid_unr);
360                 if (x == -1)
361                         break;
362                 else
363                         vpid[i] = x;
364         }
365
366         if (i < num) {
367                 atomic_add_int(&vpid_alloc_failed, 1);
368
369                 /*
370                  * If the unit number allocator does not have enough unique
371                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
372                  *
373                  * These VPIDs are not unique across VMs but this does not
374                  * affect correctness because the combined mappings are also
375                  * tagged with the EP4TA which is unique for each VM.
376                  *
377                  * It is still sub-optimal because the invvpid will invalidate
378                  * combined mappings for a particular VPID across all EP4TAs.
379                  */
380                 while (i-- > 0)
381                         vpid_free(vpid[i]);
382
383                 for (i = 0; i < num; i++)
384                         vpid[i] = i + 1;
385         }
386 }
387
388 static void
389 vpid_init(void)
390 {
391         /*
392          * VPID 0 is required when the "enable VPID" execution control is
393          * disabled.
394          *
395          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
396          * unit number allocator does not have sufficient unique VPIDs to
397          * satisfy the allocation.
398          *
399          * The remaining VPIDs are managed by the unit number allocator.
400          */
401         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
402 }
403
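/*
 * Populate the guest MSR save area with the MSRs (currently only MSR_KGSBASE)
 * that are saved on VM exit and restored on VM entry. The area is attached to
 * each vcpu's VMCS via vmcs_set_msr_save() in vmx_vminit().
 */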
404 static void
405 msr_save_area_init(struct msr_entry *g_area, int *g_count)
406 {
407         int cnt;
408
409         static struct msr_entry guest_msrs[] = {
410                 { MSR_KGSBASE, 0, 0 },
411         };
412
413         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
414         if (cnt > GUEST_MSR_MAX_ENTRIES)
415                 panic("guest msr save area overrun");
416         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
417         *g_count = cnt;
418 }
419
420 static void
421 vmx_disable(void *arg __unused)
422 {
423         struct invvpid_desc invvpid_desc = { 0 };
424         struct invept_desc invept_desc = { 0 };
425
426         if (vmxon_enabled[curcpu]) {
427                 /*
428                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
429                  *
430                  * VMXON or VMXOFF are not required to invalidate any TLB
431                  * caching structures, so invalidate them explicitly here to
432                  * prevent retention of cached information between distinct VMX episodes.
433                  */
434                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
435                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
436                 vmxoff();
437         }
438         load_cr4(rcr4() & ~CR4_VMXE);
439 }
440
441 static int
442 vmx_cleanup(void)
443 {
444
445         if (vpid_unr != NULL) {
446                 delete_unrhdr(vpid_unr);
447                 vpid_unr = NULL;
448         }
449
450         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
451
452         return (0);
453 }
454
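/*
 * Executed on each cpu via smp_rendezvous() from vmx_init(): set CR4.VMXE,
 * stamp the per-cpu VMXON region with the VMCS revision identifier and enter
 * VMX operation with VMXON.
 */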
455 static void
456 vmx_enable(void *arg __unused)
457 {
458         int error;
459
460         load_cr4(rcr4() | CR4_VMXE);
461
462         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
463         error = vmxon(vmxon_region[curcpu]);
464         if (error == 0)
465                 vmxon_enabled[curcpu] = 1;
466 }
467
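/*
 * Re-enter VMX operation on a cpu whose VMXON region was already initialized
 * by vmx_enable(), e.g. when the host is resuming from a suspend.
 */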
468 static void
469 vmx_restore(void)
470 {
471
472         if (vmxon_enabled[curcpu])
473                 vmxon(vmxon_region[curcpu]);
474 }
475
476 static int
477 vmx_init(void)
478 {
479         int error, use_tpr_shadow;
480         uint64_t fixed0, fixed1, feature_control;
481         uint32_t tmp, procbased2_vid_bits;
482
483         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
484         if (!(cpu_feature2 & CPUID2_VMX)) {
485                 printf("vmx_init: processor does not support VMX operation\n");
486                 return (ENXIO);
487         }
488
489         /*
490          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
491          * are set (bits 0 and 2 respectively).
492          */
493         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
494         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
495             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
496                 printf("vmx_init: VMX operation disabled by BIOS\n");
497                 return (ENXIO);
498         }
499
500         /* Check support for primary processor-based VM-execution controls */
501         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
502                                MSR_VMX_TRUE_PROCBASED_CTLS,
503                                PROCBASED_CTLS_ONE_SETTING,
504                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
505         if (error) {
506                 printf("vmx_init: processor does not support desired primary "
507                        "processor-based controls\n");
508                 return (error);
509         }
510
511         /* Clear the processor-based ctl bits that are set on demand */
512         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
513
514         /* Check support for secondary processor-based VM-execution controls */
515         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
516                                MSR_VMX_PROCBASED_CTLS2,
517                                PROCBASED_CTLS2_ONE_SETTING,
518                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
519         if (error) {
520                 printf("vmx_init: processor does not support desired secondary "
521                        "processor-based controls\n");
522                 return (error);
523         }
524
525         /* Check support for VPID */
526         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
527                                PROCBASED2_ENABLE_VPID, 0, &tmp);
528         if (error == 0)
529                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
530
531         /* Check support for pin-based VM-execution controls */
532         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
533                                MSR_VMX_TRUE_PINBASED_CTLS,
534                                PINBASED_CTLS_ONE_SETTING,
535                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
536         if (error) {
537                 printf("vmx_init: processor does not support desired "
538                        "pin-based controls\n");
539                 return (error);
540         }
541
542         /* Check support for VM-exit controls */
543         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
544                                VM_EXIT_CTLS_ONE_SETTING,
545                                VM_EXIT_CTLS_ZERO_SETTING,
546                                &exit_ctls);
547         if (error) {
548                 /* Try again without the PAT MSR bits */
549                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
550                                        MSR_VMX_TRUE_EXIT_CTLS,
551                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
552                                        VM_EXIT_CTLS_ZERO_SETTING,
553                                        &exit_ctls);
554                 if (error) {
555                         printf("vmx_init: processor does not support desired "
556                                "exit controls\n");
557                         return (error);
558                 } else {
559                         if (bootverbose)
560                                 printf("vmm: PAT MSR access not supported\n");
561                         guest_msr_valid(MSR_PAT);
562                         vmx_no_patmsr = 1;
563                 }
564         }
565
566         /* Check support for VM-entry controls */
567         if (!vmx_no_patmsr) {
568                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
569                                        MSR_VMX_TRUE_ENTRY_CTLS,
570                                        VM_ENTRY_CTLS_ONE_SETTING,
571                                        VM_ENTRY_CTLS_ZERO_SETTING,
572                                        &entry_ctls);
573         } else {
574                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
575                                        MSR_VMX_TRUE_ENTRY_CTLS,
576                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
577                                        VM_ENTRY_CTLS_ZERO_SETTING,
578                                        &entry_ctls);
579         }
580
581         if (error) {
582                 printf("vmx_init: processor does not support desired "
583                        "entry controls\n");
584                 return (error);
585         }
586
587         /*
588          * Check support for optional features by testing them
589          * as individual bits
590          */
591         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
592                                         MSR_VMX_TRUE_PROCBASED_CTLS,
593                                         PROCBASED_HLT_EXITING, 0,
594                                         &tmp) == 0);
595
596         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
597                                         MSR_VMX_PROCBASED_CTLS,
598                                         PROCBASED_MTF, 0,
599                                         &tmp) == 0);
600
601         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
602                                          MSR_VMX_TRUE_PROCBASED_CTLS,
603                                          PROCBASED_PAUSE_EXITING, 0,
604                                          &tmp) == 0);
605
606         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
607                                         MSR_VMX_PROCBASED_CTLS2,
608                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
609                                         &tmp) == 0);
610
611         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
612             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
613             &tmp) == 0);
614
615         /*
616          * Check support for virtual interrupt delivery.
617          */
618         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
619             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
620             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
621             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
622
623         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
624             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
625             &tmp) == 0);
626
627         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
628             procbased2_vid_bits, 0, &tmp);
629         if (error == 0 && use_tpr_shadow) {
630                 virtual_interrupt_delivery = 1;
631                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
632                     &virtual_interrupt_delivery);
633         }
634
635         if (virtual_interrupt_delivery) {
636                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
637                 procbased_ctls2 |= procbased2_vid_bits;
638                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
639         }
640
641         /* Initialize EPT */
642         error = ept_init();
643         if (error) {
644                 printf("vmx_init: ept initialization failed (%d)\n", error);
645                 return (error);
646         }
647
648         /*
649          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
650          */
651         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
652         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
653         cr0_ones_mask = fixed0 & fixed1;
654         cr0_zeros_mask = ~fixed0 & ~fixed1;
655
656         /*
657          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
658          * if unrestricted guest execution is allowed.
659          */
660         if (cap_unrestricted_guest)
661                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
662
663         /*
664          * Do not allow the guest to set CR0_NW or CR0_CD.
665          */
666         cr0_zeros_mask |= (CR0_NW | CR0_CD);
667
668         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
669         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
670         cr4_ones_mask = fixed0 & fixed1;
671         cr4_zeros_mask = ~fixed0 & ~fixed1;
672
673         vpid_init();
674
675         /* enable VMX operation */
676         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
677
678         vmx_initialized = 1;
679
680         return (0);
681 }
682
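/*
 * Program the CR0/CR4 guest/host mask and read shadow. Guest reads of the
 * masked bits return the shadow value while guest writes to them cause a
 * CR-access VM exit that is handled by vmx_emulate_cr_access().
 */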
683 static int
684 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
685 {
686         int error, mask_ident, shadow_ident;
687         uint64_t mask_value;
688
689         if (which != 0 && which != 4)
690                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
691
692         if (which == 0) {
693                 mask_ident = VMCS_CR0_MASK;
694                 mask_value = cr0_ones_mask | cr0_zeros_mask;
695                 shadow_ident = VMCS_CR0_SHADOW;
696         } else {
697                 mask_ident = VMCS_CR4_MASK;
698                 mask_value = cr4_ones_mask | cr4_zeros_mask;
699                 shadow_ident = VMCS_CR4_SHADOW;
700         }
701
702         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
703         if (error)
704                 return (error);
705
706         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
707         if (error)
708                 return (error);
709
710         return (0);
711 }
712 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
713 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
714
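/*
 * Per-VM initialization: allocate the page-aligned 'struct vmx', derive the
 * EPTP from the pmap, set up the MSR bitmap and guest MSR save area, allocate
 * VPIDs and initialize the VMCS (execution controls, CR shadows) of each vcpu.
 */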
715 static void *
716 vmx_vminit(struct vm *vm, pmap_t pmap)
717 {
718         uint16_t vpid[VM_MAXCPU];
719         int i, error, guest_msr_count;
720         struct vmx *vmx;
721         struct vmcs *vmcs;
722
723         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
724         if ((uintptr_t)vmx & PAGE_MASK) {
725                 panic("malloc of struct vmx not aligned on %d byte boundary",
726                       PAGE_SIZE);
727         }
728         vmx->vm = vm;
729
730         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
731
732         /*
733          * Clean up EPTP-tagged guest physical and combined mappings
734          *
735          * VMX transitions are not required to invalidate any guest physical
736          * mappings. So, it may be possible for stale guest physical mappings
737          * to be present in the processor TLBs.
738          *
739          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
740          */
741         ept_invalidate_mappings(vmx->eptp);
742
743         msr_bitmap_initialize(vmx->msr_bitmap);
744
745         /*
746          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
747          * The guest FSBASE and GSBASE are saved and restored during
748          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
749          * always restored from the vmcs host state area on vm-exit.
750          *
751          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
752          * how they are saved/restored, so they can be directly accessed
753          * by the guest.
754          *
755          * Guest KGSBASE is saved and restored in the guest MSR save area.
756          * Host KGSBASE is restored before returning to userland from the pcb.
757          * There will be a window of time when we are executing in the host
758          * kernel context with a value of KGSBASE from the guest. This is ok
759          * because the value of KGSBASE is inconsequential in kernel context.
760          *
761          * MSR_EFER is saved and restored in the guest VMCS area on a
762          * VM exit and entry respectively. It is also restored from the
763          * host VMCS area on a VM exit.
764          */
765         if (guest_msr_rw(vmx, MSR_GSBASE) ||
766             guest_msr_rw(vmx, MSR_FSBASE) ||
767             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
768             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
769             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
770             guest_msr_rw(vmx, MSR_KGSBASE) ||
771             guest_msr_rw(vmx, MSR_EFER))
772                 panic("vmx_vminit: error setting guest msr access");
773
774         /*
775          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
776          * and entry respectively. It is also restored from the host VMCS
777          * area on a VM exit. However, if running on a system with no
778          * MSR_PAT save/restore support, leave access disabled so accesses
779          * will be trapped.
780          */
781         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
782                 panic("vmx_vminit: error setting guest pat msr access");
783
784         vpid_alloc(vpid, VM_MAXCPU);
785
786         if (virtual_interrupt_delivery) {
787                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
788                     APIC_ACCESS_ADDRESS);
789                 /* XXX this should really return an error to the caller */
790                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
791         }
792
793         for (i = 0; i < VM_MAXCPU; i++) {
794                 vmcs = &vmx->vmcs[i];
795                 vmcs->identifier = vmx_revision();
796                 error = vmclear(vmcs);
797                 if (error != 0) {
798                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
799                               error, i);
800                 }
801
802                 error = vmcs_init(vmcs);
803                 KASSERT(error == 0, ("vmcs_init error %d", error));
804
805                 VMPTRLD(vmcs);
806                 error = 0;
807                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
808                 error += vmwrite(VMCS_EPTP, vmx->eptp);
809                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
810                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
811                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
812                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
813                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
814                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
815                 error += vmwrite(VMCS_VPID, vpid[i]);
816                 if (virtual_interrupt_delivery) {
817                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
818                         error += vmwrite(VMCS_VIRTUAL_APIC,
819                             vtophys(&vmx->apic_page[i]));
820                         error += vmwrite(VMCS_EOI_EXIT0, 0);
821                         error += vmwrite(VMCS_EOI_EXIT1, 0);
822                         error += vmwrite(VMCS_EOI_EXIT2, 0);
823                         error += vmwrite(VMCS_EOI_EXIT3, 0);
824                 }
825                 VMCLEAR(vmcs);
826                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
827
828                 vmx->cap[i].set = 0;
829                 vmx->cap[i].proc_ctls = procbased_ctls;
830                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
831
832                 vmx->state[i].lastcpu = -1;
833                 vmx->state[i].vpid = vpid[i];
834
835                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
836
837                 error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
838                     guest_msr_count);
839                 if (error != 0)
840                         panic("vmcs_set_msr_save error %d", error);
841
842                 /*
843                  * Set up the CR0/4 shadows, and init the read shadow
844                  * to the power-on register value from the Intel Sys Arch.
845                  *  CR0 - 0x60000010
846                  *  CR4 - 0
847                  */
848                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
849                 if (error != 0)
850                         panic("vmx_setup_cr0_shadow %d", error);
851
852                 error = vmx_setup_cr4_shadow(vmcs, 0);
853                 if (error != 0)
854                         panic("vmx_setup_cr4_shadow %d", error);
855
856                 vmx->ctx[i].pmap = pmap;
857                 vmx->ctx[i].eptp = vmx->eptp;
858         }
859
860         return (vmx);
861 }
862
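/*
 * CPUID exits are handled entirely in the kernel: x86_emulate_cpuid() rewrites
 * the guest's %rax, %rbx, %rcx and %rdx in place.
 */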
863 static int
864 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
865 {
866         int handled, func;
867         
868         func = vmxctx->guest_rax;
869
870         handled = x86_emulate_cpuid(vm, vcpu,
871                                     (uint32_t*)(&vmxctx->guest_rax),
872                                     (uint32_t*)(&vmxctx->guest_rbx),
873                                     (uint32_t*)(&vmxctx->guest_rcx),
874                                     (uint32_t*)(&vmxctx->guest_rdx));
875         return (handled);
876 }
877
878 static __inline void
879 vmx_run_trace(struct vmx *vmx, int vcpu)
880 {
881 #ifdef KTR
882         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
883 #endif
884 }
885
886 static __inline void
887 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
888                int handled)
889 {
890 #ifdef KTR
891         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
892                  handled ? "handled" : "unhandled",
893                  exit_reason_to_str(exit_reason), rip);
894 #endif
895 }
896
897 static __inline void
898 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
899 {
900 #ifdef KTR
901         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
902 #endif
903 }
904
905 static void
906 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
907 {
908         int lastcpu;
909         struct vmxstate *vmxstate;
910         struct invvpid_desc invvpid_desc = { 0 };
911
912         vmxstate = &vmx->state[vcpu];
913         lastcpu = vmxstate->lastcpu;
914         vmxstate->lastcpu = curcpu;
915
916         if (lastcpu == curcpu)
917                 return;
918
919         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
920
921         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
922         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
923         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
924
925         /*
926          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
927          *
928          * We do this because this vcpu was executing on a different host
929          * cpu when it last ran. We do not track whether it invalidated
930          * mappings associated with its 'vpid' during that run. So we must
931          * assume that the mappings associated with 'vpid' on 'curcpu' are
932          * stale and invalidate them.
933          *
934          * Note that we incur this penalty only when the scheduler chooses to
935          * move the thread associated with this vcpu between host cpus.
936          *
937          * Note also that this will invalidate mappings tagged with 'vpid'
938          * for "all" EP4TAs.
939          */
940         if (vmxstate->vpid != 0) {
941                 invvpid_desc.vpid = vmxstate->vpid;
942                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
943         }
944 }
945
946 /*
947  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
948  */
949 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
950
951 static void __inline
952 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
953 {
954
955         vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
956         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
957 }
958
959 static void __inline
960 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
961 {
962
963         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
964         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
965 }
966
967 static void __inline
968 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
969 {
970
971         vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
972         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
973 }
974
975 static void __inline
976 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
977 {
978
979         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
980         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
981 }
982
983 static int
984 vmx_inject_nmi(struct vmx *vmx, int vcpu)
985 {
986         uint64_t info, interruptibility;
987
988         /* Bail out if no NMI requested */
989         if (!vm_nmi_pending(vmx->vm, vcpu))
990                 return (0);
991
992         interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
993         if (interruptibility & nmi_blocking_bits)
994                 goto nmiblocked;
995
996         /*
997          * Inject the virtual NMI. The vector must be the NMI IDT entry
998          * or the VMCS entry check will fail.
999          */
1000         info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
1001         info |= IDT_NMI;
1002         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1003
1004         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1005
1006         /* Clear the request */
1007         vm_nmi_clear(vmx->vm, vcpu);
1008         return (1);
1009
1010 nmiblocked:
1011         /*
1012          * Set the NMI Window Exiting execution control so we can inject
1013          * the virtual NMI as soon as the blocking condition goes away.
1014          */
1015         vmx_set_nmi_window_exiting(vmx, vcpu);
1016
1017         VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1018         return (1);
1019 }
1020
1021 static void
1022 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
1023 {
1024         int vector;
1025         uint64_t info, rflags, interruptibility;
1026
1027         const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
1028                                    VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
1029
1030         /*
1031          * If there is already an interrupt pending then just return.
1032          *
1033          * This could happen if an interrupt was injected on a prior
1034          * VM entry but the actual entry into guest mode was aborted
1035          * because of a pending AST.
1036          */
1037         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
1038         if (info & VMCS_INTERRUPTION_INFO_VALID)
1039                 return;
1040
1041         /*
1042          * NMI injection has priority so deal with those first
1043          */
1044         if (vmx_inject_nmi(vmx, vcpu))
1045                 return;
1046
1047         if (virtual_interrupt_delivery) {
1048                 vmx_inject_pir(vlapic);
1049                 return;
1050         }
1051
1052         /* Ask the local apic for a vector to inject */
1053         if (!vlapic_pending_intr(vlapic, &vector))
1054                 return;
1055
1056         if (vector < 32 || vector > 255)
1057                 panic("vmx_inject_interrupts: invalid vector %d\n", vector);
1058
1059         /* Check RFLAGS.IF and the interruptibility state of the guest */
1060         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1061         if ((rflags & PSL_I) == 0)
1062                 goto cantinject;
1063
1064         interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
1065         if (interruptibility & HWINTR_BLOCKED)
1066                 goto cantinject;
1067
1068         /* Inject the interrupt */
1069         info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
1070         info |= vector;
1071         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1072
1073         /* Update the Local APIC ISR */
1074         vlapic_intr_accepted(vlapic, vector);
1075
1076         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1077
1078         return;
1079
1080 cantinject:
1081         /*
1082          * Set the Interrupt Window Exiting execution control so we can inject
1083          * the interrupt as soon as the blocking condition goes away.
1084          */
1085         vmx_set_int_window_exiting(vmx, vcpu);
1086
1087         VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1088 }
1089
1090 static int
1091 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1092 {
1093         int cr, vmcs_guest_cr, vmcs_shadow_cr;
1094         uint64_t crval, regval, ones_mask, zeros_mask;
1095         const struct vmxctx *vmxctx;
1096
1097         /* We only handle mov to %cr0 or %cr4 at this time */
1098         if ((exitqual & 0xf0) != 0x00)
1099                 return (UNHANDLED);
1100
1101         cr = exitqual & 0xf;
1102         if (cr != 0 && cr != 4)
1103                 return (UNHANDLED);
1104
1105         regval = 0; /* silence gcc */
1106         vmxctx = &vmx->ctx[vcpu];
1107
1108         /*
1109          * We must use vmcs_write() directly here because vmcs_setreg() will
1110          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1111          */
1112         switch ((exitqual >> 8) & 0xf) {
1113         case 0:
1114                 regval = vmxctx->guest_rax;
1115                 break;
1116         case 1:
1117                 regval = vmxctx->guest_rcx;
1118                 break;
1119         case 2:
1120                 regval = vmxctx->guest_rdx;
1121                 break;
1122         case 3:
1123                 regval = vmxctx->guest_rbx;
1124                 break;
1125         case 4:
1126                 regval = vmcs_read(VMCS_GUEST_RSP);
1127                 break;
1128         case 5:
1129                 regval = vmxctx->guest_rbp;
1130                 break;
1131         case 6:
1132                 regval = vmxctx->guest_rsi;
1133                 break;
1134         case 7:
1135                 regval = vmxctx->guest_rdi;
1136                 break;
1137         case 8:
1138                 regval = vmxctx->guest_r8;
1139                 break;
1140         case 9:
1141                 regval = vmxctx->guest_r9;
1142                 break;
1143         case 10:
1144                 regval = vmxctx->guest_r10;
1145                 break;
1146         case 11:
1147                 regval = vmxctx->guest_r11;
1148                 break;
1149         case 12:
1150                 regval = vmxctx->guest_r12;
1151                 break;
1152         case 13:
1153                 regval = vmxctx->guest_r13;
1154                 break;
1155         case 14:
1156                 regval = vmxctx->guest_r14;
1157                 break;
1158         case 15:
1159                 regval = vmxctx->guest_r15;
1160                 break;
1161         }
1162
1163         if (cr == 0) {
1164                 ones_mask = cr0_ones_mask;
1165                 zeros_mask = cr0_zeros_mask;
1166                 vmcs_guest_cr = VMCS_GUEST_CR0;
1167                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1168         } else {
1169                 ones_mask = cr4_ones_mask;
1170                 zeros_mask = cr4_zeros_mask;
1171                 vmcs_guest_cr = VMCS_GUEST_CR4;
1172                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1173         }
1174         vmcs_write(vmcs_shadow_cr, regval);
1175
1176         crval = regval | ones_mask;
1177         crval &= ~zeros_mask;
1178         vmcs_write(vmcs_guest_cr, crval);
1179
1180         if (cr == 0 && regval & CR0_PG) {
1181                 uint64_t efer, entry_ctls;
1182
1183                 /*
1184                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1185                  * the "IA-32e mode guest" bit in VM-entry control must be
1186                  * equal.
1187                  */
1188                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1189                 if (efer & EFER_LME) {
1190                         efer |= EFER_LMA;
1191                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1192                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1193                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1194                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1195                 }
1196         }
1197
1198         return (HANDLED);
1199 }
1200
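/*
 * Translate the EPT violation exit qualification into the VM_PROT_* fault
 * type reported in a VM_EXITCODE_PAGING exit.
 */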
1201 static int
1202 ept_fault_type(uint64_t ept_qual)
1203 {
1204         int fault_type;
1205
1206         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1207                 fault_type = VM_PROT_WRITE;
1208         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1209                 fault_type = VM_PROT_EXECUTE;
1210         else
1211                 fault_type = VM_PROT_READ;
1212
1213         return (fault_type);
1214 }
1215
1216 static boolean_t
1217 ept_emulation_fault(uint64_t ept_qual)
1218 {
1219         int read, write;
1220
1221         /* EPT fault on an instruction fetch doesn't make sense here */
1222         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1223                 return (FALSE);
1224
1225         /* EPT fault must be a read fault or a write fault */
1226         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1227         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1228         if ((read | write) == 0)
1229                 return (FALSE);
1230
1231         /*
1232          * The EPT violation must have been caused by accessing a
1233          * guest-physical address that is a translation of a guest-linear
1234          * address.
1235          */
1236         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1237             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1238                 return (FALSE);
1239         }
1240
1241         return (TRUE);
1242 }
1243
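/*
 * Handle an APIC-write VM exit for registers that are virtualized but still
 * need hypervisor processing (ID, LDR, DFR, SVR, ESR, ICR and the LVT/timer
 * registers). Returns 1 if the write was handled in the kernel, 0 otherwise.
 */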
1244 static int
1245 vmx_handle_apic_write(struct vlapic *vlapic, uint64_t qual)
1246 {
1247         int error, handled, offset;
1248         bool retu;
1249
1250         if (!virtual_interrupt_delivery)
1251                 return (UNHANDLED);
1252
1253         handled = 1;
1254         offset = APIC_WRITE_OFFSET(qual);
1255         switch (offset) {
1256         case APIC_OFFSET_ID:
1257                 vlapic_id_write_handler(vlapic);
1258                 break;
1259         case APIC_OFFSET_LDR:
1260                 vlapic_ldr_write_handler(vlapic);
1261                 break;
1262         case APIC_OFFSET_DFR:
1263                 vlapic_dfr_write_handler(vlapic);
1264                 break;
1265         case APIC_OFFSET_SVR:
1266                 vlapic_svr_write_handler(vlapic);
1267                 break;
1268         case APIC_OFFSET_ESR:
1269                 vlapic_esr_write_handler(vlapic);
1270                 break;
1271         case APIC_OFFSET_ICR_LOW:
1272                 retu = false;
1273                 error = vlapic_icrlo_write_handler(vlapic, &retu);
1274                 if (error != 0 || retu)
1275                         handled = 0;
1276                 break;
1277         case APIC_OFFSET_CMCI_LVT:
1278         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1279                 vlapic_lvt_write_handler(vlapic, offset);
1280                 break;
1281         case APIC_OFFSET_TIMER_ICR:
1282                 vlapic_icrtmr_write_handler(vlapic);
1283                 break;
1284         case APIC_OFFSET_TIMER_DCR:
1285                 vlapic_dcr_write_handler(vlapic);
1286                 break;
1287         default:
1288                 handled = 0;
1289                 break;
1290         }
1291         return (handled);
1292 }
1293
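/*
 * Return true if the faulting guest physical address lies in the local APIC
 * page while virtual interrupt delivery is in use; that page is mapped to
 * APIC_ACCESS_ADDRESS in vmx_vminit().
 */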
1294 static bool
1295 apic_access_fault(uint64_t gpa)
1296 {
1297
1298         if (virtual_interrupt_delivery &&
1299             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
1300                 return (true);
1301         else
1302                 return (false);
1303 }
1304
1305 static int
1306 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
1307 {
1308         uint64_t qual;
1309         int access_type, offset, allowed;
1310
1311         if (!virtual_interrupt_delivery)
1312                 return (UNHANDLED);
1313
1314         qual = vmexit->u.vmx.exit_qualification;
1315         access_type = APIC_ACCESS_TYPE(qual);
1316         offset = APIC_ACCESS_OFFSET(qual);
1317
1318         allowed = 0;
1319         if (access_type == 0) {
1320                 /*
1321                  * Read data access to the following registers is expected.
1322                  */
1323                 switch (offset) {
1324                 case APIC_OFFSET_APR:
1325                 case APIC_OFFSET_PPR:
1326                 case APIC_OFFSET_RRR:
1327                 case APIC_OFFSET_CMCI_LVT:
1328                 case APIC_OFFSET_TIMER_CCR:
1329                         allowed = 1;
1330                         break;
1331                 default:
1332                         break;
1333                 }
1334         } else if (access_type == 1) {
1335                 /*
1336                  * Write data access to the following registers is expected.
1337                  */
1338                 switch (offset) {
1339                 case APIC_OFFSET_VER:
1340                 case APIC_OFFSET_APR:
1341                 case APIC_OFFSET_PPR:
1342                 case APIC_OFFSET_RRR:
1343                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1344                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1345                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1346                 case APIC_OFFSET_CMCI_LVT:
1347                 case APIC_OFFSET_TIMER_CCR:
1348                         allowed = 1;
1349                         break;
1350                 default:
1351                         break;
1352                 }
1353         }
1354
1355         if (allowed) {
1356                 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1357                 vmexit->u.inst_emul.gpa = DEFAULT_APIC_BASE + offset;
1358                 vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
1359                 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1360         }
1361
1362         /*
1363          * Regardless of whether the APIC-access is allowed this handler
1364          * always returns UNHANDLED:
1365          * - if the access is allowed then it is handled by emulating the
1366          *   instruction that caused the VM-exit (outside the critical section)
1367          * - if the access is not allowed then it will be converted to an
1368          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
1369          */
1370         return (UNHANDLED);
1371 }
1372
1373 static int
1374 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1375 {
1376         int error, handled;
1377         struct vmxctx *vmxctx;
1378         struct vlapic *vlapic;
1379         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
1380         uint64_t qual, gpa;
1381         bool retu;
1382
1383         handled = 0;
1384         vmxctx = &vmx->ctx[vcpu];
1385
1386         qual = vmexit->u.vmx.exit_qualification;
1387         reason = vmexit->u.vmx.exit_reason;
1388         vmexit->exitcode = VM_EXITCODE_BOGUS;
1389
1390         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1391
1392         /*
1393          * VM exits that could be triggered during event injection on the
1394          * previous VM entry need to be handled specially by re-injecting
1395          * the event.
1396          *
1397          * See "Information for VM Exits During Event Delivery" in Intel SDM
1398          * for details.
1399          */
1400         switch (reason) {
1401         case EXIT_REASON_EPT_FAULT:
1402         case EXIT_REASON_EPT_MISCONFIG:
1403         case EXIT_REASON_APIC_ACCESS:
1404         case EXIT_REASON_TASK_SWITCH:
1405         case EXIT_REASON_EXCEPTION:
1406                 idtvec_info = vmcs_idt_vectoring_info();
1407                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1408                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1409                         vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1410                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1411                                 idtvec_err = vmcs_idt_vectoring_err();
1412                                 vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1413                                     idtvec_err);
1414                         }
1415                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1416                 }
1417         default:
1418                 break;
1419         }
1420
1421         switch (reason) {
1422         case EXIT_REASON_CR_ACCESS:
1423                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1424                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1425                 break;
1426         case EXIT_REASON_RDMSR:
1427                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1428                 retu = false;
1429                 ecx = vmxctx->guest_rcx;
1430                 error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1431                 if (error) {
1432                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1433                         vmexit->u.msr.code = ecx;
1434                 } else if (!retu) {
1435                         handled = 1;
1436                 } else {
1437                         /* Return to userspace with a valid exitcode */
1438                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1439                             ("emulate_rdmsr retu with bogus exitcode"));
1440                 }
1441                 break;
1442         case EXIT_REASON_WRMSR:
1443                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1444                 retu = false;
1445                 eax = vmxctx->guest_rax;
1446                 ecx = vmxctx->guest_rcx;
1447                 edx = vmxctx->guest_rdx;
1448                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1449                     (uint64_t)edx << 32 | eax, &retu);
1450                 if (error) {
1451                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1452                         vmexit->u.msr.code = ecx;
1453                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1454                 } else if (!retu) {
1455                         handled = 1;
1456                 } else {
1457                         /* Return to userspace with a valid exitcode */
1458                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1459                             ("emulate_wrmsr retu with bogus exitcode"));
1460                 }
1461                 break;
1462         case EXIT_REASON_HLT:
1463                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1464                 vmexit->exitcode = VM_EXITCODE_HLT;
1465                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1466                 break;
1467         case EXIT_REASON_MTF:
1468                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1469                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1470                 break;
1471         case EXIT_REASON_PAUSE:
1472                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1473                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1474                 break;
1475         case EXIT_REASON_INTR_WINDOW:
1476                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1477                 vmx_clear_int_window_exiting(vmx, vcpu);
1478                 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1479                 return (1);
1480         case EXIT_REASON_EXT_INTR:
1481                 /*
1482                  * External interrupts serve only to cause VM exits and allow
1483                  * the host interrupt handler to run.
1484                  *
1485                  * If this external interrupt triggers a virtual interrupt
1486                  * to a VM, then that state will be recorded by the
1487                  * host interrupt handler in the VM's softc. We will inject
1488                  * this virtual interrupt during the subsequent VM enter.
1489                  */
1490
1491                 /*
1492                  * This is special. We want to treat this as a 'handled'
1493                  * VM-exit but not increment the instruction pointer.
1494                  */
1495                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1496                 return (1);
1497         case EXIT_REASON_NMI_WINDOW:
1498                 /* Exit to allow the pending virtual NMI to be injected */
1499                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1500                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1501                 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1502                 return (1);
1503         case EXIT_REASON_INOUT:
1504                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1505                 vmexit->exitcode = VM_EXITCODE_INOUT;
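                     /*
                      * Decode the I/O exit qualification (Intel SDM, "Exit
                      * Qualification for I/O Instructions"): bits 2:0 hold the
                      * access size minus one, bit 3 the direction (1 = IN),
                      * bit 4 the string-instruction flag, bit 5 the REP prefix
                      * and bits 31:16 the port number.
                      */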
1506                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1507                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1508                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1509                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1510                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1511                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1512                 break;
1513         case EXIT_REASON_CPUID:
1514                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1515                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1516                 break;
1517         case EXIT_REASON_EPT_FAULT:
1518                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
1519                 /*
1520                  * If 'gpa' lies within the address space allocated to
1521                  * memory then this must be a nested page fault; otherwise
1522                  * it must be an instruction that accesses MMIO space.
1523                  */
1524                 gpa = vmcs_gpa();
1525                 if (vm_mem_allocated(vmx->vm, gpa) || apic_access_fault(gpa)) {
1526                         vmexit->exitcode = VM_EXITCODE_PAGING;
1527                         vmexit->u.paging.gpa = gpa;
1528                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1529                 } else if (ept_emulation_fault(qual)) {
1530                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1531                         vmexit->u.inst_emul.gpa = gpa;
1532                         vmexit->u.inst_emul.gla = vmcs_gla();
1533                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1534                 }
1535                 break;
1536         case EXIT_REASON_APIC_ACCESS:
1537                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
1538                 break;
1539         case EXIT_REASON_APIC_WRITE:
1540                 /*
1541                  * APIC-write VM exit is trap-like so the %rip is already
1542                  * pointing to the next instruction.
1543                  */
1544                 vmexit->inst_length = 0;
1545                 vlapic = vm_lapic(vmx->vm, vcpu);
1546                 handled = vmx_handle_apic_write(vlapic, qual);
1547                 break;
1548         default:
1549                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1550                 break;
1551         }
1552
1553         if (handled) {
1554                 /*
1555                  * It is possible that control is returned to userland
1556                  * even though we were able to handle the VM exit in the
1557                  * kernel.
1558                  *
1559                  * In such a case we want to make sure that the userland
1560                  * restarts guest execution at the instruction *after*
1561                  * the one we just processed. Therefore we update the
1562                  * guest rip in the VMCS and in 'vmexit'.
1563                  */
1564                 vmexit->rip += vmexit->inst_length;
1565                 vmexit->inst_length = 0;
1566                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1567         } else {
1568                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1569                         /*
1570                          * If this VM exit was not claimed by anybody then
1571                          * treat it as a generic VMX exit.
1572                          */
1573                         vmexit->exitcode = VM_EXITCODE_VMX;
1574                         vmexit->u.vmx.status = VM_SUCCESS;
1575                 } else {
1576                         /*
1577                          * The exitcode and collateral have been populated.
1578                          * The VM exit will be processed further in userland.
1579                          */
1580                 }
1581         }
1582         return (handled);
1583 }
1584
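     /*
      * An AST or reschedule request was pending on the current thread before
      * entering the guest. Fabricate an immediate exit at the current %rip
      * with a BOGUS exitcode so that the request can be serviced before the
      * guest is resumed.
      */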
1585 static __inline int
1586 vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1587 {
1588
1589         vmexit->rip = vmcs_guest_rip();
1590         vmexit->inst_length = 0;
1591         vmexit->exitcode = VM_EXITCODE_BOGUS;
1592         vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1593         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1594
1595         return (HANDLED);
1596 }
1597
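     /*
      * The VMLAUNCH, VMRESUME or INVEPT instruction itself failed. Report the
      * VM-instruction error as a generic VMX exit so that the failure is
      * visible to userspace.
      */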
1598 static __inline int
1599 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1600 {
1601
1602         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1603             ("vmx_exit_inst_error: invalid inst_fail_status %d",
1604             vmxctx->inst_fail_status));
1605
1606         vmexit->inst_length = 0;
1607         vmexit->exitcode = VM_EXITCODE_VMX;
1608         vmexit->u.vmx.status = vmxctx->inst_fail_status;
1609         vmexit->u.vmx.inst_error = vmcs_instruction_error();
1610         vmexit->u.vmx.exit_reason = ~0;
1611         vmexit->u.vmx.exit_qualification = ~0;
1612
1613         switch (rc) {
1614         case VMX_VMRESUME_ERROR:
1615         case VMX_VMLAUNCH_ERROR:
1616         case VMX_INVEPT_ERROR:
1617                 vmexit->u.vmx.inst_type = rc;
1618                 break;
1619         default:
1620                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
1621         }
1622
1623         return (UNHANDLED);
1624 }
1625
1626 static int
1627 vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
1628 {
1629         int rc, handled, launched;
1630         struct vmx *vmx;
1631         struct vmxctx *vmxctx;
1632         struct vmcs *vmcs;
1633         struct vm_exit *vmexit;
1634         struct vlapic *vlapic;
1635         uint64_t rip;
1636         uint32_t exit_reason;
1637
1638         vmx = arg;
1639         vmcs = &vmx->vmcs[vcpu];
1640         vmxctx = &vmx->ctx[vcpu];
1641         vlapic = vm_lapic(vmx->vm, vcpu);
1642         vmexit = vm_exitinfo(vmx->vm, vcpu);
1643         launched = 0;
1644
1645         KASSERT(vmxctx->pmap == pmap,
1646             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1647         KASSERT(vmxctx->eptp == vmx->eptp,
1648             ("eptp %#lx different than ctx eptp %#lx", vmx->eptp, vmxctx->eptp));
1649
1650         VMPTRLD(vmcs);
1651
1652         /*
1653          * XXX
1654          * We do this every time because we may set up the virtual machine
1655          * from a different process than the one that actually runs it.
1656          *
1657          * If the life of a virtual machine was spent entirely in the context
1658          * of a single process we could do this once in vmx_vminit().
1659          */
1660         vmcs_write(VMCS_HOST_CR3, rcr3());
1661
1662         vmcs_write(VMCS_GUEST_RIP, startrip);
1663         vmx_set_pcpu_defaults(vmx, vcpu);
1664         do {
1665                 /*
1666                  * Interrupts are disabled from this point on until the
1667                  * guest starts executing. This is done for the following
1668                  * reasons:
1669                  *
1670                  * If an AST is asserted on this thread after the check below,
1671                  * then the IPI_AST notification will not be lost, because it
1672                  * will cause a VM exit due to external interrupt as soon as
1673                  * the guest state is loaded.
1674                  *
1675                  * A posted interrupt after 'vmx_inject_interrupts()' will
1676                  * not be "lost" because it will be held pending in the host
1677                  * APIC while interrupts are disabled. The pending interrupt
1678                  * will be recognized as soon as the guest state is loaded.
1679                  *
1680                  * The same reasoning applies to the IPI generated by
1681                  * pmap_invalidate_ept().
1682                  */
1683                 disable_intr();
1684                 if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1685                         enable_intr();
1686                         handled = vmx_exit_astpending(vmx, vcpu, vmexit);
1687                         break;
1688                 }
1689
1690                 vmx_inject_interrupts(vmx, vcpu, vlapic);
1691                 vmx_run_trace(vmx, vcpu);
1692                 rc = vmx_enter_guest(vmxctx, launched);
1693
1694                 enable_intr();
1695
1696                 /* Collect some information for VM exit processing */
1697                 vmexit->rip = rip = vmcs_guest_rip();
1698                 vmexit->inst_length = vmexit_instruction_length();
1699                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1700                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1701
1702                 if (rc == VMX_GUEST_VMEXIT) {
1703                         launched = 1;
1704                         handled = vmx_exit_process(vmx, vcpu, vmexit);
1705                 } else {
1706                         handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
1707                 }
1708
1709                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1710         } while (handled);
1711
1712         /*
1713          * If a VM exit has been handled then the exitcode must be BOGUS.
1714          * If a VM exit is not handled then the exitcode must not be BOGUS.
1715          */
1716         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1717             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1718                 panic("Mismatch between handled (%d) and exitcode (%d)",
1719                       handled, vmexit->exitcode);
1720         }
1721
1722         if (!handled)
1723                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
1724
1725         VCPU_CTR1(vmx->vm, vcpu, "returning from vmx_run: exitcode %d",
1726             vmexit->exitcode);
1727
1728         VMCLEAR(vmcs);
1729         return (0);
1730 }
1731
1732 static void
1733 vmx_vmcleanup(void *arg)
1734 {
1735         int i, error;
1736         struct vmx *vmx = arg;
1737
1738         if (virtual_interrupt_delivery)
1739                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
1740
1741         for (i = 0; i < VM_MAXCPU; i++)
1742                 vpid_free(vmx->state[i].vpid);
1743
1744         /*
1745          * XXXSMP we also need to clear the VMCS active on the other vcpus.
1746          */
1747         error = vmclear(&vmx->vmcs[0]);
1748         if (error != 0)
1749                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
1750
1751         free(vmx, M_VMX);
1752
1753         return;
1754 }
1755
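     /*
      * Return a pointer to the saved copy of a guest general purpose register
      * in the 'vmxctx', or NULL for registers (e.g. %rip, %rsp) that are kept
      * in the VMCS instead.
      */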
1756 static register_t *
1757 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
1758 {
1759
1760         switch (reg) {
1761         case VM_REG_GUEST_RAX:
1762                 return (&vmxctx->guest_rax);
1763         case VM_REG_GUEST_RBX:
1764                 return (&vmxctx->guest_rbx);
1765         case VM_REG_GUEST_RCX:
1766                 return (&vmxctx->guest_rcx);
1767         case VM_REG_GUEST_RDX:
1768                 return (&vmxctx->guest_rdx);
1769         case VM_REG_GUEST_RSI:
1770                 return (&vmxctx->guest_rsi);
1771         case VM_REG_GUEST_RDI:
1772                 return (&vmxctx->guest_rdi);
1773         case VM_REG_GUEST_RBP:
1774                 return (&vmxctx->guest_rbp);
1775         case VM_REG_GUEST_R8:
1776                 return (&vmxctx->guest_r8);
1777         case VM_REG_GUEST_R9:
1778                 return (&vmxctx->guest_r9);
1779         case VM_REG_GUEST_R10:
1780                 return (&vmxctx->guest_r10);
1781         case VM_REG_GUEST_R11:
1782                 return (&vmxctx->guest_r11);
1783         case VM_REG_GUEST_R12:
1784                 return (&vmxctx->guest_r12);
1785         case VM_REG_GUEST_R13:
1786                 return (&vmxctx->guest_r13);
1787         case VM_REG_GUEST_R14:
1788                 return (&vmxctx->guest_r14);
1789         case VM_REG_GUEST_R15:
1790                 return (&vmxctx->guest_r15);
1791         default:
1792                 break;
1793         }
1794         return (NULL);
1795 }
1796
1797 static int
1798 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
1799 {
1800         register_t *regp;
1801
1802         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1803                 *retval = *regp;
1804                 return (0);
1805         } else
1806                 return (EINVAL);
1807 }
1808
1809 static int
1810 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
1811 {
1812         register_t *regp;
1813
1814         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1815                 *regp = val;
1816                 return (0);
1817         } else
1818                 return (EINVAL);
1819 }
1820
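     /*
      * Map a guest control register to the VMCS field holding its read shadow.
      * Returns -1 if the register does not have a shadow.
      */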
1821 static int
1822 vmx_shadow_reg(int reg)
1823 {
1824         int shreg;
1825
1826         shreg = -1;
1827
1828         switch (reg) {
1829         case VM_REG_GUEST_CR0:
1830                 shreg = VMCS_CR0_SHADOW;
1831                 break;
1832         case VM_REG_GUEST_CR4:
1833                 shreg = VMCS_CR4_SHADOW;
1834                 break;
1835         default:
1836                 break;
1837         }
1838
1839         return (shreg);
1840 }
1841
1842 static int
1843 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
1844 {
1845         int running, hostcpu;
1846         struct vmx *vmx = arg;
1847
1848         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1849         if (running && hostcpu != curcpu)
1850                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
1851
1852         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
1853                 return (0);
1854
1855         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
1856 }
1857
1858 static int
1859 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
1860 {
1861         int error, hostcpu, running, shadow;
1862         uint64_t ctls;
1863         struct vmx *vmx = arg;
1864
1865         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1866         if (running && hostcpu != curcpu)
1867                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
1868
1869         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
1870                 return (0);
1871
1872         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
1873
1874         if (error == 0) {
1875                 /*
1876                  * If the "load EFER" VM-entry control is 1 then the
1877                  * value of EFER.LMA must be identical to "IA-32e mode guest"
1878                  * bit in the VM-entry control.
1879                  */
1880                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
1881                     (reg == VM_REG_GUEST_EFER)) {
1882                         vmcs_getreg(&vmx->vmcs[vcpu], running,
1883                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
1884                         if (val & EFER_LMA)
1885                                 ctls |= VM_ENTRY_GUEST_LMA;
1886                         else
1887                                 ctls &= ~VM_ENTRY_GUEST_LMA;
1888                         vmcs_setreg(&vmx->vmcs[vcpu], running,
1889                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
1890                 }
1891
1892                 shadow = vmx_shadow_reg(reg);
1893                 if (shadow > 0) {
1894                         /*
1895                          * Store the unmodified value in the shadow
1896                          */                     
1897                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
1898                                     VMCS_IDENT(shadow), val);
1899                 }
1900         }
1901
1902         return (error);
1903 }
1904
1905 static int
1906 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1907 {
1908         struct vmx *vmx = arg;
1909
1910         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
1911 }
1912
1913 static int
1914 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1915 {
1916         struct vmx *vmx = arg;
1917
1918         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
1919 }
1920
1921 static int
1922 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
1923            int code_valid)
1924 {
1925         int error;
1926         uint64_t info;
1927         struct vmx *vmx = arg;
1928         struct vmcs *vmcs = &vmx->vmcs[vcpu];
1929
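             /*
              * Map VM_EVENT_* types to the interruption type encoding used in
              * bits 10:8 of the VM-entry interruption-information field.
              */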
1930         static uint32_t type_map[VM_EVENT_MAX] = {
1931                 0x1,            /* VM_EVENT_NONE */
1932                 0x0,            /* VM_HW_INTR */
1933                 0x2,            /* VM_NMI */
1934                 0x3,            /* VM_HW_EXCEPTION */
1935                 0x4,            /* VM_SW_INTR */
1936                 0x5,            /* VM_PRIV_SW_EXCEPTION */
1937                 0x6,            /* VM_SW_EXCEPTION */
1938         };
1939
1940         /*
1941          * If there is already an exception pending to be delivered to the
1942          * vcpu then just return.
1943          */
1944         error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
1945         if (error)
1946                 return (error);
1947
1948         if (info & VMCS_INTERRUPTION_INFO_VALID)
1949                 return (EAGAIN);
1950
1951         info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
1952         info |= VMCS_INTERRUPTION_INFO_VALID;
1953         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
1954         if (error != 0)
1955                 return (error);
1956
1957         if (code_valid) {
1958                 error = vmcs_setreg(vmcs, 0,
1959                                     VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
1960                                     code);
1961         }
1962         return (error);
1963 }
1964
1965 static int
1966 vmx_getcap(void *arg, int vcpu, int type, int *retval)
1967 {
1968         struct vmx *vmx = arg;
1969         int vcap;
1970         int ret;
1971
1972         ret = ENOENT;
1973
1974         vcap = vmx->cap[vcpu].set;
1975
1976         switch (type) {
1977         case VM_CAP_HALT_EXIT:
1978                 if (cap_halt_exit)
1979                         ret = 0;
1980                 break;
1981         case VM_CAP_PAUSE_EXIT:
1982                 if (cap_pause_exit)
1983                         ret = 0;
1984                 break;
1985         case VM_CAP_MTRAP_EXIT:
1986                 if (cap_monitor_trap)
1987                         ret = 0;
1988                 break;
1989         case VM_CAP_UNRESTRICTED_GUEST:
1990                 if (cap_unrestricted_guest)
1991                         ret = 0;
1992                 break;
1993         case VM_CAP_ENABLE_INVPCID:
1994                 if (cap_invpcid)
1995                         ret = 0;
1996                 break;
1997         default:
1998                 break;
1999         }
2000
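             /* The capability is supported; report whether it is enabled. */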
2001         if (ret == 0)
2002                 *retval = (vcap & (1 << type)) ? 1 : 0;
2003
2004         return (ret);
2005 }
2006
2007 static int
2008 vmx_setcap(void *arg, int vcpu, int type, int val)
2009 {
2010         struct vmx *vmx = arg;
2011         struct vmcs *vmcs = &vmx->vmcs[vcpu];
2012         uint32_t baseval;
2013         uint32_t *pptr;
2014         int error;
2015         int flag;
2016         int reg;
2017         int retval;
2018
2019         retval = ENOENT;
2020         pptr = NULL;
2021
2022         switch (type) {
2023         case VM_CAP_HALT_EXIT:
2024                 if (cap_halt_exit) {
2025                         retval = 0;
2026                         pptr = &vmx->cap[vcpu].proc_ctls;
2027                         baseval = *pptr;
2028                         flag = PROCBASED_HLT_EXITING;
2029                         reg = VMCS_PRI_PROC_BASED_CTLS;
2030                 }
2031                 break;
2032         case VM_CAP_MTRAP_EXIT:
2033                 if (cap_monitor_trap) {
2034                         retval = 0;
2035                         pptr = &vmx->cap[vcpu].proc_ctls;
2036                         baseval = *pptr;
2037                         flag = PROCBASED_MTF;
2038                         reg = VMCS_PRI_PROC_BASED_CTLS;
2039                 }
2040                 break;
2041         case VM_CAP_PAUSE_EXIT:
2042                 if (cap_pause_exit) {
2043                         retval = 0;
2044                         pptr = &vmx->cap[vcpu].proc_ctls;
2045                         baseval = *pptr;
2046                         flag = PROCBASED_PAUSE_EXITING;
2047                         reg = VMCS_PRI_PROC_BASED_CTLS;
2048                 }
2049                 break;
2050         case VM_CAP_UNRESTRICTED_GUEST:
2051                 if (cap_unrestricted_guest) {
2052                         retval = 0;
2053                         pptr = &vmx->cap[vcpu].proc_ctls2;
2054                         baseval = *pptr;
2055                         flag = PROCBASED2_UNRESTRICTED_GUEST;
2056                         reg = VMCS_SEC_PROC_BASED_CTLS;
2057                 }
2058                 break;
2059         case VM_CAP_ENABLE_INVPCID:
2060                 if (cap_invpcid) {
2061                         retval = 0;
2062                         pptr = &vmx->cap[vcpu].proc_ctls2;
2063                         baseval = *pptr;
2064                         flag = PROCBASED2_ENABLE_INVPCID;
2065                         reg = VMCS_SEC_PROC_BASED_CTLS;
2066                 }
2067                 break;
2068         default:
2069                 break;
2070         }
2071
2072         if (retval == 0) {
2073                 if (val) {
2074                         baseval |= flag;
2075                 } else {
2076                         baseval &= ~flag;
2077                 }
2078                 VMPTRLD(vmcs);
2079                 error = vmwrite(reg, baseval);
2080                 VMCLEAR(vmcs);
2081
2082                 if (error) {
2083                         retval = error;
2084                 } else {
2085                         /*
2086                          * Update optional stored flags, and record
2087                          * setting
2088                          */
2089                         if (pptr != NULL) {
2090                                 *pptr = baseval;
2091                         }
2092
2093                         if (val) {
2094                                 vmx->cap[vcpu].set |= (1 << type);
2095                         } else {
2096                                 vmx->cap[vcpu].set &= ~(1 << type);
2097                         }
2098                 }
2099         }
2100
2101         return (retval);
2102 }
2103
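     /*
      * Illustrative sketch only (not part of the driver): the vmm layer is
      * expected to reach the handlers above through 'vmm_ops_intel' at the
      * bottom of this file, conceptually along these lines:
      *
      *	int val;
      *	if (vmx_setcap(vmx, vcpu, VM_CAP_HALT_EXIT, 1) == 0)
      *		(void) vmx_getcap(vmx, vcpu, VM_CAP_HALT_EXIT, &val);
      */
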
2104 /*
2105  * Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM).
2106  */
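     /*
      * Vector 'v' is posted by setting bit (v % 64) in pir[v / 64]; for
      * example, vector 65 sets bit 1 of pir[1]. 'pending' is set when the PIR
      * contains bits that have not yet been copied to the virtual APIC page.
      */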
2107 struct pir_desc {
2108         uint64_t        pir[4];
2109         uint64_t        pending;
2110         uint64_t        unused[3];
2111 } __aligned(64);
2112 CTASSERT(sizeof(struct pir_desc) == 64);
2113
2114 struct vlapic_vtx {
2115         struct vlapic   vlapic;
2116         struct pir_desc pir_desc;
2117 };
2118
2119 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
2120 do {                                                                    \
2121         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
2122             level ? "level" : "edge", vector);                          \
2123         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
2124         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
2125         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
2126         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
2127         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
2128 } while (0)
2129
2130 /*
2131  * vlapic->ops handlers that utilize the APICv hardware assist described in
2132  * Chapter 29 of the Intel SDM.
2133  */
2134 static int
2135 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
2136 {
2137         struct vlapic_vtx *vlapic_vtx;
2138         struct pir_desc *pir_desc;
2139         uint64_t mask;
2140         int idx, notify;
2141
2142         /*
2143          * XXX need to deal with level triggered interrupts
2144          */
2145         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2146         pir_desc = &vlapic_vtx->pir_desc;
2147
2148         /*
2149          * Keep track of interrupt requests in the PIR descriptor. This is
2150          * because the virtual APIC page pointed to by the VMCS cannot be
2151          * modified if the vcpu is running.
2152          */
2153         idx = vector / 64;
2154         mask = 1UL << (vector % 64);
2155         atomic_set_long(&pir_desc->pir[idx], mask);
2156         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
2157
2158         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
2159             level, "vmx_set_intr_ready");
2160         return (notify);
2161 }
2162
2163 static int
2164 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
2165 {
2166         struct vlapic_vtx *vlapic_vtx;
2167         struct pir_desc *pir_desc;
2168         struct LAPIC *lapic;
2169         uint64_t pending, pirval;
2170         uint32_t ppr, vpr;
2171         int i;
2172
2173         /*
2174          * This function is only expected to be called from the 'HLT' exit
2175          * handler which does not care about the vector that is pending.
2176          */
2177         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
2178
2179         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2180         pir_desc = &vlapic_vtx->pir_desc;
2181
2182         pending = atomic_load_acq_long(&pir_desc->pending);
2183         if (!pending)
2184                 return (0);     /* common case */
2185
2186         /*
2187          * If there is an interrupt pending then it will be recognized only
2188          * if its priority is greater than the processor priority.
2189          *
2190          * Special case: if the processor priority is zero then any pending
2191          * interrupt will be recognized.
2192          */
2193         lapic = vlapic->apic_page;
2194         ppr = lapic->ppr & 0xf0;
2195         if (ppr == 0)
2196                 return (1);
2197
2198         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
2199             lapic->ppr);
2200
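             /*
              * Find the highest-priority vector pending in the PIR and check
              * whether its priority class exceeds the processor priority.
              */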
2201         for (i = 3; i >= 0; i--) {
2202                 pirval = pir_desc->pir[i];
2203                 if (pirval != 0) {
2204                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
2205                         return (vpr > ppr);
2206                 }
2207         }
2208         return (0);
2209 }
2210
2211 static void
2212 vmx_intr_accepted(struct vlapic *vlapic, int vector)
2213 {
2214
2215         panic("vmx_intr_accepted: not expected to be called");
2216 }
2217
2218 /*
2219  * Transfer the pending interrupts in the PIR descriptor to the IRR
2220  * in the virtual APIC page.
2221  */
2222 static void
2223 vmx_inject_pir(struct vlapic *vlapic)
2224 {
2225         struct vlapic_vtx *vlapic_vtx;
2226         struct pir_desc *pir_desc;
2227         struct LAPIC *lapic;
2228         uint64_t val, pirval;
2229         int rvi, pirbase;
2230         uint16_t intr_status_old, intr_status_new;
2231
2232         vlapic_vtx = (struct vlapic_vtx *)vlapic;
2233         pir_desc = &vlapic_vtx->pir_desc;
2234         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
2235                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2236                     "no posted interrupt pending");
2237                 return;
2238         }
2239
2240         pirval = 0;
2241         lapic = vlapic->apic_page;
2242
2243         val = atomic_readandclear_long(&pir_desc->pir[0]);
2244         if (val != 0) {
2245                 lapic->irr0 |= val;
2246                 lapic->irr1 |= val >> 32;
2247                 pirbase = 0;
2248                 pirval = val;
2249         }
2250
2251         val = atomic_readandclear_long(&pir_desc->pir[1]);
2252         if (val != 0) {
2253                 lapic->irr2 |= val;
2254                 lapic->irr3 |= val >> 32;
2255                 pirbase = 64;
2256                 pirval = val;
2257         }
2258
2259         val = atomic_readandclear_long(&pir_desc->pir[2]);
2260         if (val != 0) {
2261                 lapic->irr4 |= val;
2262                 lapic->irr5 |= val >> 32;
2263                 pirbase = 128;
2264                 pirval = val;
2265         }
2266
2267         val = atomic_readandclear_long(&pir_desc->pir[3]);
2268         if (val != 0) {
2269                 lapic->irr6 |= val;
2270                 lapic->irr7 |= val >> 32;
2271                 pirbase = 192;
2272                 pirval = val;
2273         }
2274         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
2275
2276         /*
2277          * Update RVI so the processor can evaluate pending virtual
2278          * interrupts on VM-entry.
2279          */
2280         if (pirval != 0) {
2281                 rvi = pirbase + flsl(pirval) - 1;
2282                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
2283                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
2284                 if (intr_status_new > intr_status_old) {
2285                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
2286                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
2287                             "guest_intr_status changed from 0x%04x to 0x%04x",
2288                             intr_status_old, intr_status_new);
2289                 }
2290         }
2291 }
2292
2293 static struct vlapic *
2294 vmx_vlapic_init(void *arg, int vcpuid)
2295 {
2296         struct vmx *vmx;
2297         struct vlapic *vlapic;
2298         
2299         vmx = arg;
2300
2301         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
2302         vlapic->vm = vmx->vm;
2303         vlapic->vcpuid = vcpuid;
2304         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
2305
2306         if (virtual_interrupt_delivery) {
2307                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
2308                 vlapic->ops.pending_intr = vmx_pending_intr;
2309                 vlapic->ops.intr_accepted = vmx_intr_accepted;
2310         }
2311
2312         vlapic_init(vlapic);
2313
2314         return (vlapic);
2315 }
2316
2317 static void
2318 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2319 {
2320
2321         vlapic_cleanup(vlapic);
2322         free(vlapic, M_VLAPIC);
2323 }
2324
2325 struct vmm_ops vmm_ops_intel = {
2326         vmx_init,
2327         vmx_cleanup,
2328         vmx_restore,
2329         vmx_vminit,
2330         vmx_run,
2331         vmx_vmcleanup,
2332         vmx_getreg,
2333         vmx_setreg,
2334         vmx_getdesc,
2335         vmx_setdesc,
2336         vmx_inject,
2337         vmx_getcap,
2338         vmx_setcap,
2339         ept_vmspace_alloc,
2340         ept_vmspace_free,
2341         vmx_vlapic_init,
2342         vmx_vlapic_cleanup,
2343 };