1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/smp.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/pcpu.h>
38 #include <sys/proc.h>
39 #include <sys/sysctl.h>
40
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43
44 #include <machine/psl.h>
45 #include <machine/cpufunc.h>
46 #include <machine/md_var.h>
47 #include <machine/pmap.h>
48 #include <machine/segments.h>
49 #include <machine/specialreg.h>
50 #include <machine/vmparam.h>
51
52 #include <machine/vmm.h>
53 #include "vmm_host.h"
54 #include "vmm_lapic.h"
55 #include "vmm_msr.h"
56 #include "vmm_ktr.h"
57 #include "vmm_stat.h"
58
59 #include "vmx_msr.h"
60 #include "ept.h"
61 #include "vmx_cpufunc.h"
62 #include "vmx.h"
63 #include "x86.h"
64 #include "vmx_controls.h"
65
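/*
 * In the macros below, a *_ONE_SETTING names VMX control bits that must be
 * set to 1 and a *_ZERO_SETTING names bits that must be 0.  vmx_set_ctlreg()
 * checks the desired settings against the VMX capability MSRs and returns
 * the control value to program, or an error if the processor cannot
 * accommodate the request.
 */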
66 #define PINBASED_CTLS_ONE_SETTING                                       \
67         (PINBASED_EXTINT_EXITING        |                               \
68          PINBASED_NMI_EXITING           |                               \
69          PINBASED_VIRTUAL_NMI)
70 #define PINBASED_CTLS_ZERO_SETTING      0
71
72 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
73         (PROCBASED_INT_WINDOW_EXITING   |                               \
74          PROCBASED_NMI_WINDOW_EXITING)
75
76 #define PROCBASED_CTLS_ONE_SETTING                                      \
77         (PROCBASED_SECONDARY_CONTROLS   |                               \
78          PROCBASED_IO_EXITING           |                               \
79          PROCBASED_MSR_BITMAPS          |                               \
80          PROCBASED_CTLS_WINDOW_SETTING)
81 #define PROCBASED_CTLS_ZERO_SETTING     \
82         (PROCBASED_CR3_LOAD_EXITING |   \
83         PROCBASED_CR3_STORE_EXITING |   \
84         PROCBASED_IO_BITMAPS)
85
86 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
87 #define PROCBASED_CTLS2_ZERO_SETTING    0
88
89 #define VM_EXIT_CTLS_ONE_SETTING_NO_PAT                                 \
90         (VM_EXIT_HOST_LMA                       |                       \
91         VM_EXIT_SAVE_EFER                       |                       \
92         VM_EXIT_LOAD_EFER)
93
94 #define VM_EXIT_CTLS_ONE_SETTING                                        \
95         (VM_EXIT_CTLS_ONE_SETTING_NO_PAT        |                       \
96         VM_EXIT_SAVE_PAT                        |                       \
97         VM_EXIT_LOAD_PAT)
98 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
99
100 #define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT        VM_ENTRY_LOAD_EFER
101
102 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
103         (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT       |                       \
104         VM_ENTRY_LOAD_PAT)
105 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
106         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
107         VM_ENTRY_INTO_SMM                       |                       \
108         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
109
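/*
 * Give the guest direct read/write access to 'msr' by clearing both the
 * read and write intercept bits for it in the per-VM MSR bitmap, so that
 * guest RDMSR/WRMSR of this MSR no longer cause VM exits.
 */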
110 #define guest_msr_rw(vmx, msr) \
111         msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
112
113 #define HANDLED         1
114 #define UNHANDLED       0
115
116 MALLOC_DEFINE(M_VMX, "vmx", "vmx");
117
118 SYSCTL_DECL(_hw_vmm);
119 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
120
121 int vmxon_enabled[MAXCPU];
122 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
123
124 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
125 static uint32_t exit_ctls, entry_ctls;
126
127 static uint64_t cr0_ones_mask, cr0_zeros_mask;
128 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
129              &cr0_ones_mask, 0, NULL);
130 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
131              &cr0_zeros_mask, 0, NULL);
132
133 static uint64_t cr4_ones_mask, cr4_zeros_mask;
134 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
135              &cr4_ones_mask, 0, NULL);
136 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
137              &cr4_zeros_mask, 0, NULL);
138
139 static int vmx_no_patmsr;
140
141 static int vmx_initialized;
142 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
143            &vmx_initialized, 0, "Intel VMX initialized");
144
145 /*
146  * Virtual NMI blocking conditions.
147  *
148  * Some processor implementations also require NMI to be blocked if
149  * the STI_BLOCKING bit is set. It is possible to detect this at runtime
150  * based on the (exit_reason,exit_qual) tuple being set to 
151  * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
152  *
153  * We take the easy way out and also include STI_BLOCKING as one of the
154  * gating items for vNMI injection.
155  */
156 static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
157                                     VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
158                                     VMCS_INTERRUPTIBILITY_STI_BLOCKING;
159
160 /*
161  * Optional capabilities
162  */
163 static int cap_halt_exit;
164 static int cap_pause_exit;
165 static int cap_unrestricted_guest;
166 static int cap_monitor_trap;
167  
168 static struct unrhdr *vpid_unr;
169 static u_int vpid_alloc_failed;
170 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
171             &vpid_alloc_failed, 0, NULL);
172
173 #ifdef KTR
174 static const char *
175 exit_reason_to_str(int reason)
176 {
177         static char reasonbuf[32];
178
179         switch (reason) {
180         case EXIT_REASON_EXCEPTION:
181                 return "exception";
182         case EXIT_REASON_EXT_INTR:
183                 return "extint";
184         case EXIT_REASON_TRIPLE_FAULT:
185                 return "triplefault";
186         case EXIT_REASON_INIT:
187                 return "init";
188         case EXIT_REASON_SIPI:
189                 return "sipi";
190         case EXIT_REASON_IO_SMI:
191                 return "iosmi";
192         case EXIT_REASON_SMI:
193                 return "smi";
194         case EXIT_REASON_INTR_WINDOW:
195                 return "intrwindow";
196         case EXIT_REASON_NMI_WINDOW:
197                 return "nmiwindow";
198         case EXIT_REASON_TASK_SWITCH:
199                 return "taskswitch";
200         case EXIT_REASON_CPUID:
201                 return "cpuid";
202         case EXIT_REASON_GETSEC:
203                 return "getsec";
204         case EXIT_REASON_HLT:
205                 return "hlt";
206         case EXIT_REASON_INVD:
207                 return "invd";
208         case EXIT_REASON_INVLPG:
209                 return "invlpg";
210         case EXIT_REASON_RDPMC:
211                 return "rdpmc";
212         case EXIT_REASON_RDTSC:
213                 return "rdtsc";
214         case EXIT_REASON_RSM:
215                 return "rsm";
216         case EXIT_REASON_VMCALL:
217                 return "vmcall";
218         case EXIT_REASON_VMCLEAR:
219                 return "vmclear";
220         case EXIT_REASON_VMLAUNCH:
221                 return "vmlaunch";
222         case EXIT_REASON_VMPTRLD:
223                 return "vmptrld";
224         case EXIT_REASON_VMPTRST:
225                 return "vmptrst";
226         case EXIT_REASON_VMREAD:
227                 return "vmread";
228         case EXIT_REASON_VMRESUME:
229                 return "vmresume";
230         case EXIT_REASON_VMWRITE:
231                 return "vmwrite";
232         case EXIT_REASON_VMXOFF:
233                 return "vmxoff";
234         case EXIT_REASON_VMXON:
235                 return "vmxon";
236         case EXIT_REASON_CR_ACCESS:
237                 return "craccess";
238         case EXIT_REASON_DR_ACCESS:
239                 return "draccess";
240         case EXIT_REASON_INOUT:
241                 return "inout";
242         case EXIT_REASON_RDMSR:
243                 return "rdmsr";
244         case EXIT_REASON_WRMSR:
245                 return "wrmsr";
246         case EXIT_REASON_INVAL_VMCS:
247                 return "invalvmcs";
248         case EXIT_REASON_INVAL_MSR:
249                 return "invalmsr";
250         case EXIT_REASON_MWAIT:
251                 return "mwait";
252         case EXIT_REASON_MTF:
253                 return "mtf";
254         case EXIT_REASON_MONITOR:
255                 return "monitor";
256         case EXIT_REASON_PAUSE:
257                 return "pause";
258         case EXIT_REASON_MCE:
259                 return "mce";
260         case EXIT_REASON_TPR:
261                 return "tpr";
262         case EXIT_REASON_APIC:
263                 return "apic";
264         case EXIT_REASON_GDTR_IDTR:
265                 return "gdtridtr";
266         case EXIT_REASON_LDTR_TR:
267                 return "ldtrtr";
268         case EXIT_REASON_EPT_FAULT:
269                 return "eptfault";
270         case EXIT_REASON_EPT_MISCONFIG:
271                 return "eptmisconfig";
272         case EXIT_REASON_INVEPT:
273                 return "invept";
274         case EXIT_REASON_RDTSCP:
275                 return "rdtscp";
276         case EXIT_REASON_VMX_PREEMPT:
277                 return "vmxpreempt";
278         case EXIT_REASON_INVVPID:
279                 return "invvpid";
280         case EXIT_REASON_WBINVD:
281                 return "wbinvd";
282         case EXIT_REASON_XSETBV:
283                 return "xsetbv";
284         default:
285                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
286                 return (reasonbuf);
287         }
288 }
289
290 #ifdef SETJMP_TRACE
291 static const char *
292 vmx_setjmp_rc2str(int rc)
293 {
294         switch (rc) {
295         case VMX_RETURN_DIRECT:
296                 return "direct";
297         case VMX_RETURN_LONGJMP:
298                 return "longjmp";
299         case VMX_RETURN_VMRESUME:
300                 return "vmresume";
301         case VMX_RETURN_VMLAUNCH:
302                 return "vmlaunch";
303         case VMX_RETURN_AST:
304                 return "ast";
305         default:
306                 return "unknown";
307         }
308 }
309
310 #define SETJMP_TRACE(vmx, vcpu, vmxctx, regname)                          \
311         VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
312                  (vmxctx)->regname)
313
314 static void
315 vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
316 {
317         uint64_t host_rip, host_rsp;
318
319         if (vmxctx != &vmx->ctx[vcpu])
320                 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
321                         vmxctx, &vmx->ctx[vcpu]);
322
323         VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
324         VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
325                  vmx_setjmp_rc2str(rc), rc);
326
327         host_rsp = host_rip = ~0;
328         vmread(VMCS_HOST_RIP, &host_rip);
329         vmread(VMCS_HOST_RSP, &host_rsp);
330         VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
331                  host_rip, host_rsp);
332
333         SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
334         SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
335         SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
336         SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
337         SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
338         SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
339         SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
340         SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
341
342         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
343         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
344         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
345         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
346         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
347         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
348         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
349         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
350         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
351         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
352         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
353         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
354         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
355         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
356         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
357         SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
358 }
359 #endif
360 #else
361 static void __inline
362 vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
363 {
364         return;
365 }
366 #endif  /* KTR */
367
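/*
 * Force a CR0/CR4 value to be legal for VMX operation by setting the bits
 * that must be 1 and clearing the bits that must be 0, as computed in
 * vmx_init() from the MSR_VMX_CR{0,4}_FIXED{0,1} MSRs.
 */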
368 u_long
369 vmx_fix_cr0(u_long cr0)
370 {
371
372         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
373 }
374
375 u_long
376 vmx_fix_cr4(u_long cr4)
377 {
378
379         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
380 }
381
382 static void
383 vpid_free(int vpid)
384 {
385         if (vpid < 0 || vpid > 0xffff)
386                 panic("vpid_free: invalid vpid %d", vpid);
387
388         /*
389          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
390          * the unit number allocator.
391          */
392
393         if (vpid > VM_MAXCPU)
394                 free_unr(vpid_unr, vpid);
395 }
396
397 static void
398 vpid_alloc(uint16_t *vpid, int num)
399 {
400         int i, x;
401
402         if (num <= 0 || num > VM_MAXCPU)
403                 panic("invalid number of vpids requested: %d", num);
404
405         /*
406          * If the "enable vpid" execution control is not enabled then the
407          * VPID is required to be 0 for all vcpus.
408          */
409         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
410                 for (i = 0; i < num; i++)
411                         vpid[i] = 0;
412                 return;
413         }
414
415         /*
416          * Allocate a unique VPID for each vcpu from the unit number allocator.
417          */
418         for (i = 0; i < num; i++) {
419                 x = alloc_unr(vpid_unr);
420                 if (x == -1)
421                         break;
422                 else
423                         vpid[i] = x;
424         }
425
426         if (i < num) {
427                 atomic_add_int(&vpid_alloc_failed, 1);
428
429                 /*
430                  * If the unit number allocator does not have enough unique
431                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
432                  *
433                  * These VPIDs are not unique across VMs but this does not
434                  * affect correctness because the combined mappings are also
435                  * tagged with the EP4TA which is unique for each VM.
436                  *
437                  * It is still sub-optimal because the invvpid will invalidate
438                  * combined mappings for a particular VPID across all EP4TAs.
439                  */
440                 while (i-- > 0)
441                         vpid_free(vpid[i]);
442
443                 for (i = 0; i < num; i++)
444                         vpid[i] = i + 1;
445         }
446 }
447
448 static void
449 vpid_init(void)
450 {
451         /*
452          * VPID 0 is required when the "enable VPID" execution control is
453          * disabled.
454          *
455          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
456          * unit number allocator does not have sufficient unique VPIDs to
457          * satisfy the allocation.
458          *
459          * The remaining VPIDs are managed by the unit number allocator.
460          */
461         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
462 }
463
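/*
 * Build the list of guest MSRs that are saved on VM exit and restored on
 * VM entry through the MSR-store/MSR-load areas that vmcs_set_msr_save()
 * programs in vmx_vminit().  Only MSR_KGSBASE is handled this way; see the
 * comments in vmx_vminit() for how the remaining guest MSRs are handled.
 */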
464 static void
465 msr_save_area_init(struct msr_entry *g_area, int *g_count)
466 {
467         int cnt;
468
469         static struct msr_entry guest_msrs[] = {
470                 { MSR_KGSBASE, 0, 0 },
471         };
472
473         cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
474         if (cnt > GUEST_MSR_MAX_ENTRIES)
475                 panic("guest msr save area overrun");
476         bcopy(guest_msrs, g_area, sizeof(guest_msrs));
477         *g_count = cnt;
478 }
479
480 static void
481 vmx_disable(void *arg __unused)
482 {
483         struct invvpid_desc invvpid_desc = { 0 };
484         struct invept_desc invept_desc = { 0 };
485
486         if (vmxon_enabled[curcpu]) {
487                 /*
488                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
489                  *
490                  * Neither VMXON nor VMXOFF is required to invalidate any TLB
491                  * caching structures, so explicitly invalidate all VPID and EPT
492                  * mappings here to avoid retaining stale translations across VMX episodes.
493                  */
494                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
495                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
496                 vmxoff();
497         }
498         load_cr4(rcr4() & ~CR4_VMXE);
499 }
500
501 static int
502 vmx_cleanup(void)
503 {
504
505         if (vpid_unr != NULL) {
506                 delete_unrhdr(vpid_unr);
507                 vpid_unr = NULL;
508         }
509
510         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
511
512         return (0);
513 }
514
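/*
 * Enter VMX operation on this cpu.  CR4.VMXE must be set before executing
 * VMXON, and the first 32 bits of the page-aligned VMXON region must hold
 * the VMCS revision identifier reported by the processor.
 */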
515 static void
516 vmx_enable(void *arg __unused)
517 {
518         int error;
519
520         load_cr4(rcr4() | CR4_VMXE);
521
522         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
523         error = vmxon(vmxon_region[curcpu]);
524         if (error == 0)
525                 vmxon_enabled[curcpu] = 1;
526 }
527
528 static int
529 vmx_init(void)
530 {
531         int error;
532         uint64_t fixed0, fixed1, feature_control;
533         uint32_t tmp;
534
535         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
536         if (!(cpu_feature2 & CPUID2_VMX)) {
537                 printf("vmx_init: processor does not support VMX operation\n");
538                 return (ENXIO);
539         }
540
541         /*
542          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
543          * are set (bits 0 and 2 respectively).
544          */
545         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
546         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
547             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
548                 printf("vmx_init: VMX operation disabled by BIOS\n");
549                 return (ENXIO);
550         }
551
552         /* Check support for primary processor-based VM-execution controls */
553         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
554                                MSR_VMX_TRUE_PROCBASED_CTLS,
555                                PROCBASED_CTLS_ONE_SETTING,
556                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
557         if (error) {
558                 printf("vmx_init: processor does not support desired primary "
559                        "processor-based controls\n");
560                 return (error);
561         }
562
563         /* Clear the processor-based ctl bits that are set on demand */
564         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
565
566         /* Check support for secondary processor-based VM-execution controls */
567         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
568                                MSR_VMX_PROCBASED_CTLS2,
569                                PROCBASED_CTLS2_ONE_SETTING,
570                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
571         if (error) {
572                 printf("vmx_init: processor does not support desired secondary "
573                        "processor-based controls\n");
574                 return (error);
575         }
576
577         /* Check support for VPID */
578         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
579                                PROCBASED2_ENABLE_VPID, 0, &tmp);
580         if (error == 0)
581                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
582
583         /* Check support for pin-based VM-execution controls */
584         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
585                                MSR_VMX_TRUE_PINBASED_CTLS,
586                                PINBASED_CTLS_ONE_SETTING,
587                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
588         if (error) {
589                 printf("vmx_init: processor does not support desired "
590                        "pin-based controls\n");
591                 return (error);
592         }
593
594         /* Check support for VM-exit controls */
595         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
596                                VM_EXIT_CTLS_ONE_SETTING,
597                                VM_EXIT_CTLS_ZERO_SETTING,
598                                &exit_ctls);
599         if (error) {
600                 /* Try again without the PAT MSR bits */
601                 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
602                                        MSR_VMX_TRUE_EXIT_CTLS,
603                                        VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
604                                        VM_EXIT_CTLS_ZERO_SETTING,
605                                        &exit_ctls);
606                 if (error) {
607                         printf("vmx_init: processor does not support desired "
608                                "exit controls\n");
609                         return (error);
610                 } else {
611                         if (bootverbose)
612                                 printf("vmm: PAT MSR access not supported\n");
613                         guest_msr_valid(MSR_PAT);
614                         vmx_no_patmsr = 1;
615                 }
616         }
617
618         /* Check support for VM-entry controls */
619         if (!vmx_no_patmsr) {
620                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
621                                        MSR_VMX_TRUE_ENTRY_CTLS,
622                                        VM_ENTRY_CTLS_ONE_SETTING,
623                                        VM_ENTRY_CTLS_ZERO_SETTING,
624                                        &entry_ctls);
625         } else {
626                 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
627                                        MSR_VMX_TRUE_ENTRY_CTLS,
628                                        VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
629                                        VM_ENTRY_CTLS_ZERO_SETTING,
630                                        &entry_ctls);
631         }
632
633         if (error) {
634                 printf("vmx_init: processor does not support desired "
635                        "entry controls\n");
636                 return (error);
637         }
638
639         /*
640          * Check support for optional features by testing them
641          * as individual bits
642          */
643         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
644                                         MSR_VMX_TRUE_PROCBASED_CTLS,
645                                         PROCBASED_HLT_EXITING, 0,
646                                         &tmp) == 0);
647
648         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
649                                         MSR_VMX_PROCBASED_CTLS,
650                                         PROCBASED_MTF, 0,
651                                         &tmp) == 0);
652
653         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
654                                          MSR_VMX_TRUE_PROCBASED_CTLS,
655                                          PROCBASED_PAUSE_EXITING, 0,
656                                          &tmp) == 0);
657
658         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
659                                         MSR_VMX_PROCBASED_CTLS2,
660                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
661                                         &tmp) == 0);
662
663         /* Initialize EPT */
664         error = ept_init();
665         if (error) {
666                 printf("vmx_init: ept initialization failed (%d)\n", error);
667                 return (error);
668         }
669
670         /*
671          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
672          */
673         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
674         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
675         cr0_ones_mask = fixed0 & fixed1;
676         cr0_zeros_mask = ~fixed0 & ~fixed1;
677
678         /*
679          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
680          * if unrestricted guest execution is allowed.
681          */
682         if (cap_unrestricted_guest)
683                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
684
685         /*
686          * Do not allow the guest to set CR0_NW or CR0_CD.
687          */
688         cr0_zeros_mask |= (CR0_NW | CR0_CD);
689
690         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
691         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
692         cr4_ones_mask = fixed0 & fixed1;
693         cr4_zeros_mask = ~fixed0 & ~fixed1;
694
695         vpid_init();
696
697         /* enable VMX operation */
698         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
699
700         vmx_initialized = 1;
701
702         return (0);
703 }
704
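/*
 * Program the CR0/CR4 guest/host mask and read shadow.  Bits set in the mask
 * are owned by the hypervisor: guest reads of those bits return the value in
 * the read shadow, and guest writes that would change a masked bit cause a
 * control-register-access VM exit, handled in vmx_emulate_cr_access().
 */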
705 static int
706 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
707 {
708         int error, mask_ident, shadow_ident;
709         uint64_t mask_value;
710
711         if (which != 0 && which != 4)
712                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
713
714         if (which == 0) {
715                 mask_ident = VMCS_CR0_MASK;
716                 mask_value = cr0_ones_mask | cr0_zeros_mask;
717                 shadow_ident = VMCS_CR0_SHADOW;
718         } else {
719                 mask_ident = VMCS_CR4_MASK;
720                 mask_value = cr4_ones_mask | cr4_zeros_mask;
721                 shadow_ident = VMCS_CR4_SHADOW;
722         }
723
724         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
725         if (error)
726                 return (error);
727
728         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
729         if (error)
730                 return (error);
731
732         return (0);
733 }
734 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
735 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
736
737 static void *
738 vmx_vminit(struct vm *vm, pmap_t pmap)
739 {
740         uint16_t vpid[VM_MAXCPU];
741         int i, error, guest_msr_count;
742         struct vmx *vmx;
743
744         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
745         if ((uintptr_t)vmx & PAGE_MASK) {
746                 panic("malloc of struct vmx not aligned on %d byte boundary",
747                       PAGE_SIZE);
748         }
749         vmx->vm = vm;
750
751         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
752
753         /*
754          * Clean up EPTP-tagged guest physical and combined mappings
755          *
756          * VMX transitions are not required to invalidate any guest physical
757          * mappings. So, it may be possible for stale guest physical mappings
758          * to be present in the processor TLBs.
759          *
760          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
761          */
762         ept_invalidate_mappings(vmx->eptp);
763
764         msr_bitmap_initialize(vmx->msr_bitmap);
765
766         /*
767          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
768          * The guest FSBASE and GSBASE are saved and restored during
769          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
770          * always restored from the vmcs host state area on vm-exit.
771          *
772          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
773          * how they are saved/restored, so they can be directly accessed by the
774          * guest.
775          *
776          * Guest KGSBASE is saved and restored in the guest MSR save area.
777          * Host KGSBASE is restored before returning to userland from the pcb.
778          * There will be a window of time when we are executing in the host
779          * kernel context with a value of KGSBASE from the guest. This is ok
780          * because the value of KGSBASE is inconsequential in kernel context.
781          *
782          * MSR_EFER is saved and restored in the guest VMCS area on a
783          * VM exit and entry respectively. It is also restored from the
784          * host VMCS area on a VM exit.
785          */
786         if (guest_msr_rw(vmx, MSR_GSBASE) ||
787             guest_msr_rw(vmx, MSR_FSBASE) ||
788             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
789             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
790             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
791             guest_msr_rw(vmx, MSR_KGSBASE) ||
792             guest_msr_rw(vmx, MSR_EFER))
793                 panic("vmx_vminit: error setting guest msr access");
794
795         /*
796          * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
797          * and entry respectively. It is also restored from the host VMCS
798          * area on a VM exit. However, if running on a system with no
799          * MSR_PAT save/restore support, leave access disabled so accesses
800          * will be trapped.
801          */
802         if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
803                 panic("vmx_vminit: error setting guest pat msr access");
804
805         vpid_alloc(vpid, VM_MAXCPU);
806
807         for (i = 0; i < VM_MAXCPU; i++) {
808                 vmx->vmcs[i].identifier = vmx_revision();
809                 error = vmclear(&vmx->vmcs[i]);
810                 if (error != 0) {
811                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
812                               error, i);
813                 }
814
815                 error = vmcs_set_defaults(&vmx->vmcs[i],
816                                           (u_long)vmx_longjmp,
817                                           (u_long)&vmx->ctx[i],
818                                           vmx->eptp,
819                                           pinbased_ctls,
820                                           procbased_ctls,
821                                           procbased_ctls2,
822                                           exit_ctls, entry_ctls,
823                                           vtophys(vmx->msr_bitmap),
824                                           vpid[i]);
825
826                 if (error != 0)
827                         panic("vmx_vminit: vmcs_set_defaults error %d", error);
828
829                 vmx->cap[i].set = 0;
830                 vmx->cap[i].proc_ctls = procbased_ctls;
831
832                 vmx->state[i].lastcpu = -1;
833                 vmx->state[i].vpid = vpid[i];
834
835                 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
836
837                 error = vmcs_set_msr_save(&vmx->vmcs[i],
838                                           vtophys(vmx->guest_msrs[i]),
839                                           guest_msr_count);
840                 if (error != 0)
841                         panic("vmcs_set_msr_save error %d", error);
842
843                 /*
844                  * Set up the CR0/4 shadows, and init the read shadow
845                  * to the power-on register value from the Intel Sys Arch.
846                  *  CR0 - 0x60000010
847                  *  CR4 - 0
848                  */
849                 error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
850                 if (error != 0)
851                         panic("vmx_setup_cr0_shadow %d", error);
852
853                 error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
854                 if (error != 0)
855                         panic("vmx_setup_cr4_shadow %d", error);
856
857                 vmx->ctx[i].pmap = pmap;
858                 vmx->ctx[i].eptp = vmx->eptp;
859         }
860
861         return (vmx);
862 }
863
864 static int
865 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
866 {
867         int handled, func;
868         
869         func = vmxctx->guest_rax;
870
871         handled = x86_emulate_cpuid(vm, vcpu,
872                                     (uint32_t*)(&vmxctx->guest_rax),
873                                     (uint32_t*)(&vmxctx->guest_rbx),
874                                     (uint32_t*)(&vmxctx->guest_rcx),
875                                     (uint32_t*)(&vmxctx->guest_rdx));
876         return (handled);
877 }
878
879 static __inline void
880 vmx_run_trace(struct vmx *vmx, int vcpu)
881 {
882 #ifdef KTR
883         VMM_CTR1(vmx->vm, vcpu, "Resume execution at 0x%0lx", vmcs_guest_rip());
884 #endif
885 }
886
887 static __inline void
888 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
889                int handled)
890 {
891 #ifdef KTR
892         VMM_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
893                  handled ? "handled" : "unhandled",
894                  exit_reason_to_str(exit_reason), rip);
895 #endif
896 }
897
898 static __inline void
899 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
900 {
901 #ifdef KTR
902         VMM_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
903 #endif
904 }
905
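/*
 * Refresh per-cpu state when a vcpu runs on a host cpu other than the one
 * it last ran on: the VMCS host-state fields for the TR, GDTR and GS bases
 * are per-cpu and must be rewritten, and TLB entries tagged with this
 * vcpu's VPID on the new cpu must be invalidated.
 */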
906 static int
907 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
908 {
909         int error, lastcpu;
910         struct vmxstate *vmxstate;
911         struct invvpid_desc invvpid_desc = { 0 };
912
913         vmxstate = &vmx->state[vcpu];
914         lastcpu = vmxstate->lastcpu;
915         vmxstate->lastcpu = curcpu;
916
917         if (lastcpu == curcpu) {
918                 error = 0;
919                 goto done;
920         }
921
922         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
923
924         error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
925         if (error != 0)
926                 goto done;
927
928         error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
929         if (error != 0)
930                 goto done;
931
932         error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
933         if (error != 0)
934                 goto done;
935
936         /*
937          * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
938          *
939          * We do this because this vcpu was executing on a different host
940          * cpu when it last ran. We do not track whether it invalidated
941          * mappings associated with its 'vpid' during that run. So we must
942          * assume that the mappings associated with 'vpid' on 'curcpu' are
943          * stale and invalidate them.
944          *
945          * Note that we incur this penalty only when the scheduler chooses to
946          * move the thread associated with this vcpu between host cpus.
947          *
948          * Note also that this will invalidate mappings tagged with 'vpid'
949          * for "all" EP4TAs.
950          */
951         if (vmxstate->vpid != 0) {
952                 invvpid_desc.vpid = vmxstate->vpid;
953                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
954         }
955 done:
956         return (error);
957 }
958
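/*
 * Advance the guest %rip past the instruction that caused the VM exit so
 * that the guest resumes execution at the following instruction.
 */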
959 static void 
960 vm_exit_update_rip(struct vm_exit *vmexit)
961 {
962         int error;
963
964         error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
965         if (error)
966                 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
967 }
968
969 /*
970  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
971  */
972 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
973
974 static void __inline
975 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
976 {
977         int error;
978
979         vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
980
981         error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
982         if (error)
983                 panic("vmx_set_int_window_exiting: vmwrite error %d", error);
984 }
985
986 static void __inline
987 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
988 {
989         int error;
990
991         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
992
993         error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
994         if (error)
995                 panic("vmx_clear_int_window_exiting: vmwrite error %d", error);
996 }
997
998 static void __inline
999 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
1000 {
1001         int error;
1002
1003         vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
1004
1005         error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1006         if (error)
1007                 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error);
1008 }
1009
1010 static void __inline
1011 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
1012 {
1013         int error;
1014
1015         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
1016
1017         error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
1018         if (error)
1019                 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error);
1020 }
1021
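/*
 * Try to inject a pending NMI through the VM-entry interruption information
 * field.  If the guest is currently blocking NMIs (or sits in an STI/MOV-SS
 * interrupt shadow), request an NMI-window VM exit instead so the NMI can be
 * injected as soon as the blocking condition clears.
 */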
1022 static int
1023 vmx_inject_nmi(struct vmx *vmx, int vcpu)
1024 {
1025         int error;
1026         uint64_t info, interruptibility;
1027
1028         /* Bail out if no NMI requested */
1029         if (!vm_nmi_pending(vmx->vm, vcpu))
1030                 return (0);
1031
1032         error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
1033         if (error) {
1034                 panic("vmx_inject_nmi: vmread(interruptibility) %d",
1035                         error);
1036         }
1037         if (interruptibility & nmi_blocking_bits)
1038                 goto nmiblocked;
1039
1040         /*
1041          * Inject the virtual NMI. The vector must be the NMI IDT entry
1042          * or the VMCS entry check will fail.
1043          */
1044         info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
1045         info |= IDT_NMI;
1046
1047         error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
1048         if (error)
1049                 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error);
1050
1051         VMM_CTR0(vmx->vm, vcpu, "Injecting vNMI");
1052
1053         /* Clear the request */
1054         vm_nmi_clear(vmx->vm, vcpu);
1055         return (1);
1056
1057 nmiblocked:
1058         /*
1059          * Set the NMI Window Exiting execution control so we can inject
1060          * the virtual NMI as soon as the blocking condition goes away.
1061          */
1062         vmx_set_nmi_window_exiting(vmx, vcpu);
1063
1064         VMM_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
1065         return (1);
1066 }
1067
1068 static void
1069 vmx_inject_interrupts(struct vmx *vmx, int vcpu)
1070 {
1071         int error, vector;
1072         uint64_t info, rflags, interruptibility;
1073
1074         const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
1075                                    VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
1076
1077         /*
1078          * If there is already an interrupt pending then just return.
1079          *
1080          * This could happen if an interrupt was injected on a prior
1081          * VM entry but the actual entry into guest mode was aborted
1082          * because of a pending AST.
1083          */
1084         error = vmread(VMCS_ENTRY_INTR_INFO, &info);
1085         if (error)
1086                 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error);
1087         if (info & VMCS_INTERRUPTION_INFO_VALID)
1088                 return;
1089
1090         /*
1091          * NMI injection has priority so deal with those first
1092          */
1093         if (vmx_inject_nmi(vmx, vcpu))
1094                 return;
1095
1096         /* Ask the local apic for a vector to inject */
1097         vector = lapic_pending_intr(vmx->vm, vcpu);
1098         if (vector < 0)
1099                 return;
1100
1101         if (vector < 32 || vector > 255)
1102                 panic("vmx_inject_interrupts: invalid vector %d\n", vector);
1103
1104         /* Check RFLAGS.IF and the interruptibility state of the guest */
1105         error = vmread(VMCS_GUEST_RFLAGS, &rflags);
1106         if (error)
1107                 panic("vmx_inject_interrupts: vmread(rflags) %d", error);
1108
1109         if ((rflags & PSL_I) == 0)
1110                 goto cantinject;
1111
1112         error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility);
1113         if (error) {
1114                 panic("vmx_inject_interrupts: vmread(interruptibility) %d",
1115                         error);
1116         }
1117         if (interruptibility & HWINTR_BLOCKED)
1118                 goto cantinject;
1119
1120         /* Inject the interrupt */
1121         info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
1122         info |= vector;
1123         error = vmwrite(VMCS_ENTRY_INTR_INFO, info);
1124         if (error)
1125                 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error);
1126
1127         /* Update the Local APIC ISR */
1128         lapic_intr_accepted(vmx->vm, vcpu, vector);
1129
1130         VMM_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1131
1132         return;
1133
1134 cantinject:
1135         /*
1136          * Set the Interrupt Window Exiting execution control so we can inject
1137          * the interrupt as soon as the blocking condition goes away.
1138          */
1139         vmx_set_int_window_exiting(vmx, vcpu);
1140
1141         VMM_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1142 }
1143
1144 static int
1145 vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1146 {
1147         int error, cr, vmcs_guest_cr, vmcs_shadow_cr;
1148         uint64_t crval, regval, ones_mask, zeros_mask;
1149         const struct vmxctx *vmxctx;
1150
1151         /* We only handle mov to %cr0 or %cr4 at this time */
1152         if ((exitqual & 0xf0) != 0x00)
1153                 return (UNHANDLED);
1154
1155         cr = exitqual & 0xf;
1156         if (cr != 0 && cr != 4)
1157                 return (UNHANDLED);
1158
1159         vmxctx = &vmx->ctx[vcpu];
1160
1161         /*
1162          * We must use vmwrite() directly here because vmcs_setreg() will
1163          * call vmclear(vmcs) as a side-effect which we certainly don't want.
1164          */
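        /*
         * Bits 11:8 of the exit qualification identify the general purpose
         * register that is the source operand of the "mov to cr"
         * instruction.
         */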
1165         switch ((exitqual >> 8) & 0xf) {
1166         case 0:
1167                 regval = vmxctx->guest_rax;
1168                 break;
1169         case 1:
1170                 regval = vmxctx->guest_rcx;
1171                 break;
1172         case 2:
1173                 regval = vmxctx->guest_rdx;
1174                 break;
1175         case 3:
1176                 regval = vmxctx->guest_rbx;
1177                 break;
1178         case 4:
1179                 error = vmread(VMCS_GUEST_RSP, &regval);
1180                 if (error) {
1181                         panic("vmx_emulate_cr_access: "
1182                               "error %d reading guest rsp", error);
1183                 }
1184                 break;
1185         case 5:
1186                 regval = vmxctx->guest_rbp;
1187                 break;
1188         case 6:
1189                 regval = vmxctx->guest_rsi;
1190                 break;
1191         case 7:
1192                 regval = vmxctx->guest_rdi;
1193                 break;
1194         case 8:
1195                 regval = vmxctx->guest_r8;
1196                 break;
1197         case 9:
1198                 regval = vmxctx->guest_r9;
1199                 break;
1200         case 10:
1201                 regval = vmxctx->guest_r10;
1202                 break;
1203         case 11:
1204                 regval = vmxctx->guest_r11;
1205                 break;
1206         case 12:
1207                 regval = vmxctx->guest_r12;
1208                 break;
1209         case 13:
1210                 regval = vmxctx->guest_r13;
1211                 break;
1212         case 14:
1213                 regval = vmxctx->guest_r14;
1214                 break;
1215         case 15:
1216                 regval = vmxctx->guest_r15;
1217                 break;
1218         }
1219
1220         if (cr == 0) {
1221                 ones_mask = cr0_ones_mask;
1222                 zeros_mask = cr0_zeros_mask;
1223                 vmcs_guest_cr = VMCS_GUEST_CR0;
1224                 vmcs_shadow_cr = VMCS_CR0_SHADOW;
1225         } else {
1226                 ones_mask = cr4_ones_mask;
1227                 zeros_mask = cr4_zeros_mask;
1228                 vmcs_guest_cr = VMCS_GUEST_CR4;
1229                 vmcs_shadow_cr = VMCS_CR4_SHADOW;
1230         }
1231
1232         error = vmwrite(vmcs_shadow_cr, regval);
1233         if (error) {
1234                 panic("vmx_emulate_cr_access: error %d writing cr%d shadow",
1235                       error, cr);
1236         }
1237
1238         crval = regval | ones_mask;
1239         crval &= ~zeros_mask;
1240         error = vmwrite(vmcs_guest_cr, crval);
1241         if (error) {
1242                 panic("vmx_emulate_cr_access: error %d writing cr%d",
1243                       error, cr);
1244         }
1245
1246         if (cr == 0 && regval & CR0_PG) {
1247                 uint64_t efer, entry_ctls;
1248
1249                 /*
1250                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1251                  * the "IA-32e mode guest" bit in VM-entry control must be
1252                  * equal.
1253                  */
1254                 error = vmread(VMCS_GUEST_IA32_EFER, &efer);
1255                 if (error) {
1256                         panic("vmx_emulate_cr_access: error %d efer read",
1257                               error);
1258                 }
1259                 if (efer & EFER_LME) {
1260                         efer |= EFER_LMA;
1261                         error = vmwrite(VMCS_GUEST_IA32_EFER, efer);
1262                         if (error) {
1263                                 panic("vmx_emulate_cr_access: error %d"
1264                                       " efer write", error);
1265                         }
1266                         error = vmread(VMCS_ENTRY_CTLS, &entry_ctls);
1267                         if (error) {
1268                                 panic("vmx_emulate_cr_access: error %d"
1269                                       " entry ctls read", error);
1270                         }
1271                         entry_ctls |= VM_ENTRY_GUEST_LMA;
1272                         error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
1273                         if (error) {
1274                                 panic("vmx_emulate_cr_access: error %d"
1275                                       " entry ctls write", error);
1276                         }
1277                 }
1278         }
1279
1280         return (HANDLED);
1281 }
1282
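/*
 * Helpers to decode the EPT violation exit qualification: bit 0 is set for
 * a data read, bit 1 for a data write and bit 2 for an instruction fetch.
 * Bits 3-5 report whether the faulting guest-physical address was readable,
 * writeable or executable, and bits 7-8 indicate whether a guest-linear
 * address was involved and whether the access was to its final translation.
 */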
1283 static int
1284 ept_fault_type(uint64_t ept_qual)
1285 {
1286         int fault_type;
1287
1288         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1289                 fault_type = VM_PROT_WRITE;
1290         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1291                 fault_type = VM_PROT_EXECUTE;
1292         else
1293                 fault_type = VM_PROT_READ;
1294
1295         return (fault_type);
1296 }
1297
1298 static int
1299 ept_protection(uint64_t ept_qual)
1300 {
1301         int prot = 0;
1302
1303         if (ept_qual & EPT_VIOLATION_GPA_READABLE)
1304                 prot |= VM_PROT_READ;
1305         if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE)
1306                 prot |= VM_PROT_WRITE;
1307         if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE)
1308                 prot |= VM_PROT_EXECUTE;
1309
1310         return (prot);
1311 }
1312
1313 static boolean_t
1314 ept_emulation_fault(uint64_t ept_qual)
1315 {
1316         int read, write;
1317
1318         /* EPT fault on an instruction fetch doesn't make sense here */
1319         if (ept_qual & EPT_VIOLATION_INST_FETCH)
1320                 return (FALSE);
1321
1322         /* EPT fault must be a read fault or a write fault */
1323         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1324         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1325         if ((read | write) == 0)
1326                 return (FALSE);
1327
1328         /*
1329          * The EPT violation must have been caused by accessing a
1330          * guest-physical address that is a translation of a guest-linear
1331          * address.
1332          */
1333         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1334             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1335                 return (FALSE);
1336         }
1337
1338         return (TRUE);
1339 }
1340
1341 static int
1342 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1343 {
1344         int error, handled;
1345         struct vmcs *vmcs;
1346         struct vmxctx *vmxctx;
1347         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
1348         uint64_t qual, gpa;
1349
1350         handled = 0;
1351         vmcs = &vmx->vmcs[vcpu];
1352         vmxctx = &vmx->ctx[vcpu];
1353         qual = vmexit->u.vmx.exit_qualification;
1354         reason = vmexit->u.vmx.exit_reason;
1355         vmexit->exitcode = VM_EXITCODE_BOGUS;
1356
1357         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1358
1359         /*
1360          * VM exits that could be triggered during event injection on the
1361          * previous VM entry need to be handled specially by re-injecting
1362          * the event.
1363          *
1364          * See "Information for VM Exits During Event Delivery" in Intel SDM
1365          * for details.
1366          */
1367         switch (reason) {
1368         case EXIT_REASON_EPT_FAULT:
1369         case EXIT_REASON_EPT_MISCONFIG:
1370         case EXIT_REASON_APIC:
1371         case EXIT_REASON_TASK_SWITCH:
1372         case EXIT_REASON_EXCEPTION:
1373                 idtvec_info = vmcs_idt_vectoring_info();
1374                 if (idtvec_info & VMCS_IDT_VEC_VALID) {
1375                         idtvec_info &= ~(1 << 12); /* clear undefined bit */
1376                         vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info);
1377                         if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1378                                 idtvec_err = vmcs_idt_vectoring_err();
1379                                 vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err);
1380                         }
1381                         vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1382                 }
1383         default:
1384                 break;
1385         }
1386
1387         switch (reason) {
1388         case EXIT_REASON_CR_ACCESS:
1389                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1390                 handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1391                 break;
1392         case EXIT_REASON_RDMSR:
1393                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1394                 ecx = vmxctx->guest_rcx;
1395                 error = emulate_rdmsr(vmx->vm, vcpu, ecx);
1396                 if (error) {
1397                         vmexit->exitcode = VM_EXITCODE_RDMSR;
1398                         vmexit->u.msr.code = ecx;
1399                 } else
1400                         handled = 1;
1401                 break;
1402         case EXIT_REASON_WRMSR:
1403                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1404                 eax = vmxctx->guest_rax;
1405                 ecx = vmxctx->guest_rcx;
1406                 edx = vmxctx->guest_rdx;
1407                 error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1408                                         (uint64_t)edx << 32 | eax);
1409                 if (error) {
1410                         vmexit->exitcode = VM_EXITCODE_WRMSR;
1411                         vmexit->u.msr.code = ecx;
1412                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1413                 } else
1414                         handled = 1;
1415                 break;
1416         case EXIT_REASON_HLT:
1417                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1418                 vmexit->exitcode = VM_EXITCODE_HLT;
1419                 break;
1420         case EXIT_REASON_MTF:
1421                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1422                 vmexit->exitcode = VM_EXITCODE_MTRAP;
1423                 break;
1424         case EXIT_REASON_PAUSE:
1425                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1426                 vmexit->exitcode = VM_EXITCODE_PAUSE;
1427                 break;
1428         case EXIT_REASON_INTR_WINDOW:
1429                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1430                 vmx_clear_int_window_exiting(vmx, vcpu);
1431                 VMM_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1432                 return (1);
1433         case EXIT_REASON_EXT_INTR:
1434                 /*
1435                  * External interrupts serve only to cause VM exits and allow
1436                  * the host interrupt handler to run.
1437                  *
1438                  * If this external interrupt triggers a virtual interrupt
1439                  * to a VM, then that state will be recorded by the
1440                  * host interrupt handler in the VM's softc. We will inject
1441                  * this virtual interrupt during the subsequent VM enter.
1442                  */
1443
1444                 /*
1445                  * This is special. We want to treat this as a 'handled'
1446                  * VM-exit but not increment the instruction pointer.
1447                  */
1448                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1449                 return (1);
1450         case EXIT_REASON_NMI_WINDOW:
1451                 /* Exit to allow the pending virtual NMI to be injected */
1452                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1453                 vmx_clear_nmi_window_exiting(vmx, vcpu);
1454                 VMM_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1455                 return (1);
1456         case EXIT_REASON_INOUT:
1457                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1458                 vmexit->exitcode = VM_EXITCODE_INOUT;
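                /*
                 * Decode the I/O instruction exit qualification: bits 2:0
                 * hold the access size minus one, bit 3 the direction
                 * (1 = in), bit 4 the string instruction flag, bit 5 the
                 * REP prefix flag and bits 31:16 the port number.
                 */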
1459                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
1460                 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1461                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1462                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1463                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
1464                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1465                 break;
1466         case EXIT_REASON_CPUID:
1467                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1468                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1469                 break;
1470         case EXIT_REASON_EPT_FAULT:
1471                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
1472                 /*
1473                  * If 'gpa' lies within the address space allocated to
1474                  * guest memory then this must be a nested page fault;
1475                  * otherwise it must be an instruction that accesses MMIO space.
1476                  */
1477                 gpa = vmcs_gpa();
1478                 if (vm_mem_allocated(vmx->vm, gpa)) {
1479                         vmexit->exitcode = VM_EXITCODE_PAGING;
1480                         vmexit->u.paging.gpa = gpa;
1481                         vmexit->u.paging.fault_type = ept_fault_type(qual);
1482                         vmexit->u.paging.protection = ept_protection(qual);
1483                 } else if (ept_emulation_fault(qual)) {
1484                         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1485                         vmexit->u.inst_emul.gpa = gpa;
1486                         vmexit->u.inst_emul.gla = vmcs_gla();
1487                         vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1488                 }
1489                 break;
1490         default:
1491                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1492                 break;
1493         }
1494
1495         if (handled) {
1496                 /*
1497                  * It is possible that control is returned to userland
1498                  * even though we were able to handle the VM exit in the
1499                  * kernel.
1500                  *
1501                  * In such a case we want to make sure that userland
1502                  * restarts guest execution at the instruction *after*
1503                  * the one we just processed. Therefore we update the
1504                  * guest rip in the VMCS and in 'vmexit'.
1505                  */
1506                 vm_exit_update_rip(vmexit);
1507                 vmexit->rip += vmexit->inst_length;
1508                 vmexit->inst_length = 0;
1509         } else {
1510                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1511                         /*
1512                          * If this VM exit was not claimed by anybody then
1513                          * treat it as a generic VMX exit.
1514                          */
1515                         vmexit->exitcode = VM_EXITCODE_VMX;
1516                         vmexit->u.vmx.error = 0;
1517                 } else {
1518                         /*
1519                          * The exitcode and collateral have been populated.
1520                          * The VM exit will be processed further in userland.
1521                          */
1522                 }
1523         }
1524         return (handled);
1525 }
1526
1527 static int
1528 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap)
1529 {
1530         int error, vie, rc, handled, astpending;
1531         uint32_t exit_reason;
1532         struct vmx *vmx;
1533         struct vmxctx *vmxctx;
1534         struct vmcs *vmcs;
1535         struct vm_exit *vmexit;
1536
1537         vmx = arg;
1538         vmcs = &vmx->vmcs[vcpu];
1539         vmxctx = &vmx->ctx[vcpu];
1540         vmxctx->launched = 0;
1541
1542         astpending = 0;
1543         vmexit = vm_exitinfo(vmx->vm, vcpu);
1544
1545         KASSERT(vmxctx->pmap == pmap,
1546             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1547         KASSERT(vmxctx->eptp == vmx->eptp,
1548             ("eptp %#lx different than ctx eptp %#lx", vmx->eptp, vmxctx->eptp));
1549
1550         /*
1551          * XXX Can we avoid doing this every time we do a vm run?
1552          */
1553         VMPTRLD(vmcs);
1554
1555         /*
1556          * XXX
1557          * We do this every time because we may set up the virtual machine
1558          * from a different process than the one that actually runs it.
1559          *
1560          * If the life of a virtual machine was spent entirely in the context
1561          * of a single process we could do this once in vmcs_set_defaults().
1562          */
1563         if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0)
1564                 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error);
1565
1566         if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0)
1567                 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
1568
1569         if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0)
1570                 panic("vmx_run: error %d setting up pcpu defaults", error);
1571
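        /*
         * Loop entering the guest until a VM exit occurs that cannot be
         * handled entirely in the kernel and must be returned to userland
         * for further processing.
         */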
1572         do {
1573                 lapic_timer_tick(vmx->vm, vcpu);
1574                 vmx_inject_interrupts(vmx, vcpu);
1575                 vmx_run_trace(vmx, vcpu);
1576                 rc = vmx_setjmp(vmxctx);
1577 #ifdef SETJMP_TRACE
1578                 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
1579 #endif
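                /*
                 * VMX_RETURN_DIRECT is the direct return from vmx_setjmp();
                 * the other codes indicate how control came back from the
                 * guest or why the VM entry failed.
                 */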
1580                 switch (rc) {
1581                 case VMX_RETURN_DIRECT:
1582                         if (vmxctx->launched == 0) {
1583                                 vmxctx->launched = 1;
1584                                 vmx_launch(vmxctx);
1585                         } else
1586                                 vmx_resume(vmxctx);
1587                         panic("vmx_launch/resume should not return");
1588                         break;
1589                 case VMX_RETURN_LONGJMP:
1590                         break;                  /* vm exit */
1591                 case VMX_RETURN_AST:
1592                         astpending = 1;
1593                         break;
1594                 case VMX_RETURN_VMRESUME:
1595                         vie = vmcs_instruction_error();
1596                         if (vmxctx->launch_error == VM_FAIL_INVALID ||
1597                             vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
1598                                 printf("vmresume error %d vmcs inst error %d\n",
1599                                         vmxctx->launch_error, vie);
1600                                 goto err_exit;
1601                         }
1602                         vmx_launch(vmxctx);     /* try to launch the guest */
1603                         panic("vmx_launch should not return");
1604                         break;
1605                 case VMX_RETURN_VMLAUNCH:
1606                         vie = vmcs_instruction_error();
1607 #if 1
1608                         printf("vmlaunch error %d vmcs inst error %d\n",
1609                                 vmxctx->launch_error, vie);
1610 #endif
1611                         goto err_exit;
1612                 case VMX_RETURN_INVEPT:
1613                         panic("vm %s:%d invept error %d",
1614                               vm_name(vmx->vm), vcpu, vmxctx->launch_error);
1615                 default:
1616                         panic("vmx_setjmp returned %d", rc);
1617                 }
1618
1619                 /* enable interrupts */
1620                 enable_intr();
1621
1622                 /* collect some basic information for VM exit processing */
1623                 vmexit->rip = rip = vmcs_guest_rip();
1624                 vmexit->inst_length = vmexit_instruction_length();
1625                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1626                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1627
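                /*
                 * An AST was pending when we tried to enter the guest.
                 * Return to userland with a BOGUS exitcode and a zero
                 * instruction length so that the guest is resumed at the
                 * same rip after the AST has been serviced.
                 */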
1628                 if (astpending) {
1629                         handled = 1;
1630                         vmexit->inst_length = 0;
1631                         vmexit->exitcode = VM_EXITCODE_BOGUS;
1632                         vmx_astpending_trace(vmx, vcpu, rip);
1633                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1634                         break;
1635                 }
1636
1637                 handled = vmx_exit_process(vmx, vcpu, vmexit);
1638                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1639
1640         } while (handled);
1641
1642         /*
1643          * If a VM exit has been handled then the exitcode must be BOGUS.
1644          * If a VM exit is not handled then the exitcode must not be BOGUS.
1645          */
1646         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1647             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1648                 panic("Mismatch between handled (%d) and exitcode (%d)",
1649                       handled, vmexit->exitcode);
1650         }
1651
1652         if (!handled)
1653                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
1654
1655         VMM_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", vmexit->exitcode);
1656
1657         /*
1658          * XXX
1659          * We need to do this to ensure that any VMCS state cached by the
1660          * processor is flushed to memory. We need to do this in case the
1661          * VM moves to a different cpu the next time it runs.
1662          *
1663          * Can we avoid doing this?
1664          */
1665         VMCLEAR(vmcs);
1666         return (0);
1667
1668 err_exit:
1669         vmexit->exitcode = VM_EXITCODE_VMX;
1670         vmexit->u.vmx.exit_reason = (uint32_t)-1;
1671         vmexit->u.vmx.exit_qualification = (uint32_t)-1;
1672         vmexit->u.vmx.error = vie;
1673         VMCLEAR(vmcs);
1674         return (ENOEXEC);
1675 }
1676
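/*
 * Tear down the per-VM VT-x state: return the vcpu VPIDs to the allocator,
 * clear the VMCS and free the softc.
 */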
1677 static void
1678 vmx_vmcleanup(void *arg)
1679 {
1680         int i, error;
1681         struct vmx *vmx = arg;
1682
1683         for (i = 0; i < VM_MAXCPU; i++)
1684                 vpid_free(vmx->state[i].vpid);
1685
1686         /*
1687          * XXXSMP we also need to clear the VMCSs active on the other vcpus.
1688          */
1689         error = vmclear(&vmx->vmcs[0]);
1690         if (error != 0)
1691                 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
1692
1693         free(vmx, M_VMX);
1694
1695         return;
1696 }
1697
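/*
 * Return a pointer to the saved copy of guest register 'reg' in the register
 * context, or NULL if the register is not maintained there (i.e. it lives in
 * the VMCS instead).
 */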
1698 static register_t *
1699 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
1700 {
1701
1702         switch (reg) {
1703         case VM_REG_GUEST_RAX:
1704                 return (&vmxctx->guest_rax);
1705         case VM_REG_GUEST_RBX:
1706                 return (&vmxctx->guest_rbx);
1707         case VM_REG_GUEST_RCX:
1708                 return (&vmxctx->guest_rcx);
1709         case VM_REG_GUEST_RDX:
1710                 return (&vmxctx->guest_rdx);
1711         case VM_REG_GUEST_RSI:
1712                 return (&vmxctx->guest_rsi);
1713         case VM_REG_GUEST_RDI:
1714                 return (&vmxctx->guest_rdi);
1715         case VM_REG_GUEST_RBP:
1716                 return (&vmxctx->guest_rbp);
1717         case VM_REG_GUEST_R8:
1718                 return (&vmxctx->guest_r8);
1719         case VM_REG_GUEST_R9:
1720                 return (&vmxctx->guest_r9);
1721         case VM_REG_GUEST_R10:
1722                 return (&vmxctx->guest_r10);
1723         case VM_REG_GUEST_R11:
1724                 return (&vmxctx->guest_r11);
1725         case VM_REG_GUEST_R12:
1726                 return (&vmxctx->guest_r12);
1727         case VM_REG_GUEST_R13:
1728                 return (&vmxctx->guest_r13);
1729         case VM_REG_GUEST_R14:
1730                 return (&vmxctx->guest_r14);
1731         case VM_REG_GUEST_R15:
1732                 return (&vmxctx->guest_r15);
1733         default:
1734                 break;
1735         }
1736         return (NULL);
1737 }
1738
1739 static int
1740 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
1741 {
1742         register_t *regp;
1743
1744         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1745                 *retval = *regp;
1746                 return (0);
1747         } else
1748                 return (EINVAL);
1749 }
1750
1751 static int
1752 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
1753 {
1754         register_t *regp;
1755
1756         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1757                 *regp = val;
1758                 return (0);
1759         } else
1760                 return (EINVAL);
1761 }
1762
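/*
 * Map a guest control register to its CR shadow field in the VMCS, or return
 * -1 if the register has no shadow.
 */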
1763 static int
1764 vmx_shadow_reg(int reg)
1765 {
1766         int shreg;
1767
1768         shreg = -1;
1769
1770         switch (reg) {
1771         case VM_REG_GUEST_CR0:
1772                 shreg = VMCS_CR0_SHADOW;
1773                 break;
1774         case VM_REG_GUEST_CR4:
1775                 shreg = VMCS_CR4_SHADOW;
1776                 break;
1777         default:
1778                 break;
1779         }
1780
1781         return (shreg);
1782 }
1783
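/*
 * Fetch a guest register either from the software register context or,
 * failing that, from the VMCS.
 */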
1784 static int
1785 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
1786 {
1787         int running, hostcpu;
1788         struct vmx *vmx = arg;
1789
1790         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1791         if (running && hostcpu != curcpu)
1792                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
1793
1794         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
1795                 return (0);
1796
1797         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
1798 }
1799
1800 static int
1801 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
1802 {
1803         int error, hostcpu, running, shadow;
1804         uint64_t ctls;
1805         struct vmx *vmx = arg;
1806
1807         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1808         if (running && hostcpu != curcpu)
1809                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
1810
1811         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
1812                 return (0);
1813
1814         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
1815
1816         if (error == 0) {
1817                 /*
1818                  * If the "load EFER" VM-entry control is 1 then the
1819                  * value of EFER.LMA must be identical to the "IA-32e mode guest"
1820                  * bit in the VM-entry controls.
1821                  */
1822                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
1823                     (reg == VM_REG_GUEST_EFER)) {
1824                         vmcs_getreg(&vmx->vmcs[vcpu], running,
1825                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
1826                         if (val & EFER_LMA)
1827                                 ctls |= VM_ENTRY_GUEST_LMA;
1828                         else
1829                                 ctls &= ~VM_ENTRY_GUEST_LMA;
1830                         vmcs_setreg(&vmx->vmcs[vcpu], running,
1831                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
1832                 }
1833
1834                 shadow = vmx_shadow_reg(reg);
1835                 if (shadow > 0) {
1836                         /*
1837                          * Store the unmodified value in the shadow.
1838                          */
1839                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
1840                                     VMCS_IDENT(shadow), val);
1841                 }
1842         }
1843
1844         return (error);
1845 }
1846
1847 static int
1848 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1849 {
1850         struct vmx *vmx = arg;
1851
1852         return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
1853 }
1854
1855 static int
1856 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1857 {
1858         struct vmx *vmx = arg;
1859
1860         return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
1861 }
1862
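/*
 * Inject an event through the VM-entry interruption-information field:
 * bits 7:0 hold the vector, bits 10:8 the event type, bit 11 the "deliver
 * error code" flag and bit 31 marks the field as valid.
 */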
1863 static int
1864 vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
1865            int code_valid)
1866 {
1867         int error;
1868         uint64_t info;
1869         struct vmx *vmx = arg;
1870         struct vmcs *vmcs = &vmx->vmcs[vcpu];
1871
1872         static uint32_t type_map[VM_EVENT_MAX] = {
1873                 0x1,            /* VM_EVENT_NONE */
1874                 0x0,            /* VM_HW_INTR */
1875                 0x2,            /* VM_NMI */
1876                 0x3,            /* VM_HW_EXCEPTION */
1877                 0x4,            /* VM_SW_INTR */
1878                 0x5,            /* VM_PRIV_SW_EXCEPTION */
1879                 0x6,            /* VM_SW_EXCEPTION */
1880         };
1881
1882         /*
1883          * If there is already an exception pending to be delivered to the
1884          * vcpu then just return.
1885          */
1886         error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
1887         if (error)
1888                 return (error);
1889
1890         if (info & VMCS_INTERRUPTION_INFO_VALID)
1891                 return (EAGAIN);
1892
1893         info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
1894         info |= VMCS_INTERRUPTION_INFO_VALID;
1895         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
1896         if (error != 0)
1897                 return (error);
1898
1899         if (code_valid) {
1900                 error = vmcs_setreg(vmcs, 0,
1901                                     VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
1902                                     code);
1903         }
1904         return (error);
1905 }
1906
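/*
 * Report whether optional capability 'type' is supported on this host and,
 * if so, whether it is currently enabled for the vcpu.
 */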
1907 static int
1908 vmx_getcap(void *arg, int vcpu, int type, int *retval)
1909 {
1910         struct vmx *vmx = arg;
1911         int vcap;
1912         int ret;
1913
1914         ret = ENOENT;
1915
1916         vcap = vmx->cap[vcpu].set;
1917
1918         switch (type) {
1919         case VM_CAP_HALT_EXIT:
1920                 if (cap_halt_exit)
1921                         ret = 0;
1922                 break;
1923         case VM_CAP_PAUSE_EXIT:
1924                 if (cap_pause_exit)
1925                         ret = 0;
1926                 break;
1927         case VM_CAP_MTRAP_EXIT:
1928                 if (cap_monitor_trap)
1929                         ret = 0;
1930                 break;
1931         case VM_CAP_UNRESTRICTED_GUEST:
1932                 if (cap_unrestricted_guest)
1933                         ret = 0;
1934                 break;
1935         default:
1936                 break;
1937         }
1938
1939         if (ret == 0)
1940                 *retval = (vcap & (1 << type)) ? 1 : 0;
1941
1942         return (ret);
1943 }
1944
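/*
 * Enable or disable an optional capability by toggling the corresponding
 * VM-execution control bit in the vcpu's VMCS.
 */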
1945 static int
1946 vmx_setcap(void *arg, int vcpu, int type, int val)
1947 {
1948         struct vmx *vmx = arg;
1949         struct vmcs *vmcs = &vmx->vmcs[vcpu];
1950         uint32_t baseval;
1951         uint32_t *pptr;
1952         int error;
1953         int flag;
1954         int reg;
1955         int retval;
1956
1957         retval = ENOENT;
1958         pptr = NULL;
1959
1960         switch (type) {
1961         case VM_CAP_HALT_EXIT:
1962                 if (cap_halt_exit) {
1963                         retval = 0;
1964                         pptr = &vmx->cap[vcpu].proc_ctls;
1965                         baseval = *pptr;
1966                         flag = PROCBASED_HLT_EXITING;
1967                         reg = VMCS_PRI_PROC_BASED_CTLS;
1968                 }
1969                 break;
1970         case VM_CAP_MTRAP_EXIT:
1971                 if (cap_monitor_trap) {
1972                         retval = 0;
1973                         pptr = &vmx->cap[vcpu].proc_ctls;
1974                         baseval = *pptr;
1975                         flag = PROCBASED_MTF;
1976                         reg = VMCS_PRI_PROC_BASED_CTLS;
1977                 }
1978                 break;
1979         case VM_CAP_PAUSE_EXIT:
1980                 if (cap_pause_exit) {
1981                         retval = 0;
1982                         pptr = &vmx->cap[vcpu].proc_ctls;
1983                         baseval = *pptr;
1984                         flag = PROCBASED_PAUSE_EXITING;
1985                         reg = VMCS_PRI_PROC_BASED_CTLS;
1986                 }
1987                 break;
1988         case VM_CAP_UNRESTRICTED_GUEST:
1989                 if (cap_unrestricted_guest) {
1990                         retval = 0;
1991                         baseval = procbased_ctls2;
1992                         flag = PROCBASED2_UNRESTRICTED_GUEST;
1993                         reg = VMCS_SEC_PROC_BASED_CTLS;
1994                 }
1995                 break;
1996         default:
1997                 break;
1998         }
1999
2000         if (retval == 0) {
2001                 if (val) {
2002                         baseval |= flag;
2003                 } else {
2004                         baseval &= ~flag;
2005                 }
2006                 VMPTRLD(vmcs);
2007                 error = vmwrite(reg, baseval);
2008                 VMCLEAR(vmcs);
2009
2010                 if (error) {
2011                         retval = error;
2012                 } else {
2013                         /*
2014                          * Update optional stored flags, and record
2015                          * Update the optionally stored flags and record
2016                          * the new setting.
2017                         if (pptr != NULL) {
2018                                 *pptr = baseval;
2019                         }
2020
2021                         if (val) {
2022                                 vmx->cap[vcpu].set |= (1 << type);
2023                         } else {
2024                                 vmx->cap[vcpu].set &= ~(1 << type);
2025                         }
2026                 }
2027         }
2028
2029         return (retval);
2030 }
2031
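/*
 * Entry points exported by the VT-x backend to the machine-independent
 * vmm code.
 */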
2032 struct vmm_ops vmm_ops_intel = {
2033         vmx_init,
2034         vmx_cleanup,
2035         vmx_vminit,
2036         vmx_run,
2037         vmx_vmcleanup,
2038         vmx_getreg,
2039         vmx_setreg,
2040         vmx_getdesc,
2041         vmx_setdesc,
2042         vmx_inject,
2043         vmx_getcap,
2044         vmx_setcap,
2045         ept_vmspace_alloc,
2046         ept_vmspace_free,
2047 };