/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
	enum vcpu_state	state;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	struct savefpu	*guestfpu;	/* guest fpu state */
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
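
/*
 * Illustrative sketch (not from the original source): the usual pattern is
 * to take the per-vcpu spin lock around any read-modify-write of vcpu state,
 * as vcpu_set_state_locked() does further down:
 *
 *	vcpu_lock(vcpu);
 *	while (vcpu->state != VCPU_IDLE)
 *		msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 *	vcpu_unlock(vcpu);
 */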
#define	VM_MAX_MEMORY_SEGMENTS	2

	void		*cookie;		/* processor-specific data */
	void		*iommu;			/* iommu-specific data */
	struct vhpet	*vhpet;			/* virtual HPET */
	struct vioapic	*vioapic;		/* virtual ioapic */
	struct vmspace	*vmspace;		/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	volatile cpuset_t active_cpus;

	struct mtx	rendezvous_mtx;
	cpuset_t	rendezvous_req_cpus;
	cpuset_t	rendezvous_done_cpus;
	void		*rendezvous_arg;
	vm_rendezvous_func_t rendezvous_func;
static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
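
/*
 * A sketch, not from this file: each hardware backend exports its entry
 * points in a 'struct vmm_ops' that the macros above dispatch through once
 * 'ops' has been pointed at it in vmm_init() (e.g. 'ops = &vmm_ops_intel').
 * The field names mirror the accesses above; the handler names here are
 * hypothetical:
 *
 *	struct vmm_ops vmm_ops_example = {
 *		.init		= ex_init,
 *		.cleanup	= ex_cleanup,
 *		.resume		= ex_resume,
 *		.vminit		= ex_vminit,
 *		.vmrun		= ex_vmrun,
 *		.vmcleanup	= ex_vmcleanup,
 *		...
 *	};
 */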
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");
static void vm_deactivate_cpu(struct vm *vm, int vcpuid);

vcpu_cleanup(struct vm *vm, int i)
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);

vcpu_init(struct vm *vm, uint32_t vcpu_id)
	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();

vm_exitinfo(struct vm *vm, int cpuid)
	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();

		ops = &vmm_ops_intel;
	else if (vmm_is_amd())

	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
vmm_handler(module_t mod, int what, void *arg)
		if (ppt_avail_devices() > 0)

		error = vmmdev_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */

static moduledata_t vmm_kmod = {

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
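
/*
 * Usage note (not part of the source): as a standard FreeBSD kernel module
 * the hypervisor can be loaded at runtime with 'kldload vmm', or at boot by
 * adding 'vmm_load="YES"' to /boot/loader.conf.
 */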
vm_create(const char *name, struct vm **retvm)
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		guest_msrs_init(vm, i);

	vm_activate_cpu(vm, BSP);
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
vm_destroy(struct vm *vm)
	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

vm_name(struct vm *vm)
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)

vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
	vmm_mmio_free(vm->vmspace, gpa, len);

vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
	int available, allocated;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)

	available = allocated = 0;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)

	seg->object = object;
vm_gpa_unwire(struct vm *vm)
	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

vm_gpa_wire(struct vm *vm)
	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];

		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
vm_iommu_modify(struct vm *vm, boolean_t map)
	void *vp, *cookie, *host_domain;

	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
		iommu_invalidate_tlb(host_domain);
		iommu_invalidate_tlb(vm->iommu);

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
	error = ppt_unassign_device(vm, bus, slot, func);

	if (ppt_assigned_devices(vm) == 0) {

vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);

	error = ppt_assign_device(vm, bus, slot, func);
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));

vm_gpa_release(void *cookie)
	vm_page_t m = cookie;
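
/*
 * A sketch of the intended pairing (illustrative, not from this file; the
 * final '&cookie' argument is inferred from the call in vm_iommu_modify()):
 * every successful vm_gpa_hold() must be matched by a vm_gpa_release() once
 * the caller is done touching the page through the direct map.
 *
 *	void *cookie;
 *	void *vp;
 *
 *	vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_READ, &cookie);
 *	if (vp != NULL) {
 *		... access the page via the DMAP address 'vp' ...
 *		vm_gpa_release(cookie);
 *	}
 */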
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;

vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			vm_object_reference(seg_obj);
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (reg >= VM_REG_LAST)

	return (VMGETREG(vm->cookie, vcpu, reg, retval));

vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (reg >= VM_REG_LAST)

	return (VMSETREG(vm->cookie, vcpu, reg, val));
is_descriptor_table(int reg)
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:

is_segment_register(int reg)
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:

vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (!is_segment_register(reg) && !is_descriptor_table(reg))

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));

vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (!is_segment_register(reg) && !is_descriptor_table(reg))

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
restore_guest_fpustate(struct vcpu *vcpu)
	/* flush host state to the pcb */

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();

save_guest_fpustate(struct vcpu *vcpu)
	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
		error = (newstate != VCPU_FROZEN);
		error = (newstate == VCPU_FROZEN);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);

vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
		VCPU_CTR0(vm, vcpuid, fmt);				\
vm_handle_rendezvous(struct vm *vm, int vcpuid)
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
	mtx_unlock(&vm->rendezvous_mtx);
/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
	struct vm_exit *vmexit;
	int t, timo, spindown;

	vcpu = &vm->vcpu[vcpuid];

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);

	/*
	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
	 * outside the confines of the vcpu spinlock.
	 */
		vmexit = vm_exitinfo(vm, vcpuid);
		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
		vm_deactivate_cpu(vm, vcpuid);
		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)

	/* restart execution at the faulting instruction */
	vme->inst_length = 0;
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	inst_length = vme->inst_length;
	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
vm_run(struct vm *vm, struct vm_run *vmrun)
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

		switch (vme->exitcode) {
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			retu = true;	/* handled in userland */

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)

	if (vector < 0 || vector > 255)

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

vm_inject_nmi(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);

vm_nmi_pending(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);

vm_nmi_clear(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
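
/*
 * Illustrative flow (an assumption about the caller, not from this file):
 * vm_inject_nmi() latches the request and kicks the vcpu; the processor
 * backend is then expected to poll vm_nmi_pending() on its way back into
 * the guest and acknowledge the injection with vm_nmi_clear(), roughly:
 *
 *	if (vm_nmi_pending(vm, vcpuid)) {
 *		... program the hardware to deliver an NMI ...
 *		vm_nmi_clear(vm, vcpuid);
 *	}
 */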
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (type < 0 || type >= VM_CAP_MAX)

	return (VMGETCAP(vm->cookie, vcpu, type, retval));

vm_set_capability(struct vm *vm, int vcpu, int type, int val)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (type < 0 || type >= VM_CAP_MAX)

	return (VMSETCAP(vm->cookie, vcpu, type, val));

vm_guest_msrs(struct vm *vm, int cpu)
	return (vm->vcpu[cpu].guest_msrs);

vm_lapic(struct vm *vm, int cpu)
	return (vm->vcpu[cpu].vlapic);

vm_ioapic(struct vm *vm)
	return (vm->vioapic);

vm_hpet(struct vm *vm)
vmm_is_pptdev(int bus, int slot, int func)
	char *val, *cp, *cp2;

	/*
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {

vm_iommu_domain(struct vm *vm)
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	error = vcpu_set_state_locked(vcpu, newstate, from_idle);

vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
vm_activate_cpu(struct vm *vm, int vcpuid)
	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);

vm_deactivate_cpu(struct vm *vm, int vcpuid)
	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));

	VCPU_CTR0(vm, vcpuid, "deactivated");
	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);

	/*
	 * If a vcpu rendezvous is in progress then it could be blocked
	 * on 'vcpuid' - unblock it before disappearing forever.
	 */
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
		wakeup(&vm->rendezvous_func);
	mtx_unlock(&vm->rendezvous_mtx);
vm_active_cpus(struct vm *vm)
	return (vm->active_cpus);

vcpu_stats(struct vm *vm, int vcpuid)
	return (vm->vcpu[vcpuid].stats);

vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	*state = vm->vcpu[vcpuid].x2apic_state;

vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	if (state >= X2APIC_STATE_LAST)

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
	vcpu = &vm->vcpu[vcpuid];

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				ipi_cpu(hostcpu, vmm_ipinum);
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)

vm_get_vmspace(struct vm *vm)
	return (vm->vmspace);
vm_apicid2vcpuid(struct vm *vm, int apicid)
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);

	vm_handle_rendezvous(vm, vcpuid);
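
/*
 * A usage sketch (illustrative; the callback name and body are hypothetical,
 * but the callback signature matches the call through 'rendezvous_func'
 * above): run 'func' once on every active vcpu and return only after all of
 * them have executed it.
 *
 *	static void
 *	example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		... per-vcpu work, e.g. invalidating cached state ...
 *	}
 *
 *	cpuset_t dest = vm_active_cpus(vm);
 *	vm_smp_rendezvous(vm, vcpuid, dest, example_rendezvous_cb, NULL);
 */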