2 * Copyright (c) 2011 NetApp, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/sysctl.h>
37 #include <sys/malloc.h>
40 #include <sys/mutex.h>
42 #include <sys/sched.h>
44 #include <sys/systm.h>
48 #include <machine/vm.h>
49 #include <machine/pcb.h>
50 #include <machine/smp.h>
51 #include <x86/apicreg.h>
53 #include <machine/vmm.h>
57 #include <machine/vmm_dev.h>
62 #include "vmm_lapic.h"
/*
 * Fields of the per-vcpu state stored in vm->vcpu[].
 * NOTE(review): this listing does not show every field; code elsewhere in
 * the file also uses 'mtx', 'vcpuid', 'stats' and 'nmi_pending'.
 */
/* run state; checked by vcpu_set_state() and vm_interrupt_hostcpu() */
71 enum vcpu_state state;
73 int hostcpu; /* host cpuid this vcpu last ran on */
/* per-vcpu MSR array handed out by vm_guest_msrs() */
74 uint64_t guest_msrs[VMM_MSR_NUM];
/* set from vlapic_init() in vcpu_init(), released by vlapic_cleanup() */
75 struct vlapic *vlapic;
77 struct savefpu *guestfpu; /* guest fpu state */
/* exit information that vm_run() copies out to its caller */
79 struct vm_exit exitinfo;
/* value last stored by vm_set_x2apic_state() */
80 enum x2apic_state x2apic_state;
/*
 * Wrappers around the per-vcpu spin mutex 'mtx': initialize it (MTX_SPIN),
 * acquire it and release it.
 */
84 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
85 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
86 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
/* Number of entries in the per-VM mem_segs[] array. */
88 #define VM_MAX_MEMORY_SEGMENTS 2
/*
 * Fields of the per-VM state.
 * NOTE(review): this listing does not show every field; code elsewhere in
 * the file also uses 'num_mem_segs'.
 */
91 void *cookie; /* processor-specific data */
92 void *iommu; /* iommu-specific data */
/* one slot per possible vcpu, indexed by vcpu id */
93 struct vcpu vcpu[VM_MAXCPU];
/* guest memory segments; vm_malloc() fills slot 'num_mem_segs' */
95 struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
/* NUL-terminated name copied in by vm_create() */
96 char name[VM_MAX_NAMELEN];
99 * Set of active vcpus.
100 * An active vcpu is one that has been started implicitly (BSP) or
101 * explicitly (AP) by sending it a startup ipi.
103 cpuset_t active_cpus;
/*
 * Dispatch layer over the processor-specific backend.  'ops' is assigned
 * during module initialization; every macro below calls through it when it
 * is non-NULL and otherwise evaluates to the fallback written in the macro
 * (0, NULL or ENXIO).
 * NOTE(review): the VMMMAP_SET definition is only partially visible in this
 * listing; presumably it follows the same "ops != NULL ? ... : ENXIO" shape.
 */
106 static struct vmm_ops *ops;
107 #define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
108 #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
110 #define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
111 #define VMRUN(vmi, vcpu, rip) \
112 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
113 #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
114 #define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
116 (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
118 #define VMMMAP_GET(vmi, gpa) \
119 (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
120 #define VMGETREG(vmi, vcpu, num, retval) \
121 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
122 #define VMSETREG(vmi, vcpu, num, val) \
123 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
124 #define VMGETDESC(vmi, vcpu, num, desc) \
125 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
126 #define VMSETDESC(vmi, vcpu, num, desc) \
127 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
128 #define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
129 (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
130 #define VMGETCAP(vmi, vcpu, num, retval) \
131 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
132 #define VMSETCAP(vmi, vcpu, num, val) \
133 (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
/*
 * "Emulation" here means CR0.TS is set: fpu_start_emulating() sets the bit
 * and fpu_stop_emulating() clears it with clts().  The FPU save/restore
 * helpers in this file use the pair to trap host access to the FPU while it
 * holds guest state.
 */
135 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
136 #define fpu_stop_emulating() clts()
/* malloc type used for the 'struct vm' allocation in vm_create() */
138 static MALLOC_DEFINE(M_VM, "vm", "vm");
139 CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
/* accumulated by vm_run() as the rdtsc() delta around each run */
142 static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
/*
 * vcpu_cleanup: release what vcpu_init() set up for 'vcpu' by calling the
 * cleanup/free routine for its vlapic, its statistics and its guest FPU
 * save area.
 */
145 vcpu_cleanup(struct vcpu *vcpu)
147 vlapic_cleanup(vcpu->vlapic);
148 vmm_stat_free(vcpu->stats);
149 fpu_save_area_free(vcpu->guestfpu);
/*
 * vcpu_init: initialize slot 'vcpu_id' of vm->vcpu[].
 *
 * Sets up the vcpu lock, marks the vcpu as not running on any host cpu
 * (NOCPU), records its id, creates its vlapic, sets its x2apic state to
 * X2APIC_ENABLED, allocates and resets the guest FPU save area and
 * allocates its statistics.
 * NOTE(review): 'vcpu_id' is used as an array index without a range check
 * in the lines visible here.
 */
153 vcpu_init(struct vm *vm, uint32_t vcpu_id)
157 vcpu = &vm->vcpu[vcpu_id];
159 vcpu_lock_init(vcpu);
160 vcpu->hostcpu = NOCPU;
161 vcpu->vcpuid = vcpu_id;
162 vcpu->vlapic = vlapic_init(vm, vcpu_id);
/*
 * vm_set_x2apic_state() forwards the state to vlapic_set_x2apic_state(),
 * so this presumably has to come after vlapic_init() above.
 */
163 vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
164 vcpu->guestfpu = fpu_save_area_alloc();
165 fpu_save_area_reset(vcpu->guestfpu);
166 vcpu->stats = vmm_stat_alloc();
/*
 * vm_exitinfo: return a pointer to the 'exitinfo' member of vcpu 'cpuid'.
 * Panics if 'cpuid' is outside [0, VM_MAXCPU).
 */
170 vm_exitinfo(struct vm *vm, int cpuid)
174 if (cpuid < 0 || cpuid >= VM_MAXCPU)
175 panic("vm_exitinfo: invalid cpuid %d", cpuid);
177 vcpu = &vm->vcpu[cpuid];
179 return (&vcpu->exitinfo);
/*
 * NOTE(review): the header of this function is not visible in this listing;
 * it looks like the module initialization routine.  The visible steps are:
 * initialize host state, initialize the memory subsystem (result kept in
 * 'error'), then select the backend by assigning 'ops' -- the Intel ops are
 * shown, followed by a vmm_is_amd() branch whose body is not visible.
 */
187 vmm_host_state_init();
190 error = vmm_mem_init();
195 ops = &vmm_ops_intel;
196 else if (vmm_is_amd())
/*
 * vmm_handler: module event handler referenced through vmm_kmod.
 * NOTE(review): the dispatch on 'what' is not visible in this listing; the
 * two calls shown (device cleanup, then backend cleanup via VMM_CLEANUP())
 * presumably belong to the unload path -- confirm against the full source.
 */
207 vmm_handler(module_t mod, int what, void *arg)
218 error = vmmdev_cleanup();
222 error = VMM_CLEANUP();
/*
 * Module glue: the moduledata record, the module declaration and version,
 * and the hw.vmm sysctl node.
 * NOTE(review): the initializer of vmm_kmod is not visible in this listing.
 */
232 static moduledata_t vmm_kmod = {
239 * vmm initialization has the following dependencies:
241 * - iommu initialization must happen after the pci passthru driver has had
242 * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
244 * - VT-x initialization requires smp_rendezvous() and therefore must happen
245 * after SMP is fully functional (after SI_SUB_SMP).
/* SI_SUB_SMP + 1 places initialization just after the SI_SUB_SMP stage */
247 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
248 MODULE_VERSION(vmm, 1);
250 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
/*
 * vm_create: allocate and initialize a new VM called 'name'.
 *
 * 'name' must be non-NULL and shorter than VM_MAX_NAMELEN.  The VM is
 * allocated zero-filled, the backend is initialized through VMINIT(), the
 * per-vcpu guest MSRs are initialized for every vcpu slot, an iommu domain
 * sized by vmm_mem_maxaddr() is created and the BSP is marked active.
 * NOTE(review): the handling of an invalid name and the return value are
 * not visible in this listing.
 */
253 vm_create(const char *name)
261 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
264 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
/* cannot overflow: the length was checked against VM_MAX_NAMELEN above */
265 strcpy(vm->name, name);
266 vm->cookie = VMINIT(vm);
268 for (i = 0; i < VM_MAXCPU; i++) {
270 guest_msrs_init(vm, i);
273 maxaddr = vmm_mem_maxaddr();
274 vm->iommu = iommu_create_domain(maxaddr);
275 vm_activate_cpu(vm, BSP);
/*
 * vm_free_mem_seg: give the memory backing segment 'seg' back to the host.
 *
 * The segment is walked one page at a time.  For each page the host
 * physical address is looked up with vm_gpa2hpa() (a failed lookup is
 * fatal), the page's mapping is moved from the VM's iommu domain back to a
 * 1:1 mapping in the host domain, and the page is freed.  Afterwards the
 * VM's iommu TLB is invalidated and '*seg' is zeroed.
 * NOTE(review): the initialization and increment of 'len' are not visible
 * in this listing.
 */
281 vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
287 host_domain = iommu_host_domain();
290 while (len < seg->len) {
291 hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
292 if (hpa == (vm_paddr_t)-1) {
/* the message names this function so the panic can be traced to it */
293 panic("vm_free_mem_seg: cannot free hpa "
294 "associated with gpa 0x%016lx", seg->gpa + len);
298 * Remove the 'gpa' to 'hpa' mapping in VMs domain.
299 * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
301 iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
302 iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
304 vmm_mem_free(hpa, PAGE_SIZE);
310 * Invalidate cached translations associated with 'vm->iommu' since
311 * we have now moved some pages from it.
313 iommu_invalidate_tlb(vm->iommu);
315 bzero(seg, sizeof(struct vm_memory_segment));
/*
 * vm_destroy: tear down 'vm'.
 *
 * Visible order: unassign all passthru devices, free every memory segment
 * and reset the segment count, clean up every vcpu slot, destroy the iommu
 * domain, then let the backend release its per-VM data.
 * NOTE(review): the release of 'vm' itself is not visible in this listing.
 */
319 vm_destroy(struct vm *vm)
323 ppt_unassign_all(vm);
325 for (i = 0; i < vm->num_mem_segs; i++)
326 vm_free_mem_seg(vm, &vm->mem_segs[i]);
328 vm->num_mem_segs = 0;
330 for (i = 0; i < VM_MAXCPU; i++)
331 vcpu_cleanup(&vm->vcpu[i]);
333 iommu_destroy_domain(vm->iommu);
335 VMCLEANUP(vm->cookie);
/*
 * vm_name: accessor for the VM's name.
 * NOTE(review): the body is not visible in this listing; presumably it
 * returns vm->name -- confirm against the full source.
 */
341 vm_name(struct vm *vm)
/*
 * vm_map_mmio: map 'len' bytes of host physical memory at 'hpa' into the
 * guest at 'gpa' through VMMMAP_SET(), using the uncacheable memory
 * attribute.
 * NOTE(review): the remaining VMMMAP_SET() arguments (protection and
 * superpage flag) are not visible in this listing.
 */
347 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
349 const boolean_t spok = TRUE; /* superpage mappings are ok */
351 return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
/*
 * vm_unmap_mmio: undo vm_map_mmio() for 'len' bytes at 'gpa' by issuing
 * VMMMAP_SET() with hpa 0, attribute 0 and VM_PROT_NONE.  Returns whatever
 * VMMMAP_SET() returns.
 */
356 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
358 const boolean_t spok = TRUE; /* superpage mappings are ok */
360 return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
361 VM_PROT_NONE, spok));
365 * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
/*
 * 'gpa' must be page aligned (a misaligned value panics).  The address is
 * compared against the half-open range [gpa, gpa + len) of every existing
 * memory segment.
 * NOTE(review): the alignment test and the return statements are not
 * visible in this listing.
 */
368 vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
371 vm_paddr_t gpabase, gpalimit;
374 panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
376 for (i = 0; i < vm->num_mem_segs; i++) {
377 gpabase = vm->mem_segs[i].gpa;
378 gpalimit = gpabase + vm->mem_segs[i].len;
379 if (gpa >= gpabase && gpa < gpalimit)
/*
 * vm_malloc: back the guest physical range [gpa, gpa + len) with memory.
 *
 * 'gpa' and 'len' must be page aligned and 'len' must be non-zero.  The
 * range is first scanned with vm_gpa_available(): a mix of allocated and
 * available pages is an error, and a fully allocated range needs no work.
 * Otherwise the next free slot of vm->mem_segs[] is filled one page at a
 * time: a host page is allocated, mapped into the guest with VMMMAP_SET()
 * (write-back, all protections) and its iommu mapping is moved from the
 * host domain to the VM's domain.  The host domain's iommu TLB is
 * invalidated at the end.
 * NOTE(review): the return statements, the bookkeeping of the scan loop
 * and the conditions guarding the vm_free_mem_seg() call (presumably the
 * failure path) are not visible in this listing.
 */
387 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
389 int error, available, allocated;
390 struct vm_memory_segment *seg;
394 const boolean_t spok = TRUE; /* superpage mappings are ok */
396 if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
399 available = allocated = 0;
401 while (g < gpa + len) {
402 if (vm_gpa_available(vm, g))
411 * If there are some allocated and some available pages in the address
412 * range then it is an error.
414 if (allocated && available)
418 * If the entire address range being requested has already been
419 * allocated then there isn't anything more to do.
421 if (allocated && available == 0)
/* all segment slots are already in use */
424 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
427 host_domain = iommu_host_domain();
429 seg = &vm->mem_segs[vm->num_mem_segs];
/* seg->len doubles as the count of bytes populated so far */
434 while (seg->len < len) {
435 hpa = vmm_mem_alloc(PAGE_SIZE);
441 error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
442 VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
447 * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
448 * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
450 iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
451 iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
453 seg->len += PAGE_SIZE;
457 vm_free_mem_seg(vm, seg);
462 * Invalidate cached translations associated with 'host_domain' since
463 * we have now moved some pages from it.
465 iommu_invalidate_tlb(host_domain);
/*
 * vm_gpa2hpa: translate guest physical address 'gpa' to a host physical
 * address through VMMMAP_GET().
 *
 * The range [gpa, gpa + len) must not extend past the end of the page that
 * contains 'gpa'; a request that does panics.
 */
473 vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
/* start of the page following the one that contains 'gpa' */
477 nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
478 if (len > nextpage - gpa)
479 panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
481 return (VMMMAP_GET(vm->cookie, gpa));
/*
 * vm_gpabase2memseg: look up the memory segment whose base guest physical
 * address is exactly 'gpabase' and copy it into '*seg'.
 * NOTE(review): the return values for the found and not-found cases are
 * not visible in this listing.
 */
485 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
486 struct vm_memory_segment *seg)
490 for (i = 0; i < vm->num_mem_segs; i++) {
491 if (gpabase == vm->mem_segs[i].gpa) {
492 *seg = vm->mem_segs[i];
/*
 * vm_get_register: read register 'reg' of vcpu 'vcpu' into '*retval' via
 * the backend's vmgetreg hook.
 *
 * 'vcpu' is range checked against [0, VM_MAXCPU) and 'reg' against the
 * upper bound VM_REG_LAST only.
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
500 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
503 if (vcpu < 0 || vcpu >= VM_MAXCPU)
506 if (reg >= VM_REG_LAST)
509 return (VMGETREG(vm->cookie, vcpu, reg, retval));
/*
 * vm_set_register: write 'val' to register 'reg' of vcpu 'vcpu' via the
 * backend's vmsetreg hook.
 *
 * 'vcpu' is range checked against [0, VM_MAXCPU) and 'reg' against the
 * upper bound VM_REG_LAST only.
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
513 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
516 if (vcpu < 0 || vcpu >= VM_MAXCPU)
519 if (reg >= VM_REG_LAST)
522 return (VMSETREG(vm->cookie, vcpu, reg, val));
/*
 * is_descriptor_table: classify 'reg'; the cases listed are the guest IDTR
 * and GDTR.
 * NOTE(review): the switch statement and the values returned are not
 * visible in this listing; callers treat the result as a boolean.
 */
526 is_descriptor_table(int reg)
530 case VM_REG_GUEST_IDTR:
531 case VM_REG_GUEST_GDTR:
/*
 * is_segment_register: classify 'reg'; the cases listed are the guest
 * ES, CS, SS, DS, FS, GS, TR and LDTR registers.
 * NOTE(review): the switch statement and the values returned are not
 * visible in this listing; callers treat the result as a boolean.
 */
539 is_segment_register(int reg)
543 case VM_REG_GUEST_ES:
544 case VM_REG_GUEST_CS:
545 case VM_REG_GUEST_SS:
546 case VM_REG_GUEST_DS:
547 case VM_REG_GUEST_FS:
548 case VM_REG_GUEST_GS:
549 case VM_REG_GUEST_TR:
550 case VM_REG_GUEST_LDTR:
/*
 * vm_get_seg_desc: fetch the descriptor of 'reg' for vcpu 'vcpu' into
 * '*desc' via the backend's vmgetdesc hook.
 *
 * 'vcpu' must lie in [0, VM_MAXCPU) and 'reg' must be accepted by either
 * is_segment_register() or is_descriptor_table().
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
558 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
559 struct seg_desc *desc)
562 if (vcpu < 0 || vcpu >= VM_MAXCPU)
565 if (!is_segment_register(reg) && !is_descriptor_table(reg))
568 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
/*
 * vm_set_seg_desc: install '*desc' as the descriptor of 'reg' for vcpu
 * 'vcpu' via the backend's vmsetdesc hook.
 *
 * 'vcpu' must lie in [0, VM_MAXCPU) and 'reg' must be accepted by either
 * is_segment_register() or is_descriptor_table().
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
572 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
573 struct seg_desc *desc)
575 if (vcpu < 0 || vcpu >= VM_MAXCPU)
578 if (!is_segment_register(reg) && !is_descriptor_table(reg))
581 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
/*
 * restore_guest_fpustate: load the guest FPU state saved in
 * vcpu->guestfpu into the FPU and leave CR0.TS set afterwards.
 * NOTE(review): the statement that flushes host FPU state to the pcb is
 * not visible in this listing.
 */
585 restore_guest_fpustate(struct vcpu *vcpu)
588 /* flush host state to the pcb */
591 /* restore guest FPU state */
/* CR0.TS has to be clear while fpurestore() touches the FPU */
592 fpu_stop_emulating();
593 fpurestore(vcpu->guestfpu);
596 * The FPU is now "dirty" with the guest's state so turn on emulation
597 * to trap any access to the FPU by the host.
599 fpu_start_emulating();
/*
 * save_guest_fpustate: store the current FPU contents into vcpu->guestfpu.
 *
 * CR0.TS is expected to be set on entry (as restore_guest_fpustate() leaves
 * it); finding it clear is fatal.  TS is cleared only for the duration of
 * the fpusave() and is set again before returning.
 */
603 save_guest_fpustate(struct vcpu *vcpu)
606 if ((rcr0() & CR0_TS) == 0)
607 panic("fpu emulation not enabled in host!");
609 /* save guest FPU state */
610 fpu_stop_emulating();
611 fpusave(vcpu->guestfpu);
612 fpu_start_emulating();
/* Ticks vm_run() spent asleep on behalf of a halted vcpu. */
615 static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
/*
 * vm_run: run vcpu 'vmrun->cpuid' of 'vm' through the backend's VMRUN().
 *
 * Visible sequence: flag the current pcb with PCB_FULL_IRET, load the guest
 * MSRs and FPU state, record the current host cpu in vcpu->hostcpu for the
 * duration of VMRUN(), then save the guest FPU state and restore the host
 * MSRs.  The rdtsc() delta is added to VCPU_TOTAL_RUNTIME and
 * vcpu->exitinfo is copied to vmrun->vm_exit.
 *
 * A successful exit with VM_EXITCODE_HLT is handled here: unless an NMI or
 * a local apic interrupt is already pending, the thread sleeps on the vcpu
 * for 'sleepticks' ticks, the time slept is added to VCPU_IDLE_TICKS, and
 * 'rip' is set to the address following the exiting instruction.
 *
 * NOTE(review): the locking, the initial values of 'rip', 'tscval' and 't',
 * the restart logic and the return statements are not visible in this
 * listing.
 */
618 vm_run(struct vm *vm, struct vm_run *vmrun)
620 int error, vcpuid, sleepticks, t;
623 uint64_t tscval, rip;
626 vcpuid = vmrun->cpuid;
628 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
631 vcpu = &vm->vcpu[vcpuid];
632 vme = &vmrun->vm_exit;
639 pcb = PCPU_GET(curpcb);
640 set_pcb_flags(pcb, PCB_FULL_IRET);
642 restore_guest_msrs(vm, vcpuid);
643 restore_guest_fpustate(vcpu);
/* hostcpu is valid only while the vcpu is inside VMRUN() */
645 vcpu->hostcpu = curcpu;
646 error = VMRUN(vm->cookie, vcpuid, rip);
647 vcpu->hostcpu = NOCPU;
649 save_guest_fpustate(vcpu);
650 restore_host_msrs(vm, vcpuid);
652 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
654 /* copy the exit information */
655 bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));
660 * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
663 if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
667 * Figure out the number of host ticks until the next apic
668 * timer interrupt in the guest.
670 sleepticks = lapic_timer_tick(vm, vcpuid);
673 * If the guest local apic timer is disabled then sleep for
674 * a long time but not forever.
680 * Do a final check for pending NMI or interrupts before
681 * really putting this thread to sleep.
683 * These interrupts could have happened any time after we
684 * returned from VMRUN() and before we grabbed the vcpu lock.
686 if (!vm_nmi_pending(vm, vcpuid) &&
687 lapic_pending_intr(vm, vcpuid) < 0) {
689 panic("invalid sleepticks %d", sleepticks);
691 msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
692 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
/* continue at the instruction after the one that caused the exit */
697 rip = vme->rip + vme->inst_length;
/*
 * vm_inject_event: hand an event to the backend's vminject hook for vcpu
 * 'vcpuid'.
 *
 * Checks performed first: 'vcpuid' in [0, VM_MAXCPU), 'type' strictly
 * between VM_EVENT_NONE and VM_EVENT_MAX, 'vector' in [0, 255].
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
705 vm_inject_event(struct vm *vm, int vcpuid, int type,
706 int vector, uint32_t code, int code_valid)
708 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
/* i.e. reject when 'type' is not strictly inside (NONE, MAX) */
711 if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
714 if (vector < 0 || vector > 255)
717 return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
/* Incremented by one each time vm_nmi_clear() clears a pending NMI. */
720 static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
/*
 * vm_inject_nmi: mark an NMI as pending for vcpu 'vcpuid' and then call
 * vm_interrupt_hostcpu() for that vcpu.
 * NOTE(review): the return statements, including the one for an
 * out-of-range 'vcpuid', are not visible in this listing.
 */
723 vm_inject_nmi(struct vm *vm, int vcpuid)
727 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
730 vcpu = &vm->vcpu[vcpuid];
732 vcpu->nmi_pending = 1;
733 vm_interrupt_hostcpu(vm, vcpuid);
/*
 * vm_nmi_pending: return the 'nmi_pending' flag of vcpu 'vcpuid'.
 * Panics if 'vcpuid' is outside [0, VM_MAXCPU).
 */
738 vm_nmi_pending(struct vm *vm, int vcpuid)
742 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
743 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
745 vcpu = &vm->vcpu[vcpuid];
747 return (vcpu->nmi_pending);
/*
 * vm_nmi_clear: clear the NMI pending on vcpu 'vcpuid' and count it in the
 * VCPU_NMI_COUNT statistic.
 *
 * Panics if 'vcpuid' is outside [0, VM_MAXCPU) or if no NMI is pending.
 */
751 vm_nmi_clear(struct vm *vm, int vcpuid)
755 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
/* name this function, not vm_nmi_pending(), so the panic is traceable */
756 panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
758 vcpu = &vm->vcpu[vcpuid];
760 if (vcpu->nmi_pending == 0)
761 panic("vm_nmi_clear: inconsistent nmi_pending state");
763 vcpu->nmi_pending = 0;
764 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
/*
 * vm_get_capability: query capability 'type' of vcpu 'vcpu' into '*retval'
 * via the backend's vmgetcap hook.
 *
 * 'vcpu' must lie in [0, VM_MAXCPU) and 'type' in [0, VM_CAP_MAX).
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
768 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
770 if (vcpu < 0 || vcpu >= VM_MAXCPU)
773 if (type < 0 || type >= VM_CAP_MAX)
776 return (VMGETCAP(vm->cookie, vcpu, type, retval));
/*
 * vm_set_capability: set capability 'type' of vcpu 'vcpu' to 'val' via the
 * backend's vmsetcap hook.
 *
 * 'vcpu' must lie in [0, VM_MAXCPU) and 'type' in [0, VM_CAP_MAX).
 * NOTE(review): the error returns taken when a check fails are not visible
 * in this listing.
 */
780 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
782 if (vcpu < 0 || vcpu >= VM_MAXCPU)
785 if (type < 0 || type >= VM_CAP_MAX)
788 return (VMSETCAP(vm->cookie, vcpu, type, val));
/*
 * vm_guest_msrs: return the guest_msrs[] array of vcpu 'cpu'.
 * 'cpu' is used as an index as-is; no range check is visible here.
 */
792 vm_guest_msrs(struct vm *vm, int cpu)
794 return (vm->vcpu[cpu].guest_msrs);
/*
 * vm_lapic: return the vlapic pointer of vcpu 'cpu'.
 * 'cpu' is used as an index as-is; no range check is visible here.
 */
798 vm_lapic(struct vm *vm, int cpu)
800 return (vm->vcpu[cpu].vlapic);
/*
 * vmm_is_pptdev: check whether the device at 'bus'/'slot'/'func' is listed
 * in the kernel environment as a passthru device.
 *
 * Each of the variables named in names[] is read in turn; its value is a
 * space separated list of "bus/slot/func" triples, each parsed with
 * sscanf() and compared against the arguments.  Scanning of further
 * variables stops once 'found' is set.
 * NOTE(review): the advance to the next triple, the release of the getenv()
 * result and the return statement are not visible in this listing.
 */
804 vmm_is_pptdev(int bus, int slot, int func)
808 char *val, *cp, *cp2;
812 * The length of an environment variable is limited to 128 bytes which
813 * puts an upper limit on the number of passthru devices that may be
814 * specified using a single environment variable.
816 * Work around this by scanning multiple environment variable
817 * names instead of a single one - yuck!
819 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
821 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
823 for (i = 0; names[i] != NULL && !found; i++) {
824 cp = val = getenv(names[i]);
825 while (cp != NULL && *cp != '\0') {
/* cp2 marks the space that ends the current triple, if there is one */
826 if ((cp2 = strchr(cp, ' ')) != NULL)
829 n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
830 if (n == 3 && bus == b && slot == s && func == f) {
/*
 * vm_iommu_domain: accessor for the VM's iommu domain.
 * NOTE(review): the body is not visible in this listing; presumably it
 * returns vm->iommu -- confirm against the full source.
 */
846 vm_iommu_domain(struct vm *vm)
/*
 * vcpu_set_state: request a change of vcpu 'vcpuid' to run state 'state'.
 *
 * Panics if 'vcpuid' is outside [0, VM_MAXCPU).  A transition is accepted
 * only when exactly one of the current and the requested state is
 * VCPU_IDLE.
 * NOTE(review): the locking, the state assignment and the return value are
 * not visible in this listing.
 */
853 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
858 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
/* report the current function name so the panic is traceable */
859 panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
861 vcpu = &vm->vcpu[vcpuid];
866 * The following state transitions are allowed:
867 * IDLE -> RUNNING -> IDLE
868 * IDLE -> CANNOT_RUN -> IDLE
870 if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
871 (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
/*
 * vcpu_get_state: report the run state of vcpu 'vcpuid'.
 *
 * Panics if 'vcpuid' is outside [0, VM_MAXCPU).
 * NOTE(review): the locking, the read of vcpu->state and the return
 * statement are not visible in this listing.
 */
884 vcpu_get_state(struct vm *vm, int vcpuid)
887 enum vcpu_state state;
889 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
/* report the current function name so the panic is traceable */
890 panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
892 vcpu = &vm->vcpu[vcpuid];
/*
 * vm_activate_cpu: add 'vcpuid' to the VM's set of active vcpus.
 * An id outside [0, VM_MAXCPU) is ignored.
 */
902 vm_activate_cpu(struct vm *vm, int vcpuid)
905 if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
906 CPU_SET(vcpuid, &vm->active_cpus);
/* vm_active_cpus: return a copy of the VM's set of active vcpus. */
910 vm_active_cpus(struct vm *vm)
913 return (vm->active_cpus);
/*
 * vcpu_stats: return the 'stats' pointer of vcpu 'vcpuid'.
 * 'vcpuid' is used as an index as-is; no range check is visible here.
 */
917 vcpu_stats(struct vm *vm, int vcpuid)
920 return (vm->vcpu[vcpuid].stats);
/*
 * vm_get_x2apic_state: store the x2apic state of vcpu 'vcpuid' in '*state'.
 * 'vcpuid' must lie in [0, VM_MAXCPU).
 * NOTE(review): the return statements are not visible in this listing.
 */
924 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
926 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
929 *state = vm->vcpu[vcpuid].x2apic_state;
/*
 * vm_set_x2apic_state: record 'state' as the x2apic state of vcpu 'vcpuid'
 * and pass it on to the vlapic through vlapic_set_x2apic_state().
 *
 * 'vcpuid' must lie in [0, VM_MAXCPU) and 'state' in
 * [0, X2APIC_STATE_LAST).
 * NOTE(review): the return statements are not visible in this listing.
 */
935 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
937 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
940 if (state < 0 || state >= X2APIC_STATE_LAST)
943 vm->vcpu[vcpuid].x2apic_state = state;
945 vlapic_set_x2apic_state(vm, vcpuid, state);
/*
 * vm_interrupt_hostcpu: get the attention of the host thread behind vcpu
 * 'vcpuid'.
 *
 * When the vcpu is currently on a host cpu (hostcpu != NOCPU) it must be in
 * state VCPU_RUNNING, otherwise this panics; an IPI with vector vmm_ipinum
 * is sent unless that host cpu is the current one.  When the vcpu has no
 * host cpu but is VCPU_RUNNING, its thread is taken to be asleep in 'hlt'
 * handling.
 * NOTE(review): the locking and the wakeup call for the sleeping case are
 * not visible in this listing.  'vcpuid' is used as an index without a
 * range check in the lines visible here.
 */
951 vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
956 vcpu = &vm->vcpu[vcpuid];
959 hostcpu = vcpu->hostcpu;
960 if (hostcpu == NOCPU) {
962 * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
963 * the host thread must be sleeping waiting for an event to
964 * kick the vcpu out of 'hlt'.
966 * XXX this is racy because the condition exists right before
967 * and after calling VMRUN() in vm_run(). The wakeup() is
968 * benign in this case.
970 if (vcpu->state == VCPU_RUNNING)
973 if (vcpu->state != VCPU_RUNNING)
974 panic("invalid vcpu state %d", vcpu->state);
/* no IPI needed when the vcpu's host cpu is the one executing this code */
975 if (hostcpu != curcpu)
976 ipi_cpu(hostcpu, vmm_ipinum);