sys/amd64/vmm/vmm.c

   1 /*-
   2  * Copyright (c) 2011 NetApp, Inc.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD$
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 #include <sys/param.h>
  33 #include <sys/systm.h>
  34 #include <sys/kernel.h>
  35 #include <sys/module.h>
  36 #include <sys/sysctl.h>
  37 #include <sys/malloc.h>
  38 #include <sys/pcpu.h>
  39 #include <sys/lock.h>
  40 #include <sys/mutex.h>
  41 #include <sys/proc.h>
  42 #include <sys/sched.h>
  43 #include <sys/smp.h>
  44 #include <sys/systm.h>
  45
  46 #include <vm/vm.h>
  47
  48 #include <machine/vm.h>
  49 #include <machine/pcb.h>
  50 #include <machine/smp.h>
  51 #include <x86/apicreg.h>
  52
  53 #include <machine/vmm.h>
  54 #include "vmm_host.h"
  55 #include "vmm_mem.h"
  56 #include "vmm_util.h"
  57 #include <machine/vmm_dev.h>
  58 #include "vlapic.h"
  59 #include "vmm_msr.h"
  60 #include "vmm_ipi.h"
  61 #include "vmm_stat.h"
  62 #include "vmm_lapic.h"
  63
  64 #include "io/ppt.h"
  65 #include "io/iommu.h"
  66
  67 struct vlapic;
  68
/*
 * Per-virtual-cpu state kept by the generic vmm layer.  'struct vm' embeds
 * one of these for every possible vcpu id.
 */
struct vcpu {
        int             flags;
        enum vcpu_state state;          /* accessed under 'mtx' by vcpu_{get,set}_state() */
        struct mtx      mtx;            /* spin mutex, see the vcpu_lock*() macros */
        int             hostcpu;        /* host cpu executing VMRUN() for this vcpu, else NOCPU */
        uint64_t        guest_msrs[VMM_MSR_NUM];        /* handed out by vm_guest_msrs() */
        struct vlapic   *vlapic;        /* created by vlapic_init() in vcpu_init() */
        int              vcpuid;        /* index of this entry in vm->vcpu[] */
        struct savefpu  *guestfpu;      /* guest fpu state */
        void            *stats;         /* from vmm_stat_alloc(), returned by vcpu_stats() */
        struct vm_exit  exitinfo;       /* copied out to the caller at the end of vm_run() */
        enum x2apic_state x2apic_state; /* last value stored by vm_set_x2apic_state() */
        int             nmi_pending;    /* set by vm_inject_nmi(), reset by vm_nmi_clear() */
};
  83
/*
 * Helpers around the per-vcpu spin mutex.  'v' is a 'struct vcpu *'.
 * The mutex is also the interlock used by msleep_spin() in vm_run().
 */
#define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
  87
/* Upper bound on the number of entries in 'mem_segs'; enforced by vm_malloc(). */
#define VM_MAX_MEMORY_SEGMENTS  2

/*
 * State of a single virtual machine as seen by the generic vmm layer.
 * Allocated zero-filled by vm_create() and released by vm_destroy().
 */
struct vm {
        void            *cookie;        /* processor-specific data */
        void            *iommu;         /* iommu-specific data */
        struct vcpu     vcpu[VM_MAXCPU];
        int             num_mem_segs;   /* number of valid entries in 'mem_segs' */
        struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
        char            name[VM_MAX_NAMELEN];   /* NUL-terminated copy of the vm_create() argument */

        /*
         * Set of active vcpus.
         * An active vcpu is one that has been started implicitly (BSP) or
         * explicitly (AP) by sending it a startup ipi.
         */
        cpuset_t        active_cpus;
};
 105
/*
 * Processor-specific backend, selected in vmm_init().
 *
 * Every macro below forwards to the matching 'ops' callback and, while no
 * backend has been selected ('ops' is NULL), evaluates to the fallback value
 * after the ':' instead.  The 'vmi' argument is the per-VM cookie that
 * VMINIT() returned.
 */
static struct vmm_ops *ops;
#define VMM_INIT()      (ops != NULL ? (*ops->init)() : 0)
#define VMM_CLEANUP()   (ops != NULL ? (*ops->cleanup)() : 0)

#define VMINIT(vm)      (ops != NULL ? (*ops->vminit)(vm): NULL)
#define VMRUN(vmi, vcpu, rip) \
        (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
#define VMCLEANUP(vmi)  (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm)                 \
        (ops != NULL ?                                                  \
        (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) :       \
        ENXIO)
/*
 * NOTE(review): vm_free_mem_seg() treats (vm_paddr_t)-1 as the failure value
 * of this lookup, which differs from the ENXIO fallback used here.
 */
#define VMMMAP_GET(vmi, gpa) \
        (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval)                \
        (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val)           \
        (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define VMGETDESC(vmi, vcpu, num, desc)         \
        (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc)         \
        (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
        (ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval)        \
        (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val)           \
        (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
 134
/*
 * Set or clear CR0.TS.  The guest FPU save/restore helpers set the bit while
 * the FPU holds guest state so that any host access to the FPU traps.
 */
#define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating()    clts()
 137
/* malloc(9) type used for 'struct vm' allocations in vm_create()/vm_destroy() */
static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);    /* msr_mask can keep track of up to 64 msrs */

/* statistics */
/* incremented in vm_run() by the rdtsc() delta measured around VMRUN() */
static VMM_STAT_DEFINE(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
 143
 144 static void
 145 vcpu_cleanup(struct vcpu *vcpu)
 146 {
 147         vlapic_cleanup(vcpu->vlapic);
 148         vmm_stat_free(vcpu->stats);
 149         fpu_save_area_free(vcpu->guestfpu);
 150 }
 151
 152 static void
 153 vcpu_init(struct vm *vm, uint32_t vcpu_id)
 154 {
 155         struct vcpu *vcpu;
 156
 157         vcpu = &vm->vcpu[vcpu_id];
 158
 159         vcpu_lock_init(vcpu);
 160         vcpu->hostcpu = NOCPU;
 161         vcpu->vcpuid = vcpu_id;
 162         vcpu->vlapic = vlapic_init(vm, vcpu_id);
 163         vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
 164         vcpu->guestfpu = fpu_save_area_alloc();
 165         fpu_save_area_reset(vcpu->guestfpu);
 166         vcpu->stats = vmm_stat_alloc();
 167 }
 168
 169 struct vm_exit *
 170 vm_exitinfo(struct vm *vm, int cpuid)
 171 {
 172         struct vcpu *vcpu;
 173
 174         if (cpuid < 0 || cpuid >= VM_MAXCPU)
 175                 panic("vm_exitinfo: invalid cpuid %d", cpuid);
 176
 177         vcpu = &vm->vcpu[cpuid];
 178
 179         return (&vcpu->exitinfo);
 180 }
 181
 182 static int
 183 vmm_init(void)
 184 {
 185         int error;
 186
 187         vmm_host_state_init();
 188         vmm_ipi_init();
 189
 190         error = vmm_mem_init();
 191         if (error)
 192                 return (error);
 193
 194         if (vmm_is_intel())
 195                 ops = &vmm_ops_intel;
 196         else if (vmm_is_amd())
 197                 ops = &vmm_ops_amd;
 198         else
 199                 return (ENXIO);
 200
 201         vmm_msr_init();
 202
 203         return (VMM_INIT());
 204 }
 205
 206 static int
 207 vmm_handler(module_t mod, int what, void *arg)
 208 {
 209         int error;
 210
 211         switch (what) {
 212         case MOD_LOAD:
 213                 vmmdev_init();
 214                 iommu_init();
 215                 error = vmm_init();
 216                 break;
 217         case MOD_UNLOAD:
 218                 error = vmmdev_cleanup();
 219                 if (error == 0) {
 220                         iommu_cleanup();
 221                         vmm_ipi_cleanup();
 222                         error = VMM_CLEANUP();
 223                 }
 224                 break;
 225         default:
 226                 error = 0;
 227                 break;
 228         }
 229         return (error);
 230 }
 231
/* Module description registered through DECLARE_MODULE() below. */
static moduledata_t vmm_kmod = {
        "vmm",          /* module name */
        vmm_handler,    /* handles MOD_LOAD / MOD_UNLOAD */
        NULL            /* presumably the handler's 'arg'; unused by vmm_handler() */
};
 237
 238 /*
 239  * vmm initialization has the following dependencies:
 240  *
 241  * - iommu initialization must happen after the pci passthru driver has had
 242  *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 243  *
 244  * - VT-x initialization requires smp_rendezvous() and therefore must happen
 245  *   after SMP is fully functional (after SI_SUB_SMP).
 246  */
/* SI_SUB_SMP + 1 orders vmm_handler(MOD_LOAD) right after the SMP stage. */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

/* Declares the 'vmm' sysctl node underneath '_hw'. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
 251
 252 struct vm *
 253 vm_create(const char *name)
 254 {
 255         int i;
 256         struct vm *vm;
 257         vm_paddr_t maxaddr;
 258
 259         const int BSP = 0;
 260
 261         if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
 262                 return (NULL);
 263
 264         vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 265         strcpy(vm->name, name);
 266         vm->cookie = VMINIT(vm);
 267
 268         for (i = 0; i < VM_MAXCPU; i++) {
 269                 vcpu_init(vm, i);
 270                 guest_msrs_init(vm, i);
 271         }
 272
 273         maxaddr = vmm_mem_maxaddr();
 274         vm->iommu = iommu_create_domain(maxaddr);
 275         vm_activate_cpu(vm, BSP);
 276
 277         return (vm);
 278 }
 279
 280 static void
 281 vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
 282 {
 283         size_t len;
 284         vm_paddr_t hpa;
 285         void *host_domain;
 286
 287         host_domain = iommu_host_domain();
 288
 289         len = 0;
 290         while (len < seg->len) {
 291                 hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
 292                 if (hpa == (vm_paddr_t)-1) {
 293                         panic("vm_free_mem_segs: cannot free hpa "
 294                               "associated with gpa 0x%016lx", seg->gpa + len);
 295                 }
 296
 297                 /*
 298                  * Remove the 'gpa' to 'hpa' mapping in VMs domain.
 299                  * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
 300                  */
 301                 iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
 302                 iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
 303
 304                 vmm_mem_free(hpa, PAGE_SIZE);
 305
 306                 len += PAGE_SIZE;
 307         }
 308
 309         /*
 310          * Invalidate cached translations associated with 'vm->iommu' since
 311          * we have now moved some pages from it.
 312          */
 313         iommu_invalidate_tlb(vm->iommu);
 314
 315         bzero(seg, sizeof(struct vm_memory_segment));
 316 }
 317
 318 void
 319 vm_destroy(struct vm *vm)
 320 {
 321         int i;
 322
 323         ppt_unassign_all(vm);
 324
 325         for (i = 0; i < vm->num_mem_segs; i++)
 326                 vm_free_mem_seg(vm, &vm->mem_segs[i]);
 327
 328         vm->num_mem_segs = 0;
 329
 330         for (i = 0; i < VM_MAXCPU; i++)
 331                 vcpu_cleanup(&vm->vcpu[i]);
 332
 333         iommu_destroy_domain(vm->iommu);
 334
 335         VMCLEANUP(vm->cookie);
 336
 337         free(vm, M_VM);
 338 }
 339
 340 const char *
 341 vm_name(struct vm *vm)
 342 {
 343         return (vm->name);
 344 }
 345
 346 int
 347 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 348 {
 349         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 350
 351         return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
 352                            VM_PROT_RW, spok));
 353 }
 354
 355 int
 356 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
 357 {
 358         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 359
 360         return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
 361                            VM_PROT_NONE, spok));
 362 }
 363
 364 /*
 365  * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
 366  */
 367 static boolean_t
 368 vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
 369 {
 370         int i;
 371         vm_paddr_t gpabase, gpalimit;
 372
 373         if (gpa & PAGE_MASK)
 374                 panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
 375
 376         for (i = 0; i < vm->num_mem_segs; i++) {
 377                 gpabase = vm->mem_segs[i].gpa;
 378                 gpalimit = gpabase + vm->mem_segs[i].len;
 379                 if (gpa >= gpabase && gpa < gpalimit)
 380                         return (FALSE);
 381         }
 382
 383         return (TRUE);
 384 }
 385
 386 int
 387 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
 388 {
 389         int error, available, allocated;
 390         struct vm_memory_segment *seg;
 391         vm_paddr_t g, hpa;
 392         void *host_domain;
 393
 394         const boolean_t spok = TRUE;    /* superpage mappings are ok */
 395
 396         if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
 397                 return (EINVAL);
 398
 399         available = allocated = 0;
 400         g = gpa;
 401         while (g < gpa + len) {
 402                 if (vm_gpa_available(vm, g))
 403                         available++;
 404                 else
 405                         allocated++;
 406
 407                 g += PAGE_SIZE;
 408         }
 409
 410         /*
 411          * If there are some allocated and some available pages in the address
 412          * range then it is an error.
 413          */
 414         if (allocated && available)
 415                 return (EINVAL);
 416
 417         /*
 418          * If the entire address range being requested has already been
 419          * allocated then there isn't anything more to do.
 420          */
 421         if (allocated && available == 0)
 422                 return (0);
 423
 424         if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
 425                 return (E2BIG);
 426
 427         host_domain = iommu_host_domain();
 428
 429         seg = &vm->mem_segs[vm->num_mem_segs];
 430
 431         error = 0;
 432         seg->gpa = gpa;
 433         seg->len = 0;
 434         while (seg->len < len) {
 435                 hpa = vmm_mem_alloc(PAGE_SIZE);
 436                 if (hpa == 0) {
 437                         error = ENOMEM;
 438                         break;
 439                 }
 440
 441                 error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
 442                                    VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
 443                 if (error)
 444                         break;
 445
 446                 /*
 447                  * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
 448                  * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
 449                  */
 450                 iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
 451                 iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
 452
 453                 seg->len += PAGE_SIZE;
 454         }
 455
 456         if (error) {
 457                 vm_free_mem_seg(vm, seg);
 458                 return (error);
 459         }
 460
 461         /*
 462          * Invalidate cached translations associated with 'host_domain' since
 463          * we have now moved some pages from it.
 464          */
 465         iommu_invalidate_tlb(host_domain);
 466
 467         vm->num_mem_segs++;
 468
 469         return (0);
 470 }
 471
 472 vm_paddr_t
 473 vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
 474 {
 475         vm_paddr_t nextpage;
 476
 477         nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
 478         if (len > nextpage - gpa)
 479                 panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 480
 481         return (VMMMAP_GET(vm->cookie, gpa));
 482 }
 483
 484 int
 485 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
 486                   struct vm_memory_segment *seg)
 487 {
 488         int i;
 489
 490         for (i = 0; i < vm->num_mem_segs; i++) {
 491                 if (gpabase == vm->mem_segs[i].gpa) {
 492                         *seg = vm->mem_segs[i];
 493                         return (0);
 494                 }
 495         }
 496         return (-1);
 497 }
 498
 499 int
 500 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
 501 {
 502
 503         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 504                 return (EINVAL);
 505
 506         if (reg >= VM_REG_LAST)
 507                 return (EINVAL);
 508
 509         return (VMGETREG(vm->cookie, vcpu, reg, retval));
 510 }
 511
 512 int
 513 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
 514 {
 515
 516         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 517                 return (EINVAL);
 518
 519         if (reg >= VM_REG_LAST)
 520                 return (EINVAL);
 521
 522         return (VMSETREG(vm->cookie, vcpu, reg, val));
 523 }
 524
 525 static boolean_t
 526 is_descriptor_table(int reg)
 527 {
 528
 529         switch (reg) {
 530         case VM_REG_GUEST_IDTR:
 531         case VM_REG_GUEST_GDTR:
 532                 return (TRUE);
 533         default:
 534                 return (FALSE);
 535         }
 536 }
 537
 538 static boolean_t
 539 is_segment_register(int reg)
 540 {
 541
 542         switch (reg) {
 543         case VM_REG_GUEST_ES:
 544         case VM_REG_GUEST_CS:
 545         case VM_REG_GUEST_SS:
 546         case VM_REG_GUEST_DS:
 547         case VM_REG_GUEST_FS:
 548         case VM_REG_GUEST_GS:
 549         case VM_REG_GUEST_TR:
 550         case VM_REG_GUEST_LDTR:
 551                 return (TRUE);
 552         default:
 553                 return (FALSE);
 554         }
 555 }
 556
 557 int
 558 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
 559                 struct seg_desc *desc)
 560 {
 561
 562         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 563                 return (EINVAL);
 564
 565         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 566                 return (EINVAL);
 567
 568         return (VMGETDESC(vm->cookie, vcpu, reg, desc));
 569 }
 570
 571 int
 572 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
 573                 struct seg_desc *desc)
 574 {
 575         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 576                 return (EINVAL);
 577
 578         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 579                 return (EINVAL);
 580
 581         return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 582 }
 583
/*
 * Load the guest's FPU state from 'vcpu->guestfpu' into the FPU.
 * Called from vm_run() inside its critical section, before VMRUN().
 * On return CR0.TS is set.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

        /* flush host state to the pcb */
        fpuexit(curthread);

        /* restore guest FPU state */
        /* CR0.TS must be clear while the restore touches the FPU */
        fpu_stop_emulating();
        fpurestore(vcpu->guestfpu);

        /*
         * The FPU is now "dirty" with the guest's state so turn on emulation
         * to trap any access to the FPU by the host.
         */
        fpu_start_emulating();
}
 601
/*
 * Store the FPU contents into 'vcpu->guestfpu'.
 * Called from vm_run() after VMRUN() returns.  Panics if CR0.TS is found
 * clear on entry; CR0.TS is set again on return.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

        /* restore_guest_fpustate() left CR0.TS set; it must still be set */
        if ((rcr0() & CR0_TS) == 0)
                panic("fpu emulation not enabled in host!");

        /* save guest FPU state */
        fpu_stop_emulating();
        fpusave(vcpu->guestfpu);
        fpu_start_emulating();
}
 614
/* accumulated in vm_run() with the 'ticks' delta across the 'hlt' sleep */
static VMM_STAT_DEFINE(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 616
/*
 * Run vcpu 'vmrun->cpuid' of 'vm', starting at guest address 'vmrun->rip'.
 *
 * Guest msr and FPU state are swapped in around VMRUN() inside a critical
 * section, and the resulting exit information is copied to 'vmrun->vm_exit'.
 * A successful exit with VM_EXITCODE_HLT is handled here: the thread sleeps
 * until woken up or until the computed timeout expires, and then re-enters
 * the guest just past the 'hlt'.  Every other exit returns to the caller.
 *
 * Returns EINVAL for an out-of-range cpuid, otherwise the value returned by
 * the last VMRUN().
 */
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
        int error, vcpuid, sleepticks, t;
        struct vcpu *vcpu;
        struct pcb *pcb;
        uint64_t tscval, rip;
        struct vm_exit *vme;

        vcpuid = vmrun->cpuid;

        if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
                return (EINVAL);

        vcpu = &vm->vcpu[vcpuid];
        vme = &vmrun->vm_exit;
        rip = vmrun->rip;
restart:
        critical_enter();

        /* start of the interval accounted to VCPU_TOTAL_RUNTIME */
        tscval = rdtsc();

        pcb = PCPU_GET(curpcb);
        set_pcb_flags(pcb, PCB_FULL_IRET);

        restore_guest_msrs(vm, vcpuid);
        restore_guest_fpustate(vcpu);

        /*
         * 'hostcpu' is valid only while VMRUN() executes; it tells
         * vm_interrupt_hostcpu() which host cpu to send an ipi to.
         */
        vcpu->hostcpu = curcpu;
        error = VMRUN(vm->cookie, vcpuid, rip);
        vcpu->hostcpu = NOCPU;

        save_guest_fpustate(vcpu);
        restore_host_msrs(vm, vcpuid);

        vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

        /* copy the exit information */
        bcopy(&vcpu->exitinfo, vme, sizeof(struct vm_exit));

        critical_exit();

        /*
         * Oblige the guest's desire to 'hlt' by sleeping until the vcpu
         * is ready to run.
         */
        if (error == 0 && vme->exitcode == VM_EXITCODE_HLT) {
                vcpu_lock(vcpu);

                /*
                 * Figure out the number of host ticks until the next apic
                 * timer interrupt in the guest.
                 */
                sleepticks = lapic_timer_tick(vm, vcpuid);

                /*
                 * If the guest local apic timer is disabled then sleep for
                 * a long time but not forever.
                 */
                if (sleepticks < 0)
                        sleepticks = hz;

                /*
                 * Do a final check for pending NMI or interrupts before
                 * really putting this thread to sleep.
                 *
                 * These interrupts could have happened any time after we
                 * returned from VMRUN() and before we grabbed the vcpu lock.
                 */
                if (!vm_nmi_pending(vm, vcpuid) &&
                    lapic_pending_intr(vm, vcpuid) < 0) {
                        if (sleepticks <= 0)
                                panic("invalid sleepticks %d", sleepticks);
                        t = ticks;
                        /* woken up by wakeup_one(vcpu) in vm_interrupt_hostcpu() */
                        msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
                        vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
                }

                vcpu_unlock(vcpu);

                /* resume the guest at the instruction following the 'hlt' */
                rip = vme->rip + vme->inst_length;
                goto restart;
        }

        return (error);
}
 703
 704 int
 705 vm_inject_event(struct vm *vm, int vcpuid, int type,
 706                 int vector, uint32_t code, int code_valid)
 707 {
 708         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 709                 return (EINVAL);
 710
 711         if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
 712                 return (EINVAL);
 713
 714         if (vector < 0 || vector > 255)
 715                 return (EINVAL);
 716
 717         return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
 718 }
 719
/* incremented by one in vm_nmi_clear() */
static VMM_STAT_DEFINE(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 721
 722 int
 723 vm_inject_nmi(struct vm *vm, int vcpuid)
 724 {
 725         struct vcpu *vcpu;
 726
 727         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 728                 return (EINVAL);
 729
 730         vcpu = &vm->vcpu[vcpuid];
 731
 732         vcpu->nmi_pending = 1;
 733         vm_interrupt_hostcpu(vm, vcpuid);
 734         return (0);
 735 }
 736
 737 int
 738 vm_nmi_pending(struct vm *vm, int vcpuid)
 739 {
 740         struct vcpu *vcpu;
 741
 742         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 743                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 744
 745         vcpu = &vm->vcpu[vcpuid];
 746
 747         return (vcpu->nmi_pending);
 748 }
 749
 750 void
 751 vm_nmi_clear(struct vm *vm, int vcpuid)
 752 {
 753         struct vcpu *vcpu;
 754
 755         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 756                 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
 757
 758         vcpu = &vm->vcpu[vcpuid];
 759
 760         if (vcpu->nmi_pending == 0)
 761                 panic("vm_nmi_clear: inconsistent nmi_pending state");
 762
 763         vcpu->nmi_pending = 0;
 764         vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
 765 }
 766
 767 int
 768 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 769 {
 770         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 771                 return (EINVAL);
 772
 773         if (type < 0 || type >= VM_CAP_MAX)
 774                 return (EINVAL);
 775
 776         return (VMGETCAP(vm->cookie, vcpu, type, retval));
 777 }
 778
 779 int
 780 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
 781 {
 782         if (vcpu < 0 || vcpu >= VM_MAXCPU)
 783                 return (EINVAL);
 784
 785         if (type < 0 || type >= VM_CAP_MAX)
 786                 return (EINVAL);
 787
 788         return (VMSETCAP(vm->cookie, vcpu, type, val));
 789 }
 790
 791 uint64_t *
 792 vm_guest_msrs(struct vm *vm, int cpu)
 793 {
 794         return (vm->vcpu[cpu].guest_msrs);
 795 }
 796
 797 struct vlapic *
 798 vm_lapic(struct vm *vm, int cpu)
 799 {
 800         return (vm->vcpu[cpu].vlapic);
 801 }
 802
 803 boolean_t
 804 vmm_is_pptdev(int bus, int slot, int func)
 805 {
 806         int found, i, n;
 807         int b, s, f;
 808         char *val, *cp, *cp2;
 809
 810         /*
 811          * XXX
 812          * The length of an environment variable is limited to 128 bytes which
 813          * puts an upper limit on the number of passthru devices that may be
 814          * specified using a single environment variable.
 815          *
 816          * Work around this by scanning multiple environment variable
 817          * names instead of a single one - yuck!
 818          */
 819         const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 820
 821         /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 822         found = 0;
 823         for (i = 0; names[i] != NULL && !found; i++) {
 824                 cp = val = getenv(names[i]);
 825                 while (cp != NULL && *cp != '\0') {
 826                         if ((cp2 = strchr(cp, ' ')) != NULL)
 827                                 *cp2 = '\0';
 828
 829                         n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 830                         if (n == 3 && bus == b && slot == s && func == f) {
 831                                 found = 1;
 832                                 break;
 833                         }
 834
 835                         if (cp2 != NULL)
 836                                 *cp2++ = ' ';
 837
 838                         cp = cp2;
 839                 }
 840                 freeenv(val);
 841         }
 842         return (found);
 843 }
 844
 845 void *
 846 vm_iommu_domain(struct vm *vm)
 847 {
 848
 849         return (vm->iommu);
 850 }
 851
 852 int
 853 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state state)
 854 {
 855         int error;
 856         struct vcpu *vcpu;
 857
 858         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 859                 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
 860
 861         vcpu = &vm->vcpu[vcpuid];
 862
 863         vcpu_lock(vcpu);
 864
 865         /*
 866          * The following state transitions are allowed:
 867          * IDLE -> RUNNING -> IDLE
 868          * IDLE -> CANNOT_RUN -> IDLE
 869          */
 870         if ((vcpu->state == VCPU_IDLE && state != VCPU_IDLE) ||
 871             (vcpu->state != VCPU_IDLE && state == VCPU_IDLE)) {
 872                 error = 0;
 873                 vcpu->state = state;
 874         } else {
 875                 error = EBUSY;
 876         }
 877
 878         vcpu_unlock(vcpu);
 879
 880         return (error);
 881 }
 882
 883 enum vcpu_state
 884 vcpu_get_state(struct vm *vm, int vcpuid)
 885 {
 886         struct vcpu *vcpu;
 887         enum vcpu_state state;
 888
 889         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 890                 panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
 891
 892         vcpu = &vm->vcpu[vcpuid];
 893
 894         vcpu_lock(vcpu);
 895         state = vcpu->state;
 896         vcpu_unlock(vcpu);
 897
 898         return (state);
 899 }
 900
 901 void
 902 vm_activate_cpu(struct vm *vm, int vcpuid)
 903 {
 904
 905         if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
 906                 CPU_SET(vcpuid, &vm->active_cpus);
 907 }
 908
 909 cpuset_t
 910 vm_active_cpus(struct vm *vm)
 911 {
 912
 913         return (vm->active_cpus);
 914 }
 915
 916 void *
 917 vcpu_stats(struct vm *vm, int vcpuid)
 918 {
 919
 920         return (vm->vcpu[vcpuid].stats);
 921 }
 922
 923 int
 924 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
 925 {
 926         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 927                 return (EINVAL);
 928
 929         *state = vm->vcpu[vcpuid].x2apic_state;
 930
 931         return (0);
 932 }
 933
 934 int
 935 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
 936 {
 937         if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
 938                 return (EINVAL);
 939
 940         if (state < 0 || state >= X2APIC_STATE_LAST)
 941                 return (EINVAL);
 942
 943         vm->vcpu[vcpuid].x2apic_state = state;
 944
 945         vlapic_set_x2apic_state(vm, vcpuid, state);
 946
 947         return (0);
 948 }
 949
 950 void
 951 vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
 952 {
 953         int hostcpu;
 954         struct vcpu *vcpu;
 955
 956         vcpu = &vm->vcpu[vcpuid];
 957
 958         vcpu_lock(vcpu);
 959         hostcpu = vcpu->hostcpu;
 960         if (hostcpu == NOCPU) {
 961                 /*
 962                  * If the vcpu is 'RUNNING' but without a valid 'hostcpu' then
 963                  * the host thread must be sleeping waiting for an event to
 964                  * kick the vcpu out of 'hlt'.
 965                  *
 966                  * XXX this is racy because the condition exists right before
 967                  * and after calling VMRUN() in vm_run(). The wakeup() is
 968                  * benign in this case.
 969                  */
 970                 if (vcpu->state == VCPU_RUNNING)
 971                         wakeup_one(vcpu);
 972         } else {
 973                 if (vcpu->state != VCPU_RUNNING)
 974                         panic("invalid vcpu state %d", vcpu->state);
 975                 if (hostcpu != curcpu)
 976                         ipi_cpu(hostcpu, vmm_ipinum);
 977         }
 978         vcpu_unlock(vcpu);
 979 }