/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_lapic.h"
	enum vcpu_state	state;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	struct savefpu	*guestfpu;	/* guest fpu state */
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
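
/*
 * Illustrative sketch (not from the original source): the usual pattern is
 * to take the per-vcpu spin lock around any read-modify-write of vcpu state,
 * as vcpu_set_state_locked() does further down:
 *
 *	vcpu_lock(vcpu);
 *	while (vcpu->state != VCPU_IDLE)
 *		msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 *	vcpu_unlock(vcpu);
 */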
#define	VM_MAX_MEMORY_SEGMENTS	2

	void		*cookie;		/* processor-specific data */
	void		*iommu;			/* iommu-specific data */
	struct vhpet	*vhpet;			/* virtual HPET */
	struct vioapic	*vioapic;		/* virtual ioapic */
	struct vmspace	*vmspace;		/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	volatile cpuset_t active_cpus;

	struct mtx	rendezvous_mtx;
	cpuset_t	rendezvous_req_cpus;
	cpuset_t	rendezvous_done_cpus;
	void		*rendezvous_arg;
	vm_rendezvous_func_t rendezvous_func;
static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap) : NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv) \
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
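
/*
 * A sketch, not from this file: each hardware backend exports its entry
 * points in a 'struct vmm_ops' that the macros above dispatch through once
 * 'ops' has been pointed at it in vmm_init() (e.g. 'ops = &vmm_ops_intel').
 * The field names mirror the accesses above; the handler names here are
 * hypothetical:
 *
 *	struct vmm_ops vmm_ops_example = {
 *		.init		= ex_init,
 *		.cleanup	= ex_cleanup,
 *		.resume		= ex_resume,
 *		.vminit		= ex_vminit,
 *		.vmrun		= ex_vmrun,
 *		.vmcleanup	= ex_vmcleanup,
 *		...
 *	};
 */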
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");
static void vm_deactivate_cpu(struct vm *vm, int vcpuid);

vcpu_cleanup(struct vm *vm, int i)
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);

vcpu_init(struct vm *vm, uint32_t vcpu_id)
	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();

vm_exitinfo(struct vm *vm, int cpuid)
	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();

		ops = &vmm_ops_intel;
	else if (vmm_is_amd())

	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
vmm_handler(module_t mod, int what, void *arg)
		if (ppt_avail_devices() > 0)

		error = vmmdev_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */

static moduledata_t vmm_kmod = {

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
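
/*
 * Usage note (not part of the source): as a standard FreeBSD kernel module
 * the hypervisor can be loaded at runtime with 'kldload vmm', or at boot by
 * adding 'vmm_load="YES"' to /boot/loader.conf.
 */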
vm_create(const char *name, struct vm **retvm)
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		guest_msrs_init(vm, i);

	vm_activate_cpu(vm, BSP);
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
vm_destroy(struct vm *vm)
	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

vm_name(struct vm *vm)
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)

vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
	vmm_mmio_free(vm->vmspace, gpa, len);

vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
	int available, allocated;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)

	available = allocated = 0;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)

	seg->object = object;
vm_gpa_unwire(struct vm *vm)
	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

vm_gpa_wire(struct vm *vm)
	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];

		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
vm_iommu_modify(struct vm *vm, boolean_t map)
	void *vp, *cookie, *host_domain;

	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
		iommu_invalidate_tlb(host_domain);
		iommu_invalidate_tlb(vm->iommu);

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
	error = ppt_unassign_device(vm, bus, slot, func);

	if (ppt_assigned_devices(vm) == 0) {

vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);

	error = ppt_assign_device(vm, bus, slot, func);
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));

vm_gpa_release(void *cookie)
	vm_page_t m = cookie;
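
/*
 * A sketch of the intended pairing (illustrative, not from this file; the
 * final '&cookie' argument is inferred from the call in vm_iommu_modify()):
 * every successful vm_gpa_hold() must be matched by a vm_gpa_release() once
 * the caller is done touching the page through the direct map.
 *
 *	void *cookie;
 *	void *vp;
 *
 *	vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_READ, &cookie);
 *	if (vp != NULL) {
 *		... access the page via the DMAP address 'vp' ...
 *		vm_gpa_release(cookie);
 *	}
 */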
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;

vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			vm_object_reference(seg_obj);
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (reg >= VM_REG_LAST)

	return (VMGETREG(vm->cookie, vcpu, reg, retval));

vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (reg >= VM_REG_LAST)

	return (VMSETREG(vm->cookie, vcpu, reg, val));
is_descriptor_table(int reg)
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:

is_segment_register(int reg)
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:

vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (!is_segment_register(reg) && !is_descriptor_table(reg))

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));

vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (!is_segment_register(reg) && !is_descriptor_table(reg))

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
restore_guest_fpustate(struct vcpu *vcpu)
	/* flush host state to the pcb */

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();

save_guest_fpustate(struct vcpu *vcpu)
	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
		error = (newstate != VCPU_FROZEN);
		error = (newstate == VCPU_FROZEN);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);

vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
		VCPU_CTR0(vm, vcpuid, fmt);				\
vm_handle_rendezvous(struct vm *vm, int vcpuid)
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
	mtx_unlock(&vm->rendezvous_mtx);
/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
	struct vm_exit *vmexit;
	int t, timo, spindown;

	vcpu = &vm->vcpu[vcpuid];

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);

	/*
	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
	 * outside the confines of the vcpu spinlock.
	 */
		vmexit = vm_exitinfo(vm, vcpuid);
		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
		vm_deactivate_cpu(vm, vcpuid);
		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)

	/* restart execution at the faulting instruction */
	vme->inst_length = 0;
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	inst_length = vme->inst_length;
	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
vm_run(struct vm *vm, struct vm_run *vmrun)
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

		switch (vme->exitcode) {
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			retu = true;	/* handled in userland */

	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
vm_inject_event(struct vm *vm, int vcpuid, int type,
    int vector, uint32_t code, int code_valid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	if (type <= VM_EVENT_NONE || type >= VM_EVENT_MAX)

	if (vector < 0 || vector > 255)

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

vm_inject_nmi(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);

vm_nmi_pending(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);

vm_nmi_clear(struct vm *vm, int vcpuid)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
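
/*
 * Illustrative flow (an assumption about the caller, not from this file):
 * vm_inject_nmi() latches the request and kicks the vcpu; the processor
 * backend is then expected to poll vm_nmi_pending() on its way back into
 * the guest and acknowledge the injection with vm_nmi_clear(), roughly:
 *
 *	if (vm_nmi_pending(vm, vcpuid)) {
 *		... program the hardware to deliver an NMI ...
 *		vm_nmi_clear(vm, vcpuid);
 *	}
 */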
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (type < 0 || type >= VM_CAP_MAX)

	return (VMGETCAP(vm->cookie, vcpu, type, retval));

vm_set_capability(struct vm *vm, int vcpu, int type, int val)
	if (vcpu < 0 || vcpu >= VM_MAXCPU)

	if (type < 0 || type >= VM_CAP_MAX)

	return (VMSETCAP(vm->cookie, vcpu, type, val));

vm_guest_msrs(struct vm *vm, int cpu)
	return (vm->vcpu[cpu].guest_msrs);

vm_lapic(struct vm *vm, int cpu)
	return (vm->vcpu[cpu].vlapic);

vm_ioapic(struct vm *vm)
	return (vm->vioapic);

vm_hpet(struct vm *vm)
vmm_is_pptdev(int bus, int slot, int func)
	char *val, *cp, *cp2;

	/*
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {

vm_iommu_domain(struct vm *vm)
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	error = vcpu_set_state_locked(vcpu, newstate, from_idle);

vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
vm_activate_cpu(struct vm *vm, int vcpuid)
	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);

vm_deactivate_cpu(struct vm *vm, int vcpuid)
	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));

	VCPU_CTR0(vm, vcpuid, "deactivated");
	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);

	/*
	 * If a vcpu rendezvous is in progress then it could be blocked
	 * on 'vcpuid' - unblock it before disappearing forever.
	 */
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
		wakeup(&vm->rendezvous_func);
	mtx_unlock(&vm->rendezvous_mtx);
vm_active_cpus(struct vm *vm)
	return (vm->active_cpus);

vcpu_stats(struct vm *vm, int vcpuid)
	return (vm->vcpu[vcpuid].stats);

vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	*state = vm->vcpu[vcpuid].x2apic_state;

vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)

	if (state >= X2APIC_STATE_LAST)

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be directed
 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
	vcpu = &vm->vcpu[vcpuid];

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				ipi_cpu(hostcpu, vmm_ipinum);
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)

vm_get_vmspace(struct vm *vm)
	return (vm->vmspace);
vm_apicid2vcpuid(struct vm *vm, int apicid)
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);

	vm_handle_rendezvous(vm, vcpuid);
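
/*
 * A usage sketch (illustrative; the callback name and body are hypothetical,
 * but the callback signature matches the call through 'rendezvous_func'
 * above): run 'func' once on every active vcpu and return only after all of
 * them have executed it.
 *
 *	static void
 *	example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		... per-vcpu work, e.g. invalidating cached state ...
 *	}
 *
 *	cpuset_t dest = vm_active_cpus(vm);
 *	vm_smp_rendezvous(vm, vcpuid, dest, example_rendezvous_cb, NULL);
 */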